data/datasets/grade_school_math_instructions/dataset_creation.ipynb
import pandas as pd
from pathlib import Path
import pandas as pd
import requests
import numpy as np
import random
# Working directory for the downloaded JSONL files and the generated parquet
# output; the path is relative to the current working directory.
data_dir = Path("data")
# exist_ok=True lets the notebook be re-run when the directory already exists;
# parents=False is the default and is spelled out only for clarity.
data_dir.mkdir(parents=False, exist_ok=True)
# Prompt templates used to wrap each question. One template is drawn uniformly
# at random per row, so repeating an entry raises its sampling weight: the bare
# "{problem}" form appears five times on purpose. Every entry must contain the
# "{problem}" placeholder exactly once and must be followed by a comma --
# adjacent string literals without a comma are silently concatenated by Python.
templates = [
    "{problem}",
    "{problem}",
    "{problem}",
    "{problem}",
    "{problem}",
    "Solve the following math problem: {problem}",
    "Provide a step by step solution for the following math problem: {problem}",
    "{problem}\nHow to solve this?",
    "{problem}\nCan you solve this problem?",
    "I need help with this problem:\n{problem}",
    "{problem}\nWhat is the solution?",
    "{problem}\nGive me a solution to this problem",
    "{problem}\nSolve it. ",
    "{problem}\nSolve this problem. ",
    "{problem}\nFind the solution. ",
    # NOTE(review): duplicate of an entry above; kept so the sampling weights
    # stay unchanged -- confirm whether the repetition is intentional.
    "{problem}\nGive me a solution to this problem",
    "Solve the math problem: {problem}",
    "Find the answer to this math problem: {problem}",
    "Explain how to solve this math problem: {problem}",
    "{problem}\nWork out the solution step by step. ",
    "{problem}\nGive me a detailed solution. ",
    "Find a solution for this math problem: {problem}",
    "Break down this math problem: {problem}",
    "{problem}\nGive me a clear explanation. ",
    "Find the answer to the math problem: {problem}",
    "Can you explain how to solve this math problem: {problem}",
    "Please show me the solution for: {problem}",
    "I'm stuck on this math problem: {problem}\nCan you help?",
    "Can you guide me through solving this problem: {problem}",
    "I need a clearer understanding of how to solve: {problem}",
    "Can you walk me through the solution of: {problem}",
    "Can you provide an in-depth solution for: {problem}",
    "Hey there, could you help me solve this math problem: {problem}",
    "Can you give me some step-by-step instructions for this math problem: {problem}",
    "I'm completely lost with this math problem: {problem}\nCan you give me a hand?",
    "This math problem has got me stumped: {problem}\nCan you show me the way?",
    "I would love to understand how to solve this problem: {problem}\nCan you explain?",
    "Can you break down the solution for me for this math problem: {problem}",
]
def download_original(name, timeout=30):
    """Download one raw data file from the openai/grade-school-math repository.

    The file is fetched from the repository's ``master`` branch and written to
    ``data_dir / name``, overwriting any existing file of that name.

    Args:
        name: File name inside the repository's data folder, e.g. "train.jsonl".
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    url = f"https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/{name}"
    with requests.get(url, timeout=timeout) as response:
        # Fail loudly here instead of saving an error page to disk, which would
        # only surface later as a confusing JSON parse error.
        response.raise_for_status()
        with open(data_dir / name, "w") as f:
            f.write(response.text)
def load_df(name):
    """Load a downloaded JSONL file and convert it to instruction/response rows.

    Reads ``data_dir / name`` as JSON Lines and uses its "question" and
    "answer" columns.

    Args:
        name: File name inside ``data_dir``, e.g. "train.jsonl".

    Returns:
        A DataFrame with three columns:
        INSTRUCTION -- the question wrapped in a randomly chosen entry of
            ``templates`` (drawn with ``np.random.choice``, so the result is
            not reproducible unless NumPy's global seed is set);
        RESPONSE -- the answer with every ``<<...>>`` marker and the trailing
            ``#### ...`` line removed;
        SOURCE -- the constant string "grade-school-math".
    """
    with open(data_dir / name) as f:
        raw = pd.read_json(f, lines=True)

    instructions = [
        np.random.choice(templates).format(problem=question)
        for question in raw["question"]
    ]

    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as a literal string by default, which would leave the answers untouched.
    # "<<.*?>>" is non-greedy so that two markers on the same line are removed
    # individually instead of swallowing the text between them.
    responses = raw["answer"].str.replace(r"<<.*?>>|\n####.*", "", regex=True)

    return pd.DataFrame(
        {
            "INSTRUCTION": instructions,
            "RESPONSE": responses,
            "SOURCE": "grade-school-math",
        }
    )
def save_result(df, name):
    """Write ``df`` to a parquet file in ``data_dir``.

    The output file name is the part of ``name`` before its first dot with
    ".parquet" appended, e.g. "train.jsonl" -> "train.parquet".
    """
    stem, _, _ = name.partition(".")
    target = data_dir / f"{stem}.parquet"
    df.to_parquet(target, row_group_size=100, engine="pyarrow")
# Files to fetch and merge. The commented-out list below additionally names
# the "socratic" files, which are currently not used.
# dataset_names = ["train.jsonl", "test.jsonl", "train_socratic.jsonl", "test_socratic.jsonl"]
dataset_names = ["train.jsonl", "test.jsonl"]

# Download every file first, then load them all into a single frame with a
# fresh 0..n-1 index.
for name in dataset_names:
    download_original(name)

df = pd.concat(list(map(load_df, dataset_names)), ignore_index=True)
df
# Spot-check one randomly chosen row. random.randrange excludes the upper
# bound, so the index is always valid for df.iloc; random.randint(0, len(df))
# includes len(df) and would occasionally raise IndexError.
ind = random.randrange(len(df))
print(df.iloc[ind]["INSTRUCTION"])
print()
print(df.iloc[ind]["RESPONSE"])
# Persist the combined frame as a single parquet file, reload it as a
# `datasets.Dataset` and upload it to the Hugging Face Hub repository below.
output_path = data_dir / "output.parquet"
df.to_parquet(str(output_path), row_group_size=100, engine="pyarrow")

from datasets import Dataset

ds = Dataset.from_parquet(str(output_path.absolute()))
ds.push_to_hub("qwedsacf/grade-school-math-instructions")