Back to Open Assistant

Dataset Creation

data/datasets/grade_school_math_instructions/dataset_creation.ipynb

0.0.1 · 3.7 KB
Original Source
python
import pandas as pd
from pathlib import Path
import pandas as pd
import requests
import numpy as np
import random

# Local working directory (relative to the current working directory) that the
# cells below read from and write to; created if it does not exist yet.
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)

# Prompt templates used to wrap each question. Every entry must contain exactly
# one ``{problem}`` placeholder, which is filled in with ``str.format``.
# A template is drawn uniformly from this list, so entries that appear more
# than once (e.g. the bare "{problem}") are proportionally more likely.
templates = [
    "{problem}",
    "{problem}",
    "{problem}",
    "{problem}",
    "{problem}",
    """Solve the following math problem: {problem}""",
    """Provide a step by step solution for the following math problem: {problem}""",
    """{problem}
How to solve this?""",
    """{problem}
Can you solve this problem?""",
    """I need help with this problem:
{problem}""",
    """{problem}
What is the solution?""",
    """{problem}
Give me a solution to this problem""",
    """{problem}
Solve it. """,
    """{problem}
Solve this problem. """,
    """{problem}
Find the solution. """,
    """{problem}
Give me a solution to this problem""",
    """Solve the math problem: {problem}""",
    """Find the answer to this math problem: {problem}""",
    """Explain how to solve this math problem: {problem}""",
    """{problem}
Work out the solution step by step. """,
    """{problem}
Give me a detailed solution. """,
    """Find a solution for this math problem: {problem}""",
    """Break down this math problem: {problem}""",
    """{problem}
Give me a clear explanation. """,
    "Find the answer to the math problem: {problem}",
    "Can you explain how to solve this math problem: {problem}",
    "Please show me the solution for: {problem}",
    """I'm stuck on this math problem: {problem}
Can you help?""",
    "Can you guide me through solving this problem: {problem}",
    "I need a clearer understanding of how to solve: {problem}",
    "Can you walk me through the solution of: {problem}",
    # The comma after the next entry matters: without it Python concatenates
    # the two adjacent string literals into one template that contains the
    # problem text twice.
    "Can you provide an in-depth solution for: {problem}",
    "Hey there, could you help me solve this math problem: {problem}",
    "Can you give me some step-by-step instructions for this math problem: {problem}",
    """I'm completely lost with this math problem: {problem}
Can you give me a hand?""",
    """This math problem has got me stumped: {problem}
Can you show me the way?""",
    """I would love to understand how to solve this problem: {problem}
Can you explain?""",
    "Can you break down the solution for me for this math problem: {problem}",
]
python
def download_original(name):
    """Download one file of the original grade-school-math data into ``data_dir``.

    Args:
        name: File name inside the upstream ``grade_school_math/data``
            directory (e.g. ``"train.jsonl"``). The same name is used for the
            local copy.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    url = f"https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/{name}"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    with requests.get(url, timeout=60) as response:
        # Fail loudly instead of saving an HTTP error page as if it were data.
        response.raise_for_status()
        # Explicit encoding so the file content does not depend on the platform default.
        with open(data_dir / name, "w", encoding="utf-8") as f:
            f.write(response.text)


def load_df(name):
    """Load a downloaded JSON-lines file and turn it into instruction/response rows.

    Args:
        name: File name inside ``data_dir``; each line must be a JSON object
            with ``question`` and ``answer`` fields.

    Returns:
        A DataFrame with three columns:
        ``INSTRUCTION`` (the question wrapped in a randomly chosen template),
        ``RESPONSE`` (the answer with ``<<...>>`` annotations and the trailing
        ``#### ...`` line removed) and ``SOURCE`` (constant
        ``"grade-school-math"``).
    """
    with open(data_dir / name, encoding="utf-8") as f:
        df = pd.read_json(f, lines=True)

    return pd.DataFrame(
        {
            # One independent template draw per question.
            "INSTRUCTION": df["question"].map(lambda q: np.random.choice(templates).format(problem=q)),
            # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
            # literal string by default, which would leave the answers untouched.
            # The non-greedy ``.*?`` removes each <<...>> annotation separately
            # instead of swallowing the text between two annotations on one line.
            "RESPONSE": df["answer"].str.replace(r"<<.*?>>|\n####.*", "", regex=True),
            "SOURCE": "grade-school-math",
        }
    )


def save_result(df, name):
    """Write ``df`` to ``data_dir`` as parquet.

    The output file is named after the part of ``name`` before its first dot,
    with a ``.parquet`` extension.
    """
    stem, _, _ = name.partition(".")
    target = data_dir / f"{stem}.parquet"
    df.to_parquet(target, row_group_size=100, engine="pyarrow")
python
# Only the plain train/test files are fetched. The commented-out list below
# additionally names the socratic files; swap it in to include them.
# dataset_names = ["train.jsonl", "test.jsonl", "train_socratic.jsonl", "test_socratic.jsonl"]
dataset_names = ["train.jsonl", "test.jsonl"]
# Fetch each listed file into data_dir.
for name in dataset_names:
    download_original(name)
python
# Convert every downloaded file and stack the results into one frame with a fresh 0..n-1 index.
df = pd.concat(map(load_df, dataset_names), ignore_index=True)
python
# Notebook cell output: display the combined DataFrame.
df
python
# Spot-check one randomly chosen row.
# randrange excludes its upper bound, so the index is always valid for iloc
# (randint includes it and could return len(df), one past the last row).
ind = random.randrange(len(df))
print(df.iloc[ind]["INSTRUCTION"])
print()
print(df.iloc[ind]["RESPONSE"])
python
# Persist the combined frame as data_dir / "output.parquet" via the shared helper.
save_result(df, "output.parquet")
python
from datasets import Dataset

# Reload the parquet output (by absolute path) as a `datasets.Dataset` and
# push it to the hub repository named below.
# NOTE(review): pushing presumably requires being logged in with write access
# to that repository — confirm before running.
ds = Dataset.from_parquet(str((data_dir / "output.parquet").absolute()))
ds.push_to_hub("qwedsacf/grade-school-math-instructions")