data/datasets/recipes/tasty_recipes.ipynb
Takes the Kaggle dataset 'Recipes from Tasty' (https://www.kaggle.com/datasets/zeeenb/recipes-from-tasty?select=ingredient_and_instructions.json) and turns its recipes into basic dialogues using a preset list of user prompt templates.
# Single-turn user prompts. One is picked at random per recipe and its
# "{title}" placeholder is filled in with the recipe name via str.format.
ONE_STEP_TEMPLATES = [
    "How do I cook {title}?",
    "How do I make {title}?",
    "How do you make {title}?",
    "Help me make {title}.",
    "Tell me how to make {title}.",
    "How do I prepare {title}?",
    "Could you tell me how to prepare {title}?",
    "Have you got a recipe for {title}?",
    "Do you have a recipe for {title}?",
    "Could I have the recipe for {title}?",
    "Do you know how to make {title}?",
    "How do I go about making {title}?",
    "Can you tell me how to make {title}?",
]
# TWO_STEP_TEMPLATES_1 = ["What ingredients do I need to make {title}?","What ingredients do I need to cook {title}?","What do I need to make {title}?","What do I need to cook {title}?"]
# TWO_STEP_TEMPLATES_2 = ["What are the steps?","How do I prepare it?","How do I cook it?","How can I cook it?"]
import os
import kaggle
import pandas as pd
import json
import random
import unicodedata
import re
from fractions import Fraction
from IPython.display import display
from datasets import Dataset
# URL of the Kaggle dataset; stored verbatim in the SOURCE field of every generated row.
data_source = "https://www.kaggle.com/datasets/zeeenb/recipes-from-tasty"
# Name of the local data directory. It is created up front, and exist_ok=True
# makes re-running this cell harmless when the directory already exists.
output_dir = "data"
os.makedirs(output_dir, exist_ok=True)
def _is_vulgar_fraction(char):
    """Return True if *char* is a Unicode "VULGAR FRACTION ..." character."""
    # The "" default keeps unicodedata.name from raising ValueError on
    # characters that have no name (newlines, tabs, other control characters).
    return unicodedata.name(char, "").startswith("VULGAR FRACTION")


def convert_fraction_unicode_chars_to_strings(string):
    """Convert fraction unicode characters to strings (e.g. ½ -> 1/2).

    Every character that is not a vulgar fraction is copied through unchanged.
    That includes unnamed characters such as newlines and tabs, so line breaks
    in the input survive the conversion.

    Args:
        string: Text that may contain vulgar-fraction characters.

    Returns:
        A new string in which each run of vulgar-fraction characters is
        replaced by its "numerator/denominator" spelling.
    """
    pieces = []
    length = len(string)
    i = 0
    while i < length:
        char = string[i]
        if not _is_vulgar_fraction(char):
            pieces.append(char)
            i += 1
            continue

        # Consume the whole run of adjacent fraction characters.
        # NOTE(review): adjacent fractions are combined as if they were decimal
        # digits (val * 10 + next), exactly as before - confirm this is intended.
        val = unicodedata.numeric(char)
        j = i + 1
        while j < length and _is_vulgar_fraction(string[j]):
            val = val * 10 + unicodedata.numeric(string[j])
            j += 1

        # The numeric value is a float (e.g. 0.333...); limit_denominator
        # recovers the small exact fraction (1/3) from it.
        pieces.append(str(Fraction(val).limit_denominator(100)))
        i = j
    return "".join(pieces)
# Download and unpack the Kaggle dataset into the local data directory.
# NOTE(review): presumably needs Kaggle API credentials to be configured - confirm.
kaggle.api.dataset_download_files("zeeenb/recipes-from-tasty", output_dir, unzip=True)

# Load only the columns used below: language (filter), name (title) and slug (lookup key).
dishes = pd.read_csv(os.path.join(output_dir, "dishes.csv"), usecols=["language", "name", "slug"])
# Remove non-English recipes
dishes = dishes[dishes["language"] == "eng"]

# Load the per-recipe ingredients and instructions, which are looked up by slug.
# The context manager closes the file handle, and the explicit UTF-8 encoding
# keeps decoding independent of the platform's default encoding.
with open(os.path.join(output_dir, "ingredient_and_instructions.json"), encoding="utf-8") as json_file:
    ingredient_and_instructions = json.load(json_file)
# Build one dialogue row per dish with the keys INSTRUCTION, RESPONSE and SOURCE:
#   INSTRUCTION - a random choice from ONE_STEP_TEMPLATES with the recipe title filled in
#   RESPONSE    - the ingredients and the numbered instructions for the recipe
#   SOURCE      - the URL of the Kaggle dataset (data_source)

# Compiled once instead of on every iteration. Matches "How to Make " anywhere
# in the dish name, ignoring case.
_HOW_TO_MAKE = re.compile(r"How to Make ", flags=re.IGNORECASE)


def _format_ingredients(sections):
    """Render the ingredient sections of one recipe as a bulleted text block."""
    parts = []
    for section in sections:
        # Named sections get their own heading line; an empty or missing name is skipped.
        if section["name"]:
            parts.append(f"\n{section['name']}\n")
        for ingredient in section["ingredients"]:
            unit = ingredient["primary_unit"]
            quantity = ""
            if unit["quantity"]:
                quantity = convert_fraction_unicode_chars_to_strings(unit["quantity"]) + " "
            unit_name = ""
            if unit["display"]:
                unit_name = unit["display"] + " "
            # An empty or missing comment contributes nothing, rather than
            # failing on a None value.
            extra_comment = ingredient["extra_comment"]
            comment = f", {extra_comment}" if extra_comment else ""
            parts.append(f"\n• {quantity}{unit_name}{ingredient['name']}{comment}")
        # Blank line after each section.
        parts.append("\n")
    return "".join(parts)


def _format_instructions(steps):
    """Render the instruction steps as a numbered list, without a trailing "Enjoy!" step."""
    # The response ends with its own "Enjoy your ...!" line, so a final "Enjoy!"
    # step is dropped. Slicing makes a copy, which leaves the loaded JSON data
    # unmodified, and the emptiness check avoids an IndexError on recipes
    # without any steps.
    if steps and steps[-1]["display_text"] == "Enjoy!":
        steps = steps[:-1]
    return "".join(
        f"\n{number}. {convert_fraction_unicode_chars_to_strings(step['display_text'])}"
        for number, step in enumerate(steps, start=1)
    )


recipes = []
for dish_name, slug in zip(dishes["name"], dishes["slug"]):
    # Remove the phrase "How to Make " from the recipe name (ignoring case)
    recipe_name = _HOW_TO_MAKE.sub("", dish_name)

    details = ingredient_and_instructions[slug]
    ingredients = _format_ingredients(details["ingredient_sections"])
    instructions = _format_instructions(details["instructions"])

    # Construct the full response
    response = (
        f"Here's a recipe for {recipe_name}:\n"
        "Ingredients:\n"
        f"{ingredients}\n"
        "Instructions:\n"
        f"{instructions}\n"
        f"Enjoy your {recipe_name}!"
    )

    recipes.append(
        {
            "INSTRUCTION": random.choice(ONE_STEP_TEMPLATES).format(title=recipe_name),
            "RESPONSE": response,
            "SOURCE": data_source,
        }
    )
# Turn the list of row dicts into a DataFrame with the columns INSTRUCTION, RESPONSE and SOURCE.
recipes = pd.DataFrame(recipes)

# Show the first 5 rows left-aligned, with the newline characters in the
# RESPONSE column rendered as line breaks ("white-space: pre-wrap").
# max_colwidth=None means "no limit"; the older -1 spelling has been
# deprecated since pandas 1.0 in favour of None.
with pd.option_context("display.max_colwidth", None):
    display(
        recipes.head().style.set_properties(
            **{
                "text-align": "left",
                "white-space": "pre-wrap",
            }
        )
    )
# Write the DataFrame to a local Parquet file and load that file back as a
# `datasets.Dataset`, ready for upload to HF.
recipes.to_parquet("dataset.parquet", row_group_size=100, engine="pyarrow")
ds = Dataset.from_parquet("dataset.parquet")
# NOTE(review): this call is live - running the cell pushes the dataset to
# "dctanner/oa_recipes" on HF (presumably needs write access; confirm).
# Comment the line out to build the dataset without uploading it.
ds.push_to_hub("dctanner/oa_recipes")