notebooks/data-augmentation/movie-dialogs/convert-to-instruction-format.ipynb
from datasets import load_dataset
import numpy as np
import json
from tqdm import tqdm
# Minimum IMDB rating (inclusive) a record's movie must have for its dialog to be
# converted; compared against the record's "movieIMDBRating" in convert_to_prompts().
IMDB = 7.0
Templates for converting dialogs to prompts
# Prompt templates grouped by dialog-length bucket; choose_prompt() picks one at random.
# Placeholders are filled in by convert_to_prompts() via str.format():
#   {template}  - one of "dialog", "script", "play"
#   {char1}, {char2} - character names
#   {genre}     - up to two movie genres joined with "-"
#   {dialogue1} - the opening lines, {dialogue2} - the lines that follow them
DIALOG_TEMPLATES = {
# Templates choose_prompt() uses when the dialog has six or more lines.
"four_more_lines": [
"""
Here's a {template} between {char1} and {char2} in a scene from a {genre} movie
{dialogue1}
User : Can you continue the {template}
Assistant : Sure, the next dialogue for this scene could be
{dialogue2}
""",
"""
{dialogue1}
User : Can you provide more dialog assuming {genre} movie
{dialogue2}
""",
"""
I'm trying to complete the dialog for my characters {char1} and {char2}. Here's the {template}, Please help me complete it
{dialogue1}
Assistant : Sure
{dialogue2}
""",
"""
User : Assume {char1} and {char2} are characters from a {genre} movie, continue the conversation between them
{dialogue1}
Assistant : Sure
{dialogue2}
""",
],
# Templates choose_prompt() uses when the dialog has fewer than six lines;
# here {dialogue2} is the single final line of the dialog (see get_dialogs()).
"four_lines": [
"""
{dialogue1}
User : provide a response assuming you're {char2}
Assistant : Sure
{dialogue2}
""",
"""
{dialogue1}
User : respond as {char2} to complete the conversation
Assistant : Sure
{dialogue2}
""",
],
}
# Fetch the movie-corpus archive and extract it under ./Data/
# (get_movie_dialogs() reads its files from ./Data/movie-corpus/).
# The command previously read "wget wget <url>", which made wget treat the
# second word "wget" as a URL and report a failed host lookup.
! wget https://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip
! unzip movie-corpus.zip -d ./Data/
def get_movie_dialogs(data_dir="./Data/movie-corpus"):
    """Load the movie-corpus utterances and index them by utterance id.

    Reads ``speakers.json`` and ``utterances.jsonl`` (one JSON object per
    line) from ``data_dir``.

    Args:
        data_dir: Directory containing the extracted corpus files. The default
            is the location the download step extracts to.

    Returns:
        dict mapping each utterance ``id`` to a dict with the keys
        ``characterName`` (looked up in the speakers file), ``text`` and
        ``characterID``.

    Raises:
        OSError: if either corpus file cannot be opened.
        json.JSONDecodeError: if a line or the speakers file is not valid JSON.
        KeyError: if an utterance references an unknown speaker or lacks a field.
    """
    # Context managers so both file handles are closed deterministically.
    with open(f"{data_dir}/speakers.json", "r", encoding="utf-8") as speakers_file:
        speakers = json.load(speakers_file)
    with open(f"{data_dir}/utterances.jsonl", "r", encoding="utf-8") as json_file:
        # Read all lines up front so the progress bar is given a sized sequence.
        conversations = list(json_file)

    movie_dialog_dict = {}
    for line in tqdm(conversations):
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # Parse with json.loads rather than eval(): eval executes file content as
        # code, and the old "null" -> "None" text replacement also rewrote any
        # occurrence of "null" inside the dialogue text itself.
        dialog = json.loads(line)
        speaker_id = dialog["speaker"]
        movie_dialog_dict[dialog["id"]] = {
            "characterName": speakers[speaker_id]["meta"]["character_name"],
            "text": dialog["text"],
            "characterID": speaker_id,
        }
    return movie_dialog_dict
def get_dialogs(dialog_dict, start, end):
    """Render lines ``L{start}``..``L{end}`` and split them into two parts.

    Each line is rendered as ``"<characterName>: <text>"`` and lines within a
    part are joined with ``"\\n "``.

    Args:
        dialog_dict: Mapping from ``"L<number>"`` to a dict with
            ``characterName`` and ``text`` entries.
        start: Number of the first line (inclusive).
        end: Number of the last line (inclusive).

    Returns:
        ``(dialog1, dialog2)``. With fewer than six lines, ``dialog2`` is the
        last line and ``dialog1`` everything before it. Otherwise the split
        point is drawn with ``np.random.randint(3, num_lines // 2 + 1)``.

    Raises:
        AssertionError: if the range contains no lines.
        KeyError: if a line id in the range is missing from ``dialog_dict``.
    """
    separator = "\n "
    entries = (dialog_dict[f"L{line_no}"] for line_no in range(start, end + 1))
    rendered = [entry["characterName"] + ": " + entry["text"] for entry in entries]

    total = len(rendered)
    assert total >= 1, "Number of lines should be greater than one"

    # Short dialogs keep only the final line as the continuation; longer ones
    # are cut at a random point no later than the midpoint.
    if total >= 6:
        split_at = np.random.randint(3, (total // 2) + 1)
    else:
        split_at = total - 1

    return separator.join(rendered[:split_at]), separator.join(rendered[split_at:])
def choose_prompt(num_lines):
    """Return a randomly chosen prompt template for a dialog of ``num_lines`` lines.

    Dialogs with fewer than six lines draw from ``DIALOG_TEMPLATES["four_lines"]``;
    all others draw from ``DIALOG_TEMPLATES["four_more_lines"]``. The draw uses
    ``np.random.choice``.

    Raises:
        AssertionError: if ``num_lines`` is less than one.
    """
    assert num_lines >= 1, "Number of lines should be greater than one"
    bucket = "four_lines" if num_lines < 6 else "four_more_lines"
    return np.random.choice(DIALOG_TEMPLATES[bucket])
def _line_numbers(record):
    """Return the integer part of every ``LineID`` in a record (``"L194"`` -> ``194``)."""
    return [int(lineid[1:]) for lineid in record["utterance"]["LineID"]]


def _character_pair(record):
    """Return the record's two stripped character ids, sorted so their order is irrelevant."""
    # NOTE(review): assumes records expose "characterID2" next to "characterID1",
    # as in the cornell_movie_dialog schema - confirm against the loaded dataset.
    return sorted([record["characterID1"].strip(), record["characterID2"].strip()])


def convert_to_prompts(dataset, movie_dialog_dict, output_dir=".", split="train"):
    """Write one prompt per qualifying conversation to ``{output_dir}/{split}.jsonl``.

    Records whose ``movieIMDBRating`` is below ``IMDB`` are skipped. For each
    remaining record a random target length (``np.random.randint(7, 12)``) is
    drawn and directly following records are merged in while they involve the
    same two characters and their first line id continues the current last
    line id. The merged lines are rendered with ``get_dialogs``, placed into a
    template from ``choose_prompt`` and written as ``{"conversation": ...}``,
    one JSON object per line.

    Args:
        dataset: Mapping from split name to an indexable, sized sequence of
            records with ``movieIMDBRating``, ``movieGenres``, ``characterID1``,
            ``characterID2`` and ``utterance["LineID"]`` fields.
        movie_dialog_dict: Mapping from ``"L<number>"`` to a dict holding at
            least ``characterName`` and ``text``.
        output_dir: Directory the output file is written to.
        split: Name of the split to convert; also used for the output file name.

    Returns:
        None. The result is the file written to disk.
    """
    records = dataset[split]
    # Bound the loop by the split being converted, not unconditionally by "train".
    total = len(records)
    with open(f"{output_dir}/{split}.jsonl", "w", encoding="utf8") as output:
        i = 0
        while i < total:
            data = records[i]
            # From here on ``i`` is the index of the next record not yet consumed.
            i += 1
            if float(data["movieIMDBRating"].strip()) < IMDB:
                continue
            lineids = _line_numbers(data)
            if not lineids:
                # Nothing to render for a record without utterance ids.
                continue
            max_lines = np.random.randint(7, 12)
            char_ids = _character_pair(data)

            # Peek at following records without consuming them: a record that
            # does not continue this conversation stays in place and is handled
            # by the outer loop as its own conversation. Checking ``i < total``
            # keeps the look-ahead inside the split.
            while len(lineids) < max_lines and i < total:
                candidate = records[i]
                # Make sure that the characters are the same.
                if _character_pair(candidate) != char_ids:
                    break
                candidate_ids = _line_numbers(candidate)
                # Ensure continuity of the line ids.
                if not candidate_ids or candidate_ids[0] != lineids[-1] + 1:
                    break
                lineids.extend(candidate_ids)
                i += 1

            num_lines = len(lineids)
            # ``data`` is still the record the conversation started from, so the
            # genre belongs to this conversation rather than to a peeked record.
            genre = "-".join(data["movieGenres"][:2])
            template = np.random.choice(["dialog", "script", "play"])
            char1 = movie_dialog_dict[f"L{lineids[0]}"]["characterName"]
            if num_lines < 6 and num_lines % 2 != 0:
                # Short dialog with an odd line count: the first line's
                # character name is reused for {char2}.
                char2 = char1
            else:
                char2 = movie_dialog_dict[f"L{lineids[1]}"]["characterName"]
            dialogue1, dialogue2 = get_dialogs(movie_dialog_dict, lineids[0], lineids[-1])
            prompt = choose_prompt(num_lines)
            prompt = prompt.format(
                char1=char1, char2=char2, dialogue1=dialogue1, dialogue2=dialogue2, genre=genre, template=template
            )
            output.write(f"{json.dumps({'conversation': prompt})}\n")
# Build the utterance lookup, load the source dataset and write ./train.jsonl.
movie_dialog_dict = get_movie_dialogs()
dataset = load_dataset("cornell_movie_dialog")
convert_to_prompts(dataset, movie_dialog_dict)

# Load the generated file as a dataset and publish it to the hub.
dataset_ = load_dataset("json", data_files={"train": "./train.jsonl"})
dataset_.push_to_hub("shahules786/OA-cornell-movies-dialog")

# Reload the published dataset and print a few samples as a sanity check.
dataset_ = load_dataset("shahules786/OA-cornell-movies-dialog")
for sample_idx in range(10, 15):
    print("##", dataset_["train"][sample_idx]["conversation"], sep="\n")
dataset_["train"]