Back to Open Assistant

Convert To Instruction Format

notebooks/data-augmentation/movie-dialogs/convert-to-instruction-format.ipynb

16.0 KB
Original Source

Imports

python
from datasets import load_dataset
import numpy as np
import json
from tqdm import tqdm

IMDB = 7.0

Dialog templates

Templates for converting dialogs to prompts

python
DIALOG_TEMPLATES = {
    ### template for 4+ line dialogs
    "four_more_lines": [
        """
Here's a {template} between {char1} and {char2} in a scene from a {genre} movie
    {dialogue1}
User : Can you continue the {template}
Assistant : Sure, the next dialogue for this scene could be
    {dialogue2}
 """,
        """
    {dialogue1}
User : Can you provide more dialog assuming {genre} movie
    {dialogue2}
""",
        """
I'm trying to complete the dialog for my characters {char1} and {char2}. Here's the {template}, Please help me complete it
    {dialogue1}
Assistant : Sure
    {dialogue2}
""",
        """
User : Assume {char1} and {char2} are characters from a {genre} movie, continue the conversation between them
    {dialogue1}
Assistant : Sure
    {dialogue2}
""",
    ],
    ## template for 4 line dialogs
    "four_lines": [
        """
    {dialogue1}
User : provide a response assuming you're {char2}
Assistant : Sure
    {dialogue2}
""",
        """
    {dialogue1}
User : respond as {char2} to complete the conversation
Assistant : Sure
    {dialogue2}
""",
    ],
}
  • Download Cornell-movies dialog dataset
python
! wget wget https://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip
! unzip movie-corpus.zip -d ./Data/

Code

python
def get_movie_dialogs():
    with open("./Data/movie-corpus/utterances.jsonl", "r") as json_file:
        conversations = list(json_file)
    speakers = json.load(open("./Data/movie-corpus/speakers.json"))
    movie_dialog_dict = {}
    for dialog in tqdm(conversations):
        dialog = eval(dialog.replace("null", "None"))
        movie_dialog_dict[dialog["id"]] = {
            "characterName": speakers[dialog["speaker"]]["meta"]["character_name"],
            "text": dialog["text"],
            "characterID": dialog["speaker"],
        }

    return movie_dialog_dict
python
def get_dialogs(dialog_dict, start, end):
    dialog_list = []
    for idx in range(start, end + 1):
        dialog_list.append(dialog_dict[f"L{idx}"]["characterName"] + ": " + dialog_dict[f"L{idx}"]["text"])
    num_lines = len(dialog_list)

    assert num_lines >= 1, "Number of lines should be greater than one"

    if num_lines < 6:
        dialog1 = "\n    ".join(dialog_list[:-1])
        dialog2 = dialog_list[-1]
    else:
        dialog_len = np.random.randint(3, (num_lines // 2) + 1)
        dialog1 = "\n    ".join(dialog_list[:dialog_len])
        dialog2 = "\n    ".join(dialog_list[dialog_len:])

    return dialog1, dialog2


def choose_prompt(num_lines):
    assert num_lines >= 1, "Number of lines should be greater than one"

    if num_lines < 6:
        prompt = np.random.choice(DIALOG_TEMPLATES["four_lines"])

    else:
        prompt = np.random.choice(DIALOG_TEMPLATES["four_more_lines"])

    return prompt


def convert_to_prompts(dataset, movie_dialog_dict, output_dir=".", split="train"):
    with open(f"{output_dir}/{split}.jsonl", "w", encoding="utf8") as output:
        i = 0
        while i < len(dataset["train"]):
            data = dataset[split][i]
            if float(data["movieIMDBRating"].strip()) >= IMDB:
                max_lines = np.random.randint(7, 12)
                lineids = [int(lineid[1:]) for lineid in data["utterance"]["LineID"]]
                num_lines = len(lineids)
                char_ids = sorted([data["characterID1"].strip(), data["characterID1"].strip()])
                while num_lines < max_lines:
                    i += 1
                    data = dataset[split][i]
                    char_id_new = sorted([data["characterID1"].strip(), data["characterID1"].strip()])
                    ## make sure that characters are the same
                    if char_id_new == char_ids:
                        lineids_new = [int(lineid[1:]) for lineid in data["utterance"]["LineID"]]
                        if lineids_new[0] == (lineids[-1] + 1):  ##ensure continuity
                            lineids.extend(lineids_new)
                        else:
                            break
                    else:
                        break
                    num_lines = len(lineids)

                genre = "-".join(data["movieGenres"][:2])
                template = np.random.choice(["dialog", "script", "play"])
                char1 = movie_dialog_dict[f"L{lineids[0]}"]["characterName"]

                if num_lines < 6:
                    if num_lines % 2 == 0:
                        char2 = movie_dialog_dict[f"L{lineids[1]}"]["characterName"]
                    else:
                        char2 = char1
                else:
                    char2 = movie_dialog_dict[f"L{lineids[1]}"]["characterName"]

                dialogue1, dialogue2 = get_dialogs(movie_dialog_dict, lineids[0], lineids[-1])
                prompt = choose_prompt(num_lines)

                prompt = prompt.format(
                    char1=char1, char2=char2, dialogue1=dialogue1, dialogue2=dialogue2, genre=genre, template=template
                )
                output.write(f"{json.dumps({'conversation': prompt})}\n")
            i += 1
python
movie_dialog_dict = get_movie_dialogs()
dataset = load_dataset("cornell_movie_dialog")
python
convert_to_prompts(dataset, movie_dialog_dict)

Upload as HF Dataset

python
dataset_ = load_dataset("json", data_files={"train": "./train.jsonl"})
python
dataset_.push_to_hub("shahules786/OA-cornell-movies-dialog")

Load Dataset from HF

python
dataset_ = load_dataset("shahules786/OA-cornell-movies-dialog")
python
for i in range(10, 15):
    print("##")
    print(dataset_["train"][i]["conversation"])
python
dataset_["train"]