Back to Open Assistant

GenerateOpenAssistantInstructionResponseFormat

data/datasets/instructional_codesearchnet_python/GenerateOpenAssistantInstructionResponseFormat.ipynb

0.0.14.0 KB
Original Source
python
!pip install datasets tqdm lemminflect
python
import gzip
import json
import pandas as pd

from collections import defaultdict
from datasets import load_dataset
from tqdm.auto import tqdm
from random import random, randint
from lemminflect import getAllInflections, getLemma

# Prompt templates for the code-generation direction (instruction =
# template + summary, response = code).  They are grouped by the Penn
# Treebank verb tag that the summary's leading verb must be inflected to
# (see process_summary and the idx -> tag mapping in the main loop):
# indices 0-3 -> VBZ, 4-8 -> VBP, 9-12 -> VBG.
# NOTE(review): "OUPUT" is a typo for "OUTPUT"; the name is kept as-is
# because it is referenced elsewhere in this file.
ONE_STEP_OUPUT_CODE_TEMPLATES = [
    # VBZ
    "Can you write a program in {lang} where it\n",
    "How would you implement a function in {lang} that\n",
    "Can you create a {lang} program that\n",
    "Can you implement a function in {lang} that\n",
    # VBP
    "Implement a function in {lang} to\n",
    "How would you code a program in {lang} to\n",
    "Write a {lang} script to\n",
    "Create a {lang} function to\n",
    "Write a {lang} program that can\n",
    # VBG
    "Write a {lang} script for\n",
    "Write a {lang} function for\n",
    "Create a {lang} function for\n",
    "Implement a {lang} function for\n",
]

# Prompt templates for the summarisation direction (instruction =
# template + code, response = summary).  Indices 0-5 ask for a plain
# explanation; indices 6-9 ask for a docstring, and the main loop wraps
# the response in triple quotes for those.
# NOTE(review): "OUPUT" is a typo for "OUTPUT"; the name is kept as-is
# because it is referenced elsewhere in this file.
ONE_STEP_OUPUT_SUMMARY_TEMPLATES = [
    # General answer
    "Explain what the following {lang} code does\n",
    "Can you tell what is the following {lang} function doing\n",
    "Here you have a function in {lang}, explain what it does\n",
    "Make a summary of the following {lang} code\n",
    "Can you generate a brief explanation for the following {lang} code\n",
    "How would you explain what the following {lang} function does\n",
    # Documentation
    "Can you generate the documentation for the following {lang} function\n",
    "Create a docstring for the following {lang} code\n",
    "Given the following {lang} function, write the documentation\n",
    "Write a docstring for the following {lang} function\n",
]


def remove_docstring(code_function):
    """Strip the docstring (if any) from a function's source code.

    Handles one-line and multi-line docstrings delimited by triple
    double quotes that start on the line immediately after the ``def``
    line.

    Args:
        code_function: Full source of a single function as a string.

    Returns:
        The source with the docstring removed, or the input unchanged
        when no docstring is present (or the input is malformed).
    """
    triple_quotes = '"""'
    lines = code_function.split("\n")

    # A docstring needs at least a def line plus one more line;
    # the original indexed lines[1] unconditionally and crashed here.
    if len(lines) < 2:
        return code_function

    c = lines[1].count(triple_quotes)
    # There is no docstring
    if c == 0:
        return code_function
    # One line docstring (>= 2 also covers pathological extra quotes
    # on the same line, which the original `== 2` check missed)
    if c >= 2:
        return "\n".join([lines[0]] + lines[2:])

    # Multi-line docstring: scan for the closing delimiter.
    idx = 2
    while idx < len(lines) and triple_quotes not in lines[idx]:
        idx += 1

    # Unterminated docstring: return the input untouched rather than
    # silently truncating the whole body down to the def line.
    if idx == len(lines):
        return code_function

    return "\n".join([lines[0]] + lines[idx + 1 :])


def process_summary(summary, tag):
    """Inflect the first word of *summary* to the given Penn verb tag.

    The leading word is lemmatised as a verb via lemminflect and
    re-inflected (e.g. "Returns" -> "return" for VBP); when the
    requested inflection does not exist the word is simply lower-cased.

    Args:
        summary: Natural-language description of a function.
        tag: Penn Treebank verb tag ("VBZ", "VBP" or "VBG").

    Returns:
        The summary with its first word adjusted, or the input
        unchanged when it contains no words.
    """
    words = summary.split()
    # Guard: an empty / whitespace-only summary would IndexError below.
    if not words:
        return summary

    lemma = getLemma(words[0].lower(), upos="VERB")[0]
    inflections = getAllInflections(lemma)

    if tag not in inflections:
        words[0] = words[0].lower()
    else:
        words[0] = inflections[tag][0]

    return " ".join(words)


# Build instruction/response pairs from the "Nan-Do/code-search-net-python"
# dataset.  Each row becomes either a code-generation task (instruction =
# template + summary, response = code) or a summarisation task
# (instruction = template + code, response = summary), chosen by a 50/50
# coin flip, then everything is written to a parquet file.
lang = "Python 3"
data = defaultdict(list)
dataset = load_dataset("Nan-Do/code-search-net-python")

for data_point in tqdm(dataset["train"]):
    code = data_point["original_string"]
    summary = data_point["summary"]
    data["SOURCE"].append("codesearchnet")
    # Generate code
    if random() > 0.5:
        idx = randint(0, len(ONE_STEP_OUPUT_CODE_TEMPLATES) - 1)
        # Template indices are grouped by the verb tense the prompt
        # expects of the summary's leading verb (see the template list).
        if 0 <= idx <= 3:
            tag = "VBZ"
        elif 4 <= idx <= 8:
            tag = "VBP"
        else:
            tag = "VBG"
        summary = process_summary(summary, tag)
        template = ONE_STEP_OUPUT_CODE_TEMPLATES[idx].format(lang=lang) + summary
        data["INSTRUCTION"].append(template)
        data["RESPONSE"].append(code)
    # Generate summary
    else:
        # We are generating the docstring or a summary so we better remove it from
        # the function
        code = remove_docstring(code)
        idx = randint(0, len(ONE_STEP_OUPUT_SUMMARY_TEMPLATES) - 1)
        template = ONE_STEP_OUPUT_SUMMARY_TEMPLATES[idx].format(lang=lang) + code
        data["INSTRUCTION"].append(template)
        # Indices 0-5 ask for a plain explanation; 6+ ask for a
        # docstring, so the answer is wrapped in triple quotes.
        if idx <= 5:
            data["RESPONSE"].append(summary)
        else:
            data["RESPONSE"].append('"""' + summary + '"""')

df = pd.DataFrame(data=data)
df.to_parquet("instructional_dataset.parquet", row_group_size=100, engine="pyarrow", index=False)
python
from huggingface_hub import notebook_login

# Interactive login widget: stores a Hugging Face access token so the
# push_to_hub call in the next cell is authenticated.
notebook_login()
python
from datasets import Dataset

# Reload the parquet file produced above and publish it to the Hub.
ds = Dataset.from_parquet("instructional_dataset.parquet")
ds.push_to_hub("Nan-Do/instructional_code-search-net-python")