# Source notebook: data/datasets/instructional_codesearchnet_python/GenerateOpenAssistantInstructionResponseFormat.ipynb
# Notebook dependency install (run in Jupyter): !pip install datasets tqdm lemminflect
import gzip
import json
import pandas as pd
from collections import defaultdict
from datasets import load_dataset
from tqdm.auto import tqdm
from random import random, randint
from lemminflect import getAllInflections, getLemma
# Instruction templates for "write this code" samples. The dataset summary is
# appended after the template, with its leading verb re-inflected (see
# process_summary) to fit the template grammatically. The grouping below by
# Penn Treebank verb tag MUST stay in sync with the index ranges tested in
# the generation loop (0-3 -> VBZ, 4-8 -> VBP, 9-12 -> VBG).
# NOTE(review): "OUPUT" is a typo for "OUTPUT", kept because the name is
# referenced later in the file.
ONE_STEP_OUPUT_CODE_TEMPLATES = [
    # Indices 0-3: third-person singular verb follows ("...where it <returns>")
    "Can you write a program in {lang} where it\n",
    "How would you implement a function in {lang} that\n",
    "Can you create a {lang} program that\n",
    "Can you implement a function in {lang} that\n",
    # Indices 4-8: base-form verb follows ("...to <return>")
    "Implement a function in {lang} to\n",
    "How would you code a program in {lang} to\n",
    "Write a {lang} script to\n",
    "Create a {lang} function to\n",
    "Write a {lang} program that can\n",
    # Indices 9-12: gerund follows ("...for <returning>")
    "Write a {lang} script for\n",
    "Write a {lang} function for\n",
    "Create a {lang} function for\n",
    "Implement a {lang} function for\n",
]
# Instruction templates for "explain this code" samples. The function source
# is appended after the template. Indices 0-5 expect a plain-text summary as
# the response; indices 6-9 ask for documentation, so the response is wrapped
# in triple quotes by the generation loop.
ONE_STEP_OUPUT_SUMMARY_TEMPLATES = [
    # Indices 0-5: general explanation -> response is the raw summary
    "Explain what the following {lang} code does\n",
    "Can you tell what is the following {lang} function doing\n",
    "Here you have a function in {lang}, explain what it does\n",
    "Make a summary of the following {lang} code\n",
    "Can you generate a brief explanation for the following {lang} code\n",
    "How would you explain what the following {lang} function does\n",
    # Indices 6-9: documentation request -> response is a docstring
    "Can you generate the documentation for the following {lang} function\n",
    "Create a docstring for the following {lang} code\n",
    "Given the following {lang} function, write the documentation\n",
    "Write a docstring for the following {lang} function\n",
]
def remove_docstring(code_function):
    """Return *code_function* with its docstring stripped.

    Assumes the docstring, if present, opens on the second line (the line
    immediately after the ``def`` header) and is delimited by triple double
    quotes. Functions without such a docstring are returned unchanged.

    :param code_function: full source text of a function definition
    :return: the source text without its docstring
    """
    triple_quotes = '"""'
    lines = code_function.split("\n")
    # Guard: a single-line function cannot carry a docstring (the original
    # code raised IndexError on lines[1] here).
    if len(lines) < 2:
        return code_function
    c = lines[1].count(triple_quotes)
    # No docstring present.
    if c == 0:
        return code_function
    # One-line docstring: opening and closing quotes on the same line.
    if c == 2:
        return "\n".join([lines[0]] + lines[2:])
    # Multi-line docstring: scan forward to the line with the closing quotes
    # and drop everything up to and including it.
    idx = 2
    while idx < len(lines) and triple_quotes not in lines[idx]:
        idx += 1
    return "\n".join([lines[0]] + lines[idx + 1:])
def process_summary(summary, tag):
    """Re-inflect the first word of *summary* to the given Penn Treebank tag.

    The first word is treated as a verb: it is lemmatized and then inflected
    to *tag* (e.g. "VBZ", "VBP", "VBG") so the summary reads naturally after
    an instruction template. If no such inflection exists, the word is simply
    lower-cased.

    :param summary: one-line natural-language summary starting with a verb
    :param tag: Penn Treebank verb tag to inflect the first word to
    :return: the summary with its first word re-inflected
    """
    words = summary.split()
    # Guard: an empty or whitespace-only summary has no verb to inflect
    # (the original code raised IndexError on words[0] here).
    if not words:
        return summary
    lemma = getLemma(words[0].lower(), upos="VERB")[0]
    inflections = getAllInflections(lemma)
    if tag not in inflections:
        # No inflection available for this lemma/tag: fall back to
        # lower-casing so the sentence still flows after the template.
        words[0] = words[0].lower()
    else:
        words[0] = inflections[tag][0]
    return " ".join(words)
lang = "Python 3"
data = defaultdict(list)
dataset = load_dataset("Nan-Do/code-search-net-python")
for data_point in tqdm(dataset["train"]):
    code = data_point["original_string"]
    summary = data_point["summary"]
    data["SOURCE"].append("codesearchnet")
    if random() > 0.5:
        # Code-generation sample: instruction = template + re-inflected
        # summary, response = original function source.
        idx = randint(0, len(ONE_STEP_OUPUT_CODE_TEMPLATES) - 1)
        # Index ranges mirror the template grouping by verb tag.
        if idx <= 3:
            tag = "VBZ"
        elif idx <= 8:
            tag = "VBP"
        else:
            tag = "VBG"
        instruction = ONE_STEP_OUPUT_CODE_TEMPLATES[idx].format(lang=lang) + process_summary(summary, tag)
        data["INSTRUCTION"].append(instruction)
        data["RESPONSE"].append(code)
    else:
        # Summary-generation sample: strip the docstring first, since the
        # docstring/summary is exactly what the model must produce.
        stripped_code = remove_docstring(code)
        idx = randint(0, len(ONE_STEP_OUPUT_SUMMARY_TEMPLATES) - 1)
        instruction = ONE_STEP_OUPUT_SUMMARY_TEMPLATES[idx].format(lang=lang) + stripped_code
        data["INSTRUCTION"].append(instruction)
        # Templates with index > 5 ask for documentation, so the expected
        # response is the summary wrapped as a docstring.
        response = summary if idx <= 5 else '"""' + summary + '"""'
        data["RESPONSE"].append(response)
df = pd.DataFrame(data=data)
df.to_parquet("instructional_dataset.parquet", row_group_size=100, engine="pyarrow", index=False)
from huggingface_hub import notebook_login
notebook_login()
from datasets import Dataset
# Round-trip through the parquet file so the pushed dataset is exactly
# what was written to disk above.
instructional_dataset = Dataset.from_parquet("instructional_dataset.parquet")
instructional_dataset.push_to_hub("Nan-Do/instructional_code-search-net-python")