notebooks/TSSB-3M-bugs-dataset/TSSB-3M-bugs_dataset.ipynb
The ManySStuBs4J corpus is a collection of simple fixes to Java bugs, designed for evaluating program repair techniques. We collect all bug-fixing changes using the SZZ heuristic, and then filter these to obtain a data set of small bug fix changes.
!wget https://zenodo.org/record/5845439/files/tssb_data_3M.zip?download=1
!mv tssb_data_3M.zip?download=1 tssb_data_3M.zip
!unzip tssb_data_3M.zip
import pandas
# Index of the dataset shard (tssb_data_3M/file-<FILENUM>.jsonl.gz) loaded below.
FILENUM = 32
# Read that shard as JSON Lines (one JSON record per line) into a DataFrame.
table = pandas.read_json(f"tssb_data_3M/file-{FILENUM}.jsonl.gz", lines=True)
# Bare expression: as the last statement of a notebook cell it displays the DataFrame.
table
# Collect the distinct values of the "project" column and the total row count
# across every shard of the dataset (file-0 ... file-33).
urls = set()
total_rows = 0
for num in range(0, 34):
    # Read the shard that belongs to this iteration, so that each of the
    # 34 files contributes exactly once to the totals.
    table = pandas.read_json(f"tssb_data_3M/file-{num}.jsonl.gz", lines=True)
    total_rows += len(table)
    # Column-wise update avoids a slow Python-level loop over every row.
    urls.update(table["project"])
print(len(urls))
print("total rows", total_rows)
print(urls)
!pip install PyGithub
from github import Github
# PyGithub client created without passing any credentials.
# NOTE(review): presumably unauthenticated, and so subject to stricter API rate limits — confirm.
g = Github()
# TODO: find a way to get a commit from its SHA
# 1. Use the GitHub API
# 2. Download the repos with their history
# 3. Web scraping
import re
# Prompt/response text emitted once per bug fix. It has two positional "{}"
# slots: the first follows "Find the bug in the following code:" and the
# second follows "The fixed code is:".
TEMPLATE = """User: Find the bug in the following code:
{}
Reply: The fixed code is:
{}
"""
def remove_starting_plus_minus(text):
    """Return ``text`` without its leading diff marker, if it has one.

    Only a single leading "+" or "-" is dropped; any other text, including
    the empty string, is returned unchanged.
    """
    has_marker = text.startswith(("+", "-"))
    return text[1:] if has_marker else text
# Unified-diff hunk header at the start of a line, e.g. "@@ -10,7 +10,8 @@".
# Anchoring and spelling out the header shape keeps the match from swallowing
# code that merely contains "@@" later on the same line.
_HUNK_HEADER_RE = re.compile(r"^@@ -\d+(?:,\d+)? \+\d+(?:,\d+)? @@", re.MULTILINE)


def remove_extraneous_diff_info(text):
    """Remove unified-diff hunk headers ("@@ -a,b +c,d @@") from ``text``.

    Args:
        text: A single diff line or a multi-line diff string.

    Returns:
        ``text`` with every hunk header found at the start of a line removed.
        Anything that follows the header on the same line is kept, and lines
        that are not hunk headers are returned untouched even if they
        contain "@@".
    """
    return _HUNK_HEADER_RE.sub("", text)
def clean(text):
    """Normalize one diff line for use in a prompt.

    The leading "+"/"-" marker is removed first, then the hunk-header
    information is removed from what remains.
    """
    without_marker = remove_starting_plus_minus(text)
    return remove_extraneous_diff_info(without_marker)
def write_prompts(num, table):
    """Write one bug-fix prompt per row of ``table`` to a text file.

    The output goes to ``generated_bugfix_prompts/prompts_{num}.txt``; that
    directory must already exist. Any existing file is overwritten.

    Args:
        num: Shard index, used only to name the output file.
        table: Table with a ``"diff"`` column holding unified-diff strings.
    """
    # Explicit UTF-8 so that non-ASCII characters in the diffs do not depend
    # on the platform's default encoding.
    with open(f"generated_bugfix_prompts/prompts_{num}.txt", "w", encoding="utf-8") as f:
        for diff in table["diff"]:
            diff_lines = diff.split("\n")
            # Lines starting with "+" were added by the fix; dropping them
            # leaves the old code, which still contains the bug.
            buggy = "\n".join(
                clean(line) for line in diff_lines if not line.startswith("+")
            )
            # Lines starting with "-" were removed by the fix; dropping them
            # leaves the new, fixed code.
            fixed = "\n".join(
                clean(line) for line in diff_lines if not line.startswith("-")
            )
            f.write(TEMPLATE.format(buggy, fixed))
!mkdir generated_bugfix_prompts
# Produce one prompt file per dataset shard (file-0 ... file-33).
for num in range(0, 34):
    table = pandas.read_json(
        f"tssb_data_3M/file-{num}.jsonl.gz",
        lines=True,
    )
    write_prompts(num=num, table=table)
!zip -r generated_bugfix_prompts.zip generated_bugfix_prompts