data/datasets/ubuntu_dialogue_qa/ubuntu_parser.ipynb
Selects only the short Q&A discussions from the Ubuntu Dialogue Corpus.
Acknowledgments: https://www.kaggle.com/datasets/rtatman/ubuntu-dialogue-corpus
This dataset was originally collected by Ryan Lowe, Nissan Pow, Iulian V. Serban, and Joelle Pineau. It is made available here under the Apache License 2.0. If you use this data in your work, please include the following citation:
Ryan Lowe, Nissan Pow, Iulian V. Serban and Joelle Pineau, "The Ubuntu Dialogue Corpus: A Large Dataset for Research in Unstructured Multi-Turn Dialogue Systems", SIGDial 2015. URL: http://www.sigdial.org/workshops/conference16/proceedings/pdf/SIGDIAL40.pdf
# uncomment and run below lines to set up if running in colab
# !git clone https://github.com/LAION-AI/Open-Assistant.git
# %cd Open-Assistant/data/datasets/ubuntu_dialogue_qa/
# !pip install -r requirements.txt
# download the data; you can get your kaggle.json file from your account page: https://www.kaggle.com/me/account
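The kaggle package looks for credentials in ~/.kaggle/kaggle.json and authenticates as soon as it is imported, so the file must be in place before the next cell runs. A minimal sketch for Colab, assuming kaggle.json has been uploaded to the working directory:

# copy the uploaded kaggle.json to the location the kaggle package expects
import os
import shutil
import stat

os.makedirs(os.path.expanduser("~/.kaggle"), exist_ok=True)
shutil.copy("kaggle.json", os.path.expanduser("~/.kaggle/kaggle.json"))
# restrict permissions, otherwise the client warns that the key is readable by other users
os.chmod(os.path.expanduser("~/.kaggle/kaggle.json"), stat.S_IRUSR | stat.S_IWUSR)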
import kaggle
kaggle.api.dataset_download_files("rtatman/ubuntu-dialogue-corpus", unzip=True)
# global settings
FOLDER = "Ubuntu-dialogue-corpus" # input folder containing ubuntu dialogue csv files
SOURCE = "ubuntu-dialogue" # source to use in the parquet for each row
import os
import re
import json
from tqdm import tqdm
import pandas as pd
def load(file):
    """Load one dialogue CSV and build a unique dialogue id from folder + dialogueID."""
    data = pd.read_csv(file)
    data["date"] = pd.to_datetime(data["date"])
    # e.g. folder "135" + dialogueID "1.tsv" -> id "135_1"
    data["id"] = data[["folder", "dialogueID"]].apply(lambda x: f"{x[0]}_{x[1].split('.tsv')[0]}", axis=1)
    data.drop(columns=["folder", "dialogueID"], inplace=True)
    return data
# load and concatenate every dialogue CSV in the input folder
frames = [load(os.path.join(FOLDER, f)) for f in tqdm(sorted(os.listdir(FOLDER))) if f.endswith(".csv")]
data = pd.concat(frames, ignore_index=True)
data.head()
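Each row of the concatenated frame is one chat message. A quick sanity check of the schema the parsing loop below relies on (the column names date, from, to and text come from the Kaggle CSVs; id was added in load()):

# verify the columns the parsing loop depends on
assert {"date", "from", "to", "text", "id"}.issubset(data.columns)
print(len(data), "messages across", data["id"].nunique(), "dialogues")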
# clean up the df: remove duplicates, answers that are way too short, etc.
clean = {col: [] for col in ["INSTRUCTION", "RESPONSE", "SOURCE", "METADATA"]}
for name, group in tqdm(data.groupby("id")):
    if len(group) < 3 or len(group) > 5:  # keep only dialogues of 3, 4 or 5 messages
        continue  # longer back and forth will most likely not be parsed correctly
    group.sort_values(by=["date"], ascending=True, inplace=True)
    raw_instruction = group["text"].values[0]
    instruction_user = group["from"].values[0]
    # check for NaN before the str() cast, otherwise NaN silently becomes the string "nan"
    if pd.isna(raw_instruction):
        continue
    instruction = str(raw_instruction).strip()
    if not instruction or len(instruction) < 12:
        continue
    if not re.findall(
        r"(?i)(?:\?|what|who|where|why|when|how|whose|explain|tell|does|way|can|know|able|best|recommend)", instruction
    ):
        continue  # probably not a question
    # all user names seen in this dialogue; answers mentioning one of them are dropped below
    all_recipients = "|".join(
        [re.escape(item) for item in set(group["to"].tolist() + group["from"].tolist()) if pd.notna(item)]
    )
    response = None
    response_user = None
    for _, row in group.iterrows():
        if row["to"] == instruction_user:
            candidate = str(row["text"]).strip()
            if (
                not row["text"]
                or pd.isna(row["text"])
                or re.findall(r"(?i)^(yes|yep|yeah|no|nah|nope|sure|yes\s*sir)\W*$", candidate)
            ):
                continue  # answer is not expressive
            if len(candidate) < 3:
                continue  # too short
            if re.findall(r"(?i)(?:wrong|o[nf].*?topic|else\s*where|ask.+?in|\#\w+|google|you.+?mean)", candidate):
                continue  # probably off topic
            if re.findall(r"\b(" + all_recipients + r")\b", candidate):
                continue  # answer includes user name(s)
            response = candidate
            response_user = row["from"]
        elif response_user is not None and row["to"] == response_user and row["from"] == instruction_user:
            # keep the pair only if the asker replies to the answerer with a confirmation
            if re.findall(r"(?i)(?:thank|thx|works|working|great)", str(row["text"])):
                clean["INSTRUCTION"].append(instruction)
                clean["RESPONSE"].append(response)
                clean["SOURCE"].append(SOURCE)
                clean["METADATA"].append(json.dumps({"user_question": instruction_user, "user_answer": response_user}))
                break
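As a quick sanity check of the question heuristic used above, the regex can be probed directly; the sample messages here are made up for illustration:

# messages matching the question heuristic are kept, others are skipped
question_re = r"(?i)(?:\?|what|who|where|why|when|how|whose|explain|tell|does|way|can|know|able|best|recommend)"
for msg in ["how do I mount an NTFS drive", "my sound is broken", "can anyone recommend an IRC client"]:
    print(repr(msg), "->", bool(re.findall(question_re, msg)))
# expected output: True, False, True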
clean = pd.DataFrame(clean)
# sort by response length so drop_duplicates keeps the longest response per instruction,
# then restore the original row order
clean.sort_values(by="RESPONSE", key=lambda x: x.str.len(), inplace=True, ascending=False)
clean.drop_duplicates(subset=["INSTRUCTION"], inplace=True)
clean.sort_index(inplace=True)
clean.head()
print(f"Retrieved {len(clean) / len(data['id'].unique()) * 100.:.2f}% of all questions ({len(clean)})") # 19921
# print a sample of the extracted Q&A pairs
for index, row in clean.iterrows():
    print("Q >", row["INSTRUCTION"])
    print("A >", row["RESPONSE"])
    print()
    if index > 100:
        break
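The SOURCE constant above refers to a parquet output for each row; a minimal sketch of that final save step, assuming pyarrow is installed (the output file name is a placeholder):

# persist the cleaned Q&A pairs; the file name is chosen here for illustration
clean.to_parquet("ubuntu_dialogue_qa.parquet", engine="pyarrow", index=False)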