data/datasets/ubuntu_dialogue_qa/ubuntu_parser.ipynb
Selects only the short Q&A discussions from the Ubuntu Dialogue Corpus.
Acknowledgments: https://www.kaggle.com/datasets/rtatman/ubuntu-dialogue-corpus
This dataset was originally collected by Ryan Lowe, Nissan Pow, Iulian V. Serban, and Joelle Pineau. It is made available here under the Apache License 2.0. If you use this data in your work, please include the following citation:
Ryan Lowe, Nissan Pow, Iulian V. Serban and Joelle Pineau, "The Ubuntu Dialogue Corpus: A Large Dataset for Research in Unstructured Multi-Turn Dialogue Systems", SIGDial 2015. URL: http://www.sigdial.org/workshops/conference16/proceedings/pdf/SIGDIAL40.pdf
# uncomment and run below lines to set up if running in colab
# !git clone https://github.com/LAION-AI/Open-Assistant.git
# %cd Open-Assistant/data/datasets/ubuntu_dialogue_qa/
# !pip install -r requirements.txt
# download the data; you can get your kaggle.json file from your account page: https://www.kaggle.com/me/account
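The kaggle package looks for credentials in ~/.kaggle/kaggle.json and authenticates as soon as it is imported, so the file must be in place before the next cell runs. A minimal sketch for Colab, assuming kaggle.json has been uploaded to the working directory:

# copy the uploaded kaggle.json to the location the kaggle package expects
import os
import shutil
import stat

os.makedirs(os.path.expanduser("~/.kaggle"), exist_ok=True)
shutil.copy("kaggle.json", os.path.expanduser("~/.kaggle/kaggle.json"))
# restrict permissions, otherwise the client warns that the key is readable by other users
os.chmod(os.path.expanduser("~/.kaggle/kaggle.json"), stat.S_IRUSR | stat.S_IWUSR)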
import kaggle
kaggle.api.dataset_download_files("rtatman/ubuntu-dialogue-corpus", unzip=True)
# global settings
FOLDER = "Ubuntu-dialogue-corpus" # input folder containing ubuntu dialogue csv files
SOURCE = "ubuntu-dialogue" # source to use in the parquet for each row
import os
import re
import json
from tqdm import tqdm
import pandas as pd
def load(file):
    """Load one dialogue CSV and build a unique dialogue id from folder + dialogueID."""
    data = pd.read_csv(file)
    data["date"] = pd.to_datetime(data["date"])
    # e.g. folder "135" + dialogueID "1.tsv" -> id "135_1"
    data["id"] = data[["folder", "dialogueID"]].apply(lambda x: f"{x[0]}_{x[1].split('.tsv')[0]}", axis=1)
    data.drop(columns=["folder", "dialogueID"], inplace=True)
    return data
# load and concatenate every dialogue CSV in the input folder
frames = [load(os.path.join(FOLDER, f)) for f in tqdm(sorted(os.listdir(FOLDER))) if f.endswith(".csv")]
data = pd.concat(frames, ignore_index=True)
data.head()
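Each row of the concatenated frame is one chat message. A quick sanity check of the schema the parsing loop below relies on (the column names date, from, to and text come from the Kaggle CSVs; id was added in load()):

# verify the columns the parsing loop depends on
assert {"date", "from", "to", "text", "id"}.issubset(data.columns)
print(len(data), "messages across", data["id"].nunique(), "dialogues")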
# clean up the df: remove duplicates, answers that are way too short, etc.
clean = {col: [] for col in ["INSTRUCTION", "RESPONSE", "SOURCE", "METADATA"]}
for name, group in tqdm(data.groupby("id")):
    if len(group) < 3 or len(group) > 5:  # keep only dialogues of 3, 4 or 5 messages
        continue  # longer back and forth will most likely not be parsed correctly
    group.sort_values(by=["date"], ascending=True, inplace=True)
    raw_instruction = group["text"].values[0]
    instruction_user = group["from"].values[0]
    # check for NaN before the str() cast, otherwise NaN silently becomes the string "nan"
    if pd.isna(raw_instruction):
        continue
    instruction = str(raw_instruction).strip()
    if not instruction or len(instruction) < 12:
        continue
    if not re.findall(
        r"(?i)(?:\?|what|who|where|why|when|how|whose|explain|tell|does|way|can|know|able|best|recommend)", instruction
    ):
        continue  # probably not a question
    # all user names seen in this dialogue; answers mentioning one of them are dropped below
    all_recipients = "|".join(
        [re.escape(item) for item in set(group["to"].tolist() + group["from"].tolist()) if pd.notna(item)]
    )
    response = None
    response_user = None
    for _, row in group.iterrows():
        if row["to"] == instruction_user:
            candidate = str(row["text"]).strip()
            if (
                not row["text"]
                or pd.isna(row["text"])
                or re.findall(r"(?i)^(yes|yep|yeah|no|nah|nope|sure|yes\s*sir)\W*$", candidate)
            ):
                continue  # answer is not expressive
            if len(candidate) < 3:
                continue  # too short
            if re.findall(r"(?i)(?:wrong|o[nf].*?topic|else\s*where|ask.+?in|\#\w+|google|you.+?mean)", candidate):
                continue  # probably off topic
            if re.findall(r"\b(" + all_recipients + r")\b", candidate):
                continue  # answer includes user name(s)
            response = candidate
            response_user = row["from"]
        elif response_user is not None and row["to"] == response_user and row["from"] == instruction_user:
            # keep the pair only if the asker replies to the answerer with a confirmation
            if re.findall(r"(?i)(?:thank|thx|works|working|great)", str(row["text"])):
                clean["INSTRUCTION"].append(instruction)
                clean["RESPONSE"].append(response)
                clean["SOURCE"].append(SOURCE)
                clean["METADATA"].append(json.dumps({"user_question": instruction_user, "user_answer": response_user}))
                break
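As a quick sanity check of the question heuristic used above, the regex can be probed directly; the sample messages here are made up for illustration:

# messages matching the question heuristic are kept, others are skipped
question_re = r"(?i)(?:\?|what|who|where|why|when|how|whose|explain|tell|does|way|can|know|able|best|recommend)"
for msg in ["how do I mount an NTFS drive", "my sound is broken", "can anyone recommend an IRC client"]:
    print(repr(msg), "->", bool(re.findall(question_re, msg)))
# expected output: True, False, True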
clean = pd.DataFrame(clean)
# sort by response length so drop_duplicates keeps the longest response per instruction,
# then restore the original row order
clean.sort_values(by="RESPONSE", key=lambda x: x.str.len(), inplace=True, ascending=False)
clean.drop_duplicates(subset=["INSTRUCTION"], inplace=True)
clean.sort_index(inplace=True)
clean.head()
print(f"Retrieved {len(clean) / len(data['id'].unique()) * 100.:.2f}% of all questions ({len(clean)})") # 19921
# print a sample of the extracted Q&A pairs
for index, row in clean.iterrows():
    print("Q >", row["INSTRUCTION"])
    print("A >", row["RESPONSE"])
    print()
    if index > 100:
        break
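The SOURCE constant above refers to a parquet output for each row; a minimal sketch of that final save step, assuming pyarrow is installed (the output file name is a placeholder):

# persist the cleaned Q&A pairs; the file name is chosen here for illustration
clean.to_parquet("ubuntu_dialogue_qa.parquet", engine="pyarrow", index=False)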