data/datasets/cmu_wiki_qa/cmu_parser.ipynb
This notebook parses the corpus of manually generated factoid questions created from Wikipedia articles.
Acknowledgments: http://www.cs.cmu.edu/~ark/QA-data/
These data were collected by Noah Smith, Michael Heilman, Rebecca Hwa, Shay Cohen, Kevin Gimpel, and many students at Carnegie Mellon University and the University of Pittsburgh between 2008 and 2010.
Their research project was supported by NSF IIS-0713265 (to Smith), an NSF Graduate Research Fellowship (to Heilman), NSF IIS-0712810 and IIS-0745914 (to Hwa), and Institute of Education Sciences, U.S. Department of Education R305B040063 (to Carnegie Mellon).
# uncomment and run the lines below to set up the environment if running in Colab
# !git clone https://github.com/LAION-AI/Open-Assistant.git
# %cd Open-Assistant/data/datasets/cmu_wiki_qa/
# !pip install -r requirements.txt
# download the dataset
import requests
import tarfile
with requests.get("http://www.cs.cmu.edu/~ark/QA-data/data/Question_Answer_Dataset_v1.2.tar.gz", stream=True) as resp:
    resp.raise_for_status()
    # mode "r|gz" streams the gzipped tar straight from the (non-seekable) HTTP response
    with tarfile.open(fileobj=resp.raw, mode="r|gz") as f:
        f.extractall()  # extract to ./Question_Answer_Dataset_v1.2
print("Done.")
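If the download succeeded, the archive should unpack into the folder used below, with subfolders that each hold a question_answer_pairs.txt file (the layout the parsing cell further down expects). An optional sanity check of the extracted layout; it assumes nothing beyond the folder name used above:
import os
root = "Question_Answer_Dataset_v1.2"
# print each extracted subfolder together with a few of the files it contains
for entry in sorted(os.listdir(root)):
    path = os.path.join(root, entry)
    if os.path.isdir(path):
        print(entry, "->", sorted(os.listdir(path))[:5])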
# global settings
FOLDER = "Question_Answer_Dataset_v1.2" # input folder containing subfolders with Q&A csv files
CSV = "question_answer_pairs.txt"  # tab-separated Q&A file to look for in each of these subfolders
SOURCE = "wikipedia/cmu_qa" # source to use in the parquet for each row
MUST_INCLUDE_ENTITY = True # questions must include a reference to the article
MUST_BE_QUALITY = True # only quality answers are accepted
import os
import io
import re
import json
from tqdm import tqdm
import numpy as np
import pandas as pd
# get all files and concatenate them into a single df
data = None
for file in os.listdir(FOLDER):
    if os.path.isdir(os.path.join(FOLDER, file)):
        for match in os.listdir(os.path.join(FOLDER, file)):
            if match.strip() == CSV:
                match = os.path.join(FOLDER, file, match)
                # the files are Latin-1 encoded, tab-separated text
                with open(match, "r", encoding="ISO-8859-1") as m:
                    raw = m.read()
                data = pd.concat([data, pd.read_csv(io.StringIO(raw), sep="\t")])
data = data.drop_duplicates()
data.head()
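A quick check, before cleaning, that the concatenated frame has the columns the next cell relies on (the names below are exactly those referenced in the cleaning loop):
# columns the cleaning step below expects to find in the concatenated dataframe
expected = ["ArticleTitle", "Question", "Answer", "ArticleFile", "DifficultyFromQuestioner", "DifficultyFromAnswerer"]
missing = [col for col in expected if col not in data.columns]
assert not missing, f"missing columns: {missing}"
print(f"{len(data)} rows, columns: {list(data.columns)}")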
# clean up the df, remove duplicates and answers that are way too short, etc.
clean = {col: [] for col in ["INSTRUCTION", "RESPONSE", "SOURCE", "METADATA"]}
for name, group in tqdm(data.groupby("ArticleTitle")):
    entity = "|".join([re.escape(item) for item in name.lower().strip().split("_")])
    for question, qgroup in group.groupby("Question"):
        # keep only questions that mention the article title directly (instead of referring to it
        # with a pronoun such as he, she, it, etc.); e.g. an article titled "Alan_Turing" yields
        # the pattern (?i)\b(alan|turing), which the question must match
        if MUST_INCLUDE_ENTITY and not re.findall(r"(?i)\b(" + entity + r").?", question):
            continue
        # prefer the longest answer
        qgroup = qgroup.sort_values(by="Answer", key=lambda x: x.str.len(), ascending=False)
        for _, row in qgroup.iterrows():
            if not row["Answer"] or pd.isna(row["Answer"]):
                continue  # no answer given
            quality = True
            if re.findall(r"(?i)^(yes|no)\W*$", str(row["Answer"])):
                quality = False  # bare yes / no answer
            elif len(row["Answer"]) < 4:
                quality = False  # very short answer
            elif len(row["Answer"].split()) < 2:
                quality = False  # one-word answer
            if not quality and MUST_BE_QUALITY:
                continue  # skip low-quality answers
            clean["INSTRUCTION"].append(question.strip())
            clean["RESPONSE"].append(row["Answer"])
            clean["SOURCE"].append(SOURCE)
            clean["METADATA"].append(
                json.dumps(
                    {
                        "article_title": name,
                        "article_file": row["ArticleFile"],
                        "difficulty_questioner": row["DifficultyFromQuestioner"],
                        "difficulty_answerer": row["DifficultyFromAnswerer"],
                        # "quality": quality,
                    }
                )
            )
            break  # include only one answer for each question
# remove accidental duplicates, keeping the longest response for each question
clean = pd.DataFrame(clean)
clean.sort_values(by="RESPONSE", key=lambda x: x.str.len(), inplace=True, ascending=False)
clean.drop_duplicates(subset=["INSTRUCTION"], inplace=True)
clean.sort_index(inplace=True)
clean.head()
print(f"Retrieved {len(clean) / len(data) * 100.:.2f}% of all questions ({len(clean)})")