data/datasets/cmu_wiki_qa/cmu_parser.ipynb
This notebook parses the corpus of manually generated factoid questions created from Wikipedia articles.
Acknowledgments: http://www.cs.cmu.edu/~ark/QA-data/
These data were collected by Noah Smith, Michael Heilman, Rebecca Hwa, Shay Cohen, Kevin Gimpel, and many students at Carnegie Mellon University and the University of Pittsburgh between 2008 and 2010.
Their research project was supported by NSF IIS-0713265 (to Smith), an NSF Graduate Research Fellowship (to Heilman), NSF IIS-0712810 and IIS-0745914 (to Hwa), and Institute of Education Sciences, U.S. Department of Education R305B040063 (to Carnegie Mellon).
# uncomment and run the lines below to set up the environment if running in Colab
# !git clone https://github.com/LAION-AI/Open-Assistant.git
# %cd Open-Assistant/data/datasets/cmu_wiki_qa/
# !pip install -r requirements.txt
# download the dataset
import requests
import tarfile
with requests.get("http://www.cs.cmu.edu/~ark/QA-data/data/Question_Answer_Dataset_v1.2.tar.gz", stream=True) as resp:
    resp.raise_for_status()
    # mode "r|gz" streams the gzipped tar straight from the (non-seekable) HTTP response
    with tarfile.open(fileobj=resp.raw, mode="r|gz") as f:
        f.extractall()  # extract to ./Question_Answer_Dataset_v1.2
print("Done.")
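If the download succeeded, the archive should unpack into the folder used below, with subfolders that each hold a question_answer_pairs.txt file (the layout the parsing cell further down expects). An optional sanity check of the extracted layout; it assumes nothing beyond the folder name used above:
import os
root = "Question_Answer_Dataset_v1.2"
# print each extracted subfolder together with a few of the files it contains
for entry in sorted(os.listdir(root)):
    path = os.path.join(root, entry)
    if os.path.isdir(path):
        print(entry, "->", sorted(os.listdir(path))[:5])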
# global settings
FOLDER = "Question_Answer_Dataset_v1.2" # input folder containing subfolders with Q&A csv files
CSV = "question_answer_pairs.txt"  # tab-separated Q&A file to look for in each of these subfolders
SOURCE = "wikipedia/cmu_qa" # source to use in the parquet for each row
MUST_INCLUDE_ENTITY = True # questions must include a reference to the article
MUST_BE_QUALITY = True # only quality answers are accepted
import os
import io
import re
import json
from tqdm import tqdm
import numpy as np
import pandas as pd
# get all files and concatenate them into a single df
data = None
for file in os.listdir(FOLDER):
    if os.path.isdir(os.path.join(FOLDER, file)):
        for match in os.listdir(os.path.join(FOLDER, file)):
            if match.strip() == CSV:
                match = os.path.join(FOLDER, file, match)
                # the files are Latin-1 encoded, tab-separated text
                with open(match, "r", encoding="ISO-8859-1") as m:
                    raw = m.read()
                data = pd.concat([data, pd.read_csv(io.StringIO(raw), sep="\t")])
data = data.drop_duplicates()
data.head()
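A quick check, before cleaning, that the concatenated frame has the columns the next cell relies on (the names below are exactly those referenced in the cleaning loop):
# columns the cleaning step below expects to find in the concatenated dataframe
expected = ["ArticleTitle", "Question", "Answer", "ArticleFile", "DifficultyFromQuestioner", "DifficultyFromAnswerer"]
missing = [col for col in expected if col not in data.columns]
assert not missing, f"missing columns: {missing}"
print(f"{len(data)} rows, columns: {list(data.columns)}")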
# clean up the df, remove duplicates and answers that are way too short, etc.
clean = {col: [] for col in ["INSTRUCTION", "RESPONSE", "SOURCE", "METADATA"]}
for name, group in tqdm(data.groupby("ArticleTitle")):
    entity = "|".join([re.escape(item) for item in name.lower().strip().split("_")])
    for question, qgroup in group.groupby("Question"):
        # keep only questions that mention the article title directly (instead of referring to it
        # with a pronoun such as he, she, it, etc.); e.g. an article titled "Alan_Turing" yields
        # the pattern (?i)\b(alan|turing), which the question must match
        if MUST_INCLUDE_ENTITY and not re.findall(r"(?i)\b(" + entity + r").?", question):
            continue
        # prefer the longest answer
        qgroup = qgroup.sort_values(by="Answer", key=lambda x: x.str.len(), ascending=False)
        for _, row in qgroup.iterrows():
            if not row["Answer"] or pd.isna(row["Answer"]):
                continue  # no answer given
            quality = True
            if re.findall(r"(?i)^(yes|no)\W*$", str(row["Answer"])):
                quality = False  # bare yes / no answer
            elif len(row["Answer"]) < 4:
                quality = False  # very short answer
            elif len(row["Answer"].split()) < 2:
                quality = False  # one-word answer
            if not quality and MUST_BE_QUALITY:
                continue  # skip low-quality answers
            clean["INSTRUCTION"].append(question.strip())
            clean["RESPONSE"].append(row["Answer"])
            clean["SOURCE"].append(SOURCE)
            clean["METADATA"].append(
                json.dumps(
                    {
                        "article_title": name,
                        "article_file": row["ArticleFile"],
                        "difficulty_questioner": row["DifficultyFromQuestioner"],
                        "difficulty_answerer": row["DifficultyFromAnswerer"],
                        # "quality": quality,
                    }
                )
            )
            break  # include only one answer for each question
# remove accidental duplicates, keeping the longest response for each question
clean = pd.DataFrame(clean)
clean.sort_values(by="RESPONSE", key=lambda x: x.str.len(), inplace=True, ascending=False)
clean.drop_duplicates(subset=["INSTRUCTION"], inplace=True)
clean.sort_index(inplace=True)
clean.head()
print(f"Retrieved {len(clean) / len(data) * 100.:.2f}% of all questions ({len(clean)})")