notebooks/data-augmentation/wikidata-qa/wikidata.ipynb
See README.md for more information.
# Uncomment and run the lines below to set up the environment when running in Colab.
# !git clone https://github.com/LAION-AI/Open-Assistant.git
# %cd Open-Assistant/notebooks/data-augmentation/wikidata-qa
# !pip install -r requirements.txt
import requests
import json
import datetime
import time
from copy import deepcopy
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Optional, Any
class WikiGraph:
    """Fetch Wikidata entities and turn their claims into question/answer text.

    Entities are requested from the Wikidata ``api.php`` endpoint, reduced to a
    small ``desc`` record (label, aliases, description) plus a ``graph`` mapping
    property ids to lists of rendered values, and kept in ``self.cache`` as
    ``cache[language][qid][depth]``. When ``file`` is given, the cache is
    written to / read from that path as CSV.
    """

    HEADER = {
        "User-Agent": "Mozilla/5.0 (compatible; WikiDataGraphCrawler/0.1)",
    }
    TIMER = 200  # minimum wait in milliseconds between two API calls
    TIMEOUT = 30  # seconds before an unanswered API request is abandoned

    def __init__(self, file: Optional[str] = None, language: str = "en", seed: int = 12345678) -> None:
        """Create a crawler.

        Args:
            file: Optional path of the CSV cache; loaded immediately if it exists.
            language: Language code used for labels and templates (only "en").
            seed: Seed passed to ``np.random.seed`` (global NumPy RNG state).
        """
        self.file = file
        assert language in ("en",), f"This language is not yet supported: {language}"
        self.language = language
        self.cache = {self.language: {}}
        np.random.seed(seed)
        self.calls = 0
        self.last_call = 0
        if self.file:
            self._load()

    @staticmethod
    def _coin() -> bool:
        """Return a fair random boolean drawn from the global NumPy RNG."""
        return bool(np.random.choice([True, False]))

    @staticmethod
    def _normalize_qid(qid: Any) -> str:
        """Return ``qid`` as an upper-case string id; bare numbers get a "Q" prefix."""
        return qid.upper() if isinstance(qid, str) else f"Q{qid}"

    def _deepest(self, qid: str) -> int:
        """Return the largest depth cached for ``qid`` in the current language."""
        return int(max(self.cache[self.language][qid]))

    def _save(self) -> None:
        """Write the whole cache to ``self.file`` as CSV (no-op without a file)."""
        if not self.file:
            return
        columns = ["language", "qid", "depth", "desc", "graph"]
        rows = []
        for lang, entities in self.cache.items():
            for qid, depths in entities.items():
                for depth, entry in depths.items():
                    rows.append((lang, qid, depth, json.dumps(entry["desc"]), json.dumps(entry["graph"])))
        # Passing the columns explicitly keeps the header row even for an empty cache.
        pd.DataFrame(rows, columns=columns).to_csv(self.file, index=False)

    def _load(self) -> None:
        """Replace the cache with the content of ``self.file``.

        A missing or zero-byte file leaves the current cache untouched. The
        bucket for the active language always exists afterwards, so later
        lookups of ``self.cache[self.language]`` cannot fail on a cache file
        that has no rows for that language.
        """
        if not self.file:
            return
        try:
            df = pd.read_csv(self.file)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            return
        cache = {self.language: {}}
        for lang, qid, depth, desc, graph in zip(df["language"], df["qid"], df["depth"], df["desc"], df["graph"]):
            cache.setdefault(lang, {}).setdefault(qid, {})[int(depth)] = {
                "desc": desc if isinstance(desc, dict) else json.loads(desc),
                "graph": graph if isinstance(graph, dict) else json.loads(graph),
            }
        self.cache = cache

    def _get(self, params: dict) -> dict:
        """Call the Wikidata API with ``params`` and return the decoded JSON.

        Calls are throttled so that at least ``TIMER`` milliseconds pass between
        two requests.

        Raises:
            requests.RequestException: On timeout or a non-success HTTP status.
            Exception: ``(code, info)`` when the API reports an error payload.
        """
        self.calls += 1
        # time.time() is in seconds while TIMER is in milliseconds.
        elapsed_ms = (time.time() - self.last_call) * 1000.0
        wait_ms = max(0.0, self.TIMER - elapsed_ms)
        if wait_ms:
            time.sleep(wait_ms / 1000.0)
        response = requests.get(
            "https://www.wikidata.org/w/api.php", headers=self.HEADER, params=params, timeout=self.TIMEOUT
        )
        self.last_call = time.time()
        response.raise_for_status()
        result = response.json()
        if "error" in result:
            raise Exception(result["error"]["code"], result["error"]["info"])
        return result

    def search(self, query: str) -> list:
        """Search entities by text.

        Returns:
            A list of ``{"id", "label", "description"}`` dicts for every hit whose
            displayed label, displayed description or match is tagged with the
            current language; missing fields are empty strings.
        """
        params = {"action": "wbsearchentities", "search": query.strip(), "language": self.language, "format": "json"}
        hits = self._get(params).get("search") or []

        def in_language(part: Any) -> bool:
            return isinstance(part, dict) and part.get("language") == self.language

        output = []
        for item in hits:
            display = item.get("display", {})
            if (
                in_language(display.get("label"))
                or in_language(display.get("description"))
                or in_language(item.get("match"))
            ):
                output.append({key: item.get(key, "") for key in ["id", "label", "description"]})
        return output

    def _fetch(self, qid: str, depth: int = 1) -> str:
        """Make sure ``qid`` is cached at least ``depth`` levels deep.

        With ``depth > 0`` the entity's claims are rendered into the graph;
        entity-valued claims are resolved recursively with ``depth - 1``. The
        cache file is rewritten after every fetched entity.

        Returns:
            The entity label in the current language ("" if it has none).

        Raises:
            ValueError: If the API response does not contain the entity.
        """
        qid = self._normalize_qid(qid)
        known = self.cache[self.language].setdefault(qid, {})
        if known:
            deepest = max(known)
            if deepest >= depth:
                return known[deepest]["desc"]["label"]

        params = {"action": "wbgetentities", "ids": qid, "language": self.language, "format": "json"}
        result = self._get(params)
        if "entities" not in result or qid not in result["entities"] or not result["entities"][qid]:
            raise ValueError(f"No entities found for {qid}")
        hit = result["entities"][qid]

        desc = {"qid": qid, "language": self.language, "label": "", "aliases": [], "description": ""}
        label = hit.get("labels", {}).get(self.language, {})
        if "value" in label:
            desc["label"] = label["value"]
        if "aliases" in hit and self.language in hit["aliases"]:
            desc["aliases"] = [item["value"] for item in hit["aliases"][self.language] if "value" in item]
        description = hit.get("descriptions", {}).get(self.language, {})
        if "value" in description:
            desc["description"] = description["value"]

        graph = {}
        if "claims" in hit and depth > 0:
            for prop in tqdm(hit["claims"]):
                values = []
                for statement in hit["claims"][prop]:
                    # Any statement of a property may lack a datavalue, not
                    # only the first one, so each statement is checked.
                    datavalue = statement.get("mainsnak", {}).get("datavalue")
                    if datavalue is None:
                        continue
                    item = datavalue["value"]
                    if isinstance(item, dict) and item.get("id") == qid:
                        # Self reference: no need to fetch the entity again.
                        values.append(desc["label"])
                    else:
                        values.append(self._parse(item, qid, depth))
                if values:
                    graph[prop] = values

        self.cache[self.language][qid][depth] = {"desc": deepcopy(desc), "graph": deepcopy(graph)}
        self._save()
        return desc["label"]

    def _parse(self, item: Any, qid: str, depth: int) -> str:
        """Render one claim value as text.

        Args:
            item: The ``value`` of a claim's datavalue (dict or scalar).
            qid: Id of the entity the claim belongs to (kept for interface
                compatibility; unit and entity labels are resolved via ``_fetch``).
            depth: Current crawl depth; referenced entities use ``depth - 1``.

        Returns:
            Quantities as "amount [unit label]", coordinates as "lat lon", times
            trimmed of empty month/day/time parts (negative years get " BC"),
            entity references as their label, scalars via ``str``; "" otherwise.
        """
        result = ""
        if isinstance(item, dict):
            if "amount" in item:
                amount = item["amount"]
                result = amount[1:] if amount.startswith("+") else amount
                unit = item["unit"].split("/Q")[-1] if "unit" in item else ""
                if unit and unit != "1":
                    # _fetch answers from the cache when it can; depth 0 never
                    # expands claims, so a unit equal to the current entity
                    # cannot recurse endlessly.
                    unit_label = self._fetch(f"Q{unit}", 0)
                    if unit_label:
                        result = f"{result} {unit_label}"
            if "latitude" in item and "longitude" in item:
                result = f'{item["latitude"]} {item["longitude"]}'
            elif "time" in item:
                result = str(item["time"]).split("T00:00:00Z")[0].split("-00-00")[0]
                if result.startswith("+"):
                    result = result[1:]
                elif result.startswith("-") and self.language == "en":
                    # Drop the sign: "BC" already says the year is negative.
                    result = f"{result[1:]} BC"
            elif "id" in item:
                result = self._fetch(item["id"], depth - 1)
        elif isinstance(item, (str, int, float, bool)):
            result = str(item)
        return result

    def _zalgo(self, question: str) -> str:
        """Randomly distort ``question`` to imitate sloppy user input.

        Texts of at most two characters are returned unchanged. Otherwise, with
        probability 1/2, a random mix of case changes, dropped characters, one
        swapped character pair and altered trailing punctuation is applied.
        """
        if len(question) <= 2 or not self._coin():
            return question
        if self._coin():
            # make it lowercase or all caps
            question = question.upper() if self._coin() else question.lower()
        if self._coin():
            # add typo: remove characters at random
            noise = np.random.normal(0, 1, len(question))
            question = "".join(c for c, v in zip(question, noise) if v < 3.0)
        if self._coin() and len(question) > 1:
            # add typo: swap characters (needs two characters left after removal)
            n = np.random.randint(len(question) - 1)
            question = question[:n] + question[n + 1] + question[n] + question[n + 2 :]
        # question marks
        if self._coin() and question:
            if question[-1] == "?":
                if self._coin():
                    question = question[:-1]
                else:
                    question += "?" * int(np.random.randint(5))
            elif self._coin():
                question = question[:-1]
        return question

    def generate(self, qid: str, zalgo: bool = False, **kwargs) -> str:
        """Fetch ``qid`` (if needed) and return generated question/answer text.

        Args:
            qid: Entity id; lower-case ids and bare numbers are normalised.
            zalgo: Distort the questions with ``_zalgo``.
            **kwargs: Forwarded to the language specific generator.

        Raises:
            NotImplementedError: If no generator exists for ``self.language``.
        """
        qid = self._normalize_qid(qid)
        self._fetch(qid)
        if self.language == "en":
            return self._generate_en(qid=qid, zalgo=zalgo, **kwargs)
        raise NotImplementedError(f"Unknown language: {self.language}")

    @staticmethod
    def _pronouns_en(which: str) -> tuple:
        """Return the (subject, object, possessive) pronouns matching ``which``."""
        if which in ("he", "him", "his"):
            return "he", "him", "his"
        if which in ("she", "her"):
            return "she", "her", "her"
        if which in ("it", "its"):
            return "it", "it", "its"
        return "they", "them", "their"

    def _add_article_en(self, name: str) -> str:
        """Prefix ``name`` with "the" or, at random, with "a"/"an"."""
        if self._coin():
            return f"the {name}"
        if name[:1].lower() in ("a", "e", "i", "o", "u"):
            return f"an {name}"
        return f"a {name}"

    def _generate_en(
        self, qid: str, zalgo: bool = False, pronoun: str = "it", proper: bool = True
    ) -> str:  # it is a proper noun
        """Build English question/answer pairs for an already cached entity.

        Args:
            qid: Id of an entity present in the cache.
            zalgo: Distort the questions with ``_zalgo``.
            pronoun: Any form of the pronoun to use for the entity.
            proper: ``False`` if the name needs an article (common noun).

        Returns:
            A header line followed by "Q: ...\\r\\nA: ..." entries separated by
            blank lines, or "" if no known property is present in the graph.
        """
        sub, obj, pos = self._pronouns_en(pronoun)
        Q, Qp, A, Ap = self._Q_EN, self._QP_EN, self._A_EN, self._AP_EN
        assert Q.keys() == A.keys()

        qid = self._normalize_qid(qid)
        entry = self.cache[self.language][qid][self._deepest(qid)]
        desc, graph = entry["desc"], entry["graph"]
        label, aliases, description = desc["label"], desc["aliases"], desc["description"]

        props = [key for key in Q if graph.get(key)]
        if not props:
            return ""
        np.random.shuffle(props)

        # Two coin flips select one of four headers; variants that need a
        # description or aliases fall back to the plain form when those are empty.
        first = self._coin()
        second = self._coin()
        if first and second:
            header = f"Questions about {label}:"
        elif first:
            header = f"Questions and Answers on {label}, {description}:" if description else f"Questions and Answers on {label}:"
        elif second:
            header = f"Questions about {label} ({description}):" if description else f"Questions about {label}:"
        elif aliases:
            joined = ", ".join(aliases)
            header = f"Questions and Answers on {label} (also known as {joined}):"
        else:
            header = f"Questions and Answers on {label}:"
        results = [header]

        for i, key in enumerate(props):
            # Refer to the entity by its label or, if it has any, a random alias.
            name = label if (self._coin() or not aliases) else str(np.random.choice(aliases))
            if not proper:
                name = self._add_article_en(name)
            # The first question always names the entity; later ones may use pronouns.
            templates = Q[key] if (i == 0 or self._coin()) else Qp.get(key, Q[key])
            question = np.random.choice(templates).format(name=name, sub=sub, obj=obj, pos=pos)
            if zalgo:
                question = self._zalgo(question)

            values = graph[key]
            count = len(values)
            if count == 1:
                a = values[0]
            else:
                a = ", ".join(values[:-1]) + f" and {values[-1]}"
            answers = Ap[key] if (count > 1 and key in Ap) else A[key]
            answer = np.random.choice(answers).format(
                q=question, name=name, sub=sub, obj=obj, pos=pos, a=a, l=count
            )
            results.append(f"Q: {question}\r\nA: {answer[:1].upper()}{answer[1:]}")
        return "\n\n".join(results)

    # English templates, keyed by Wikidata property id.
    # question
    _Q_EN = {
        "P6": [
            "Do you know who the prime minister of {name} is?",
            "Who is the president of {name}?",
            "Who is the governor of {name}?",
        ],
        "P17": [
            "Can you tell me Which country {name} is in?",
            "Which country is {name} located in?",
            "Where is {name} located in the world?",
        ],
        "P19": ["Do you know Where {name} was born at?", "What is {name}'s place of birth?"],
        "P20": ["Can you tell me where {name} died?", "Where has {name} died?"],
        "P22": ["Do you know who {name}'s father is?", "What is {name}'s father called?", "Who is {name}'s dad?"],
        "P25": [
            "Tell me who {name}'s mother is.",
            "What is {name}'s mother called?",
            "Who is {name}'s mom?",
            "Who's {name}'s mum?",
        ],
        "P27": [
            "Do you have any information on what country {name} is from?",
            "Where is {name} from?",
            "Where does {name} originate from?",
            "What is {name}'s country of origin?",
        ],
        "P30": [
            "Do you happen to know what continent {name} is under?",
            "Which continent is {name} in?",
            "Which continent does {name} belong to?",
        ],
        "P36": [
            "Please tell me, what the capital of {name} is?",
            "What's {name}'s capital city? Thank you in advance!",
        ],
        "P37": [
            "Tell me what the official language of {name} is?",
            "What language do they speak in {name}?",
            "How do they speak in {name}?",
            "What languages they understand in {name}?",
        ],
        "P38": ["Do you know what {name}'s currency is?", "What currency do they use in {name}?"],
        "P40": [
            "List {name}'s children.",
            "Who are {name}'s children?",
            "What are the names of {name}'s children?",
            "Does {name} have children?",
            "How many children does {name} have?",
            "Does {name} have any kids?",
            "How many children does {name} have?",
        ],
        "P50": ["Give me the name of the author for {name}.", "Who wrote {name}?", "Who's the author for {name}?"],
        "P57": [
            "Do you know who directed {name}?",
            "Who directed {name}?",
            "Who is the director of {name}?",
            "{name} is directed by whom?",
        ],
        "P61": ["Do you know who invented {name}?", "Who discovered {name}?", "{name} was invented by whom?"],
        "P106": [
            "List the places {name} works at.",
            "Where does {name} work at?",
            "What is {name}'s occupation?",
            "What does {name} do?",
            "What does {name} work?",
            "Where does {name} work at?",
            "What does {name} work in?",
        ],
        "P138": [
            "Describe what {name} was named after.",
            "Do you know what {name} was named after?",
            "What was {name} named after?",
            "Who was {name} named after?",
            "Why is {name} called {name}?",
            "Why is {name} named like that?",
        ],
        "P169": [
            "Tell me who is {name} the CEO of.",
            "Who's {name} the CEO of?",
            "Which company is {name} the CEO of?",
        ],
        "P170": [
            "Tell me more about the creator of {name}.",
            "Who created {name}?",
            "Who is {name}'s creator?",
            "Who made {name}?",
            "Who is responsible for {name}?",
        ],
        "P225": [
            "Describe {name} to me in latin.",
            "What is {name}'s scientific name?",
            "What is {name}'s taxon name?",
            "How do you say {name} in latin?",
            "What is {name} in latin?",
        ],
        "P246": [
            "Tell me {name}'s formula.",
            "What is the formula for {name}?",
            "What is the chemical formula of {name}?",
            "What is the molecular formula of {name}?",
            "Which chemical element is {name}?",
            "Describe the chemical compound for {name}.",
            "What is the chemical symbol for {name}?",
        ],
        "P274": [
            "Tell me the chemical formula for {name}.",
            "What is the formula for {name}?",
            "What is the chemical formula of {name}?",
            "What is the molecular formula of {name}?",
            "Which chemical element is {name}?",
            "Describe the chemical compound for {name}.",
            "What is the chemical symbol for {name}?",
        ],
        "P275": [
            "Describe {name}'s license.",
            "What's {name}'s license?",
            "Is {name} copyrighted?",
            "Does {name} have a copyright license?",
            "What license is associated with {name}?",
        ],
        "P366": [
            "Give me use cases for {name}.",
            "What's a use-case for {name}?",
            "What is {name}'s main use case?",
            "How is {name} used?",
            "What is {name} good for?",
        ],
        "P487": [
            "Say in emoji: {name}.",
            "Is there an emoji for {name}?",
            "Which unicode character does represent {name}?",
        ],
        "P509": ["Can you tell me how {name} died?", "What did {name} die of?", "What caused {name}'s death?"],
        "P527": [
            "What are the ingredients of {name}?",
            "What are {name}s made of?",
            "What are {name}s created from?",
            "What are the parts of {name}?",
        ],
        "P569": [
            "Do you know when {name} was born?",
            "When did {name} born?",
            "When was {name} born?",
            "When is {name}'s birthday?",
        ],
        "P570": [
            "Do you have information on the date of {name}'s death?",
            "When did {name} die?",
            "Is {name} dead?",
            "Is {name} still alive?",
        ],
        "P571": [
            "Do you have information on when {name} was first created? Thanks!",
            "When was {name} created?",
            "When was {name} first released?",
        ],
        "P575": [
            "Please tell me when {name} was first discovered. Thank you!",
            "When was {name} invented?",
            "What was the date when {name} was finally discovered?",
        ],
        "P576": [
            "Do you know when {name} was discontinued?",
            "When was {name} demolished?",
            "At what time was {name} dissolved?",
        ],
        "P580": [
            "Can you recall when {name} started?",
            "When did {name} start?",
            "What was the starting date for {name}?",
            "When did {name} break out?",
        ],
        "P582": [
            "Do you have information on the date when {name} ended?",
            "When did {name} end?",
            "What was the ending date of {name}?",
            "When was {name} finally over?",
        ],
        "P625": [
            "Give me the coordinates for {name}!",
            "Locate {name}.",
            "What is {name}'s location?",
            "Where can I find {name}?",
            "What are the GPS coordinates for {name}?",
        ],
        "P837": [
            "Tell me when {name} is!",
            "When is {name} celebrated?",
            "On which day is {name}?",
            "When is {name} day?",
        ],
        "P856": [
            "Give me the URL for {name}.",
            "What's the URL for {name}? Thanks!",
            "What's {name}'s website?",
            "What is the official website for {name}?",
            "Can you tell me the link to {name}?",
        ],
        "P973": [
            "Return the URL for {name}!",
            "Where can I find more information on {name}?",
            "Where can I read more about {name} online?",
            "Is there a site that explains {name} in detail?",
        ],
        "P1082": [
            "Count the number of people who live in {name}!",
            "What is {name}'s population?",
            "What is the population of {name}?",
            "How many people live in {name}?",
        ],
        "P1120": [
            "Do you know the number of people who died in {name}?",
            "How many people have died due to {name}?",
            "How many people have lost their lives in {name}?",
            "What is the number of fatalities after {name}?",
            "How many people have lost their lives in {name}?",
        ],
        "P2043": ["Calculate the length of {name}!", "How long is {name}?", "What is {name}'s length?"],
        "P2044": [
            "Do you know how tall {name} is?",
            "How tall is {name}?",
            "How high is {name}?",
            "How many meters is {name} above sea level?",
            "What is {name}'s elevation?",
        ],
        "P2046": ["Is {name} big?", "How big is {name}?", "What is the area of {name}?", "How big is {name}?"],
        "P2049": [
            "Describe the width of {name}.",
            "What's {name}'s width?",
            "How wide is {name}?",
            "What's the width of {name}?",
        ],
        "P2250": [
            "Do you know how long {name} lives?",
            "What is the life expectancy of {name}?",
            "How long do {name}s live?",
        ],
        "P2283": [
            "Describe {name} in detail.",
            "How does {name} work?",
            "What makes {name} work in theory? Thanks for the answer!",
        ],
        "P3063": ["I need information on the gestation period of {name}s.", "How long are {name}s pregnant?"],
        "P3373": [
            "List {name}'s siblings.",
            "Who are {name}'s siblings?",
            "What are the names of {name}'s brothers and sisters?",
            "Does {name} have any siblings?",
            "Does {name} have a brother or sister?",
            "How many siblings does {name} have?",
            "How many brothers and sisters does {name} have?",
        ],
        "P4511": ["Calculate the depth of {name}!", "How deep is {name}?", "What is {name}'s vertical depth?"],
        "P4733": [
            "Do you know the noise {name} makes?",
            "What does {name} say?",
            "What sound does {name} make?",
            "How does {name} sound like?",
        ],
        "P7767": ["How would you serve {name} for me?", "How are {name}s served?"],
    }

    # reference to name in question
    _QP_EN = {
        "P6": ["Tell me who {pos} governor is!", "Who's the governor?", "Who is {pos} president?"],
        "P17": [
            "Do you know which country is that in?",
            "Which country is {sub} in?",
            "Under which country is {sub} located?",
        ],
        "P19": [
            "Can you tell me the place {sub} was born at? Thanks!",
            "Where was {sub} born?",
            "What is {pos} place of birth?",
        ],
        "P20": [
            "Where did {sub} die, can you tell me that?",
            "Where did {sub} die?",
            "What is the place of {pos} death?",
        ],
        "P22": [
            "Who is {pos} father, respond with his name. Thank you.",
            "What is {pos} father called?",
            "Who is {pos} dad?",
            "What's {pos} father's name?",
        ],
        "P25": [
            "Who is {pos} mother, respond with her name.",
            "What is {pos} mother called?",
            "Who is {pos} mom?",
            "Who's {pos} mum?",
            "What's {pos} mother's name?",
        ],
        "P27": [
            "Can you tell me where {sub} came from?",
            "Where is {sub} from?",
            "Where does {sub} come from?",
            "Where does {sub} originate from?",
        ],
        "P30": [
            "Tell me the name of the continent {sub} is in.",
            "Which continent is {sub} in?",
            "Which continent does {sub} belong to?",
        ],
        "P36": ["Do you know {pos} capital?", "What is {pos} capital called?", "What's the name of {pos} capital?"],
        "P37": [
            "Describe {pos} official language.",
            "What is {pos} official language?",
            "What language do they speak there?",
        ],
        "P38": [
            "Tell me more about {pos} currency.",
            "What is {pos} currency?",
            "Which currencies are used there?",
        ],
        "P40": [
            "I need more information on {pos} children.",
            "Who are {pos} children?",
            "What are the names of {pos} kids?",
            "How many children does {sub} have?",
            "Does {sub} have kids?",
            "Does {sub} have any children?",
            "How many kids {sub} got?",
        ],
        "P50": [
            "Please, describe {pos} author.",
            "Who wrote {obj}?",
            "Who's {pos} author?",
            "Who {pos} author is?",
        ],
        "P57": [
            "Who is {obj} director, do you have information on that in your database?",
            "Who directed {obj}?",
            "Who is {pos} director?",
        ],
        "P61": ["Tell me who invented {obj}!", "Do you know who discovered {obj} first?"],
        "P106": [
            "Do you have data on {pos} jobs?",
            "Where does {sub} work at?",
            "What does {sub} do for a living?",
            "What's {pos} job?",
            "What is {pos} occupation?",
        ],
        "P138": [
            "Explain how {sub} got {pos} name!",
            "How did {sub} get {pos} name?",
            "Where did {sub} get {pos} name from?",
            "Why is {sub} called {name}?",
        ],
        "P169": [
            "Give me information on the companies {sub} is the CEO at.",
            "Is {sub} the CEO of a company?",
            "Which company is {sub} the CEO of?",
        ],
        "P170": [
            "Who made {obj}, can you tell me?",
            "Who created {obj}?",
            "Who is {pos} creator?",
            "Who made {obj}?",
        ],
        "P225": [
            "Translate {pos} name to latin.",
            "What is {pos} scientific name?",
            "How do you call {obj} in latin?",
            "How to say {obj} in latin?",
        ],
        "P246": [
            "Give me {pos} formula.",
            "What is {pos} formula?",
            "What is {pos} chemical formula?",
            "What is {pos} molecular formula?",
            "Which chemical element is {sub}?",
            "Describe the chemical compound for {obj}.",
            "What is the chemical symbol for {obj}?",
        ],
        "P274": [
            "Describe {pos} chemical formula!",
            "What is {pos} formula?",
            "What is {pos} chemical formula?",
            "What is {pos} molecular formula?",
            "Which chemical element is {sub}?",
            "Describe the chemical compound for {obj}.",
            "What is the chemical symbol for {obj}?",
        ],
        "P275": [
            "Do you know which license {pos} is under?",
            "What's {pos} license?",
            "Is {sub} copyrighted?",
            "What license was {sub} released under?",
        ],
        "P366": [
            "And {pos} use cases are?",
            "What is {pos} main use case?",
            "How is {sub} used?",
            "What is {sub} good for?",
            "What does {sub} do?",
        ],
        "P487": [
            "Write {obj} down using emojis only.",
            "Does {sub} have an emoji?",
            "Is there a unicode character for {sub}?",
        ],
        "P509": [
            "Can you tell me what {sub} died of?",
            "What did {sub} die of?",
            "What was the cause of {pos} death?",
        ],
        "P527": [
            "List {pos} parts.",
            "What are {pos} ingredients?",
            "What are they made of?",
            "What are their parts?",
        ],
        "P569": ["{pos} birthday is?", "When did {sub} born?", "When was {sub} born?", "When is {pos} birthday?"],
        "P570": ["Is {sub} dead?", "When did {sub} die?", "Is {sub} dead?", "Did {sub} die?"],
        "P571": [
            "Do you know the date of {pos} inception?",
            "When was {sub} first released?",
            "And when was {sub} actually created?",
        ],
        "P575": ["Tell me the date of {pos} discovery!", "When was {sub} invented then?"],
        "P576": [
            "Can you tell me the date {sub} was finally discontinued?",
            "When was {sub} demolished?",
            "At what time was {sub} dissolved?",
        ],
        "P580": ["Write down the exact date {sub} started!", "When did {sub} start?", "When did {sub} break out?"],
        "P582": ["Write down the exact date {sub} ended.", "When was {sub} finally over?", "When did {sub} end?"],
        "P625": ["I need {pos} GPS location!", "What is {pos} GPS location?", "What are {pos} coordinates?"],
        "P837": ["When would you celebrate {obj}?", "When is {sub} celebrated?", "On which day is {sub}?"],
        "P856": [
            "Send me {pos} web address.",
            "What's the address of {pos} website?",
            "What is {pos} official website?",
            "Can you tell me the link to {obj}?",
        ],
        "P973": [
            "Can you give me more information on {obj}?",
            "Where can I find more info on {obj}?",
            "Where can I read more about {obj} online?",
            "Is there a site that explains {obj} in detail?",
        ],
        "P1082": ["Estimate {pos} population.", "How many people live there?", "How large is {pos} population?"],
        "P1120": [
            "Estimate the number of people who died in the event!",
            "How many people died?",
            "How deadly was {sub}?",
            "How many fatalities were there?",
        ],
        "P2043": ["Do you know {pos} length?", "How long is {sub}?", "What's the length of {obj}?"],
        "P2044": [
            "Is {sub} tall?",
            "How tall is {sub}?",
            "How high is {sub}?",
            "Is {name} above sea level?",
            "What is {pos} elevation?",
        ],
        "P2046": ["Provide information on {pos} area.", "How big is {sub}?", "How big is {pos} area?"],
        "P2049": ["Calculate {pos} width!", "How wide is {sub}?", "What is {pos} width?"],
        "P2250": ["Can you tell me how long {sub} live?", "What is {pos} life expectancy?", "How long they live?"],
        "P2283": [
            "Tell me how {sub} works under the hood.",
            "Do you know how {sub} works?",
            "How does {sub} work under the hood?",
            "How does {obj} run?",
        ],
        "P3063": ["So how long is their gestation period?", "How long do they stay pregnant?"],
        "P3373": [
            "List {pos} siblings please.",
            "Who are {pos} brothers and sisters?",
            "What are the names of {pos} siblings?",
            "Does {sub} have any siblings?",
            "Does {sub} have a brother or sister?",
            "How many siblings does {sub} have?",
            "How many brothers and sisters does {sub} have?",
        ],
        "P4511": ["Do you know if {sub} is really deep?", "Is {sub} deep?", "How deep is {sub} really?"],
        "P4733": ["Mimic {pos} sound!", "What sound does {sub} make?", "How does {sub} sound like?"],
        "P7767": ["Would you serve {obj} hot or cold?", "How are they usually served?"],
    }

    # single answer
    _A_EN = {
        "P6": ["{name}'s president is {a}.", "{name}'s prime minister is {a}."],
        "P17": ["{name} is located in {a}.", "{name} is found in the country of {a}."],
        "P19": ["{name} was born in {a}.", "{sub} was born in {a}.", "In {a}."],
        "P20": ["{name} died in {a}.", "{sub} died in {a}."],
        "P22": ["{name}'s father is {a}.", "{pos} father is called {a}.", "His name is {a}.", "It's {a}."],
        "P25": ["{name}'s mother is {a}.", "{pos} mother is called {a}.", "Her name is {a}.", "It's {a}."],
        "P27": ["{name} originates from {a}.", "{sub} is from {a}.", "{name} comes from {a}."],
        "P30": ["{name} is part of {a}.", "{name} is part of the continent of {a}."],
        "P36": ["{name}'s capital city is {a}.", "{pos} capital is called {a}.", "The capital of {name} is {a}."],
        "P37": ["The official language of {name} is {a}.", "The people in {name} speak {a}."],
        "P38": [
            "{name}'s currency is the {a}.",
            "{name} uses {a} as their currency.",
            "The currency of {name} is the {a}.",
        ],
        "P40": [
            "{name} has one child named {a}.",
            "{name} has a single child named {a}.",
            "{sub} has a child named {a}.",
        ],
        "P50": ["{name} was written by {a}.", "The author of {name} is {a}.", "{a} is {pos} author."],
        "P57": ["{name} was directed by {a}.", "{sub} was directed by {a}."],
        "P61": ["{name} was discovered by {a}.", "{sub} was discovered by {a}."],
        "P106": ["{name} works at {a}.", "{sub} works at {a}.", "{name} job title is {a}."],
        "P138": ["{name} was named after {a}.", "{name} got {pos} name from {a}.", "{pos} name comes from {a}."],
        "P169": ["{name} is the CEO of {a}.", "{sub} is the CEO of {a}."],
        "P170": [
            "{sub} was created by {a}.",
            "{a} created {name}.",
            "The creator of {name} is {a}.",
            "{a} made {obj}.",
            "{sub} was created by {a}.",
        ],
        "P225": ["{name} is called {a} in latin.", "The scientific term for {name} is {a}."],
        "P246": ["The element of {name} is {a}.", "The symbol for {name} is {a}."],
        "P274": ["The formula for {name} is {a}.", "The chemical formula of {name} is {a}."],
        "P275": [
            "{name} has the following license: {a}.",
            "{name} has a {a} license associated with {obj}.",
            "{name} was released under {a}.",
            "{sub} is licensed under {a}.",
        ],
        "P366": [
            "{name} is most commonly used for {a}.",
            "{sub} is used mostly for {a}.",
            "{name} is mostly known for {a}.",
        ],
        "P487": ["{a}", "The {name} emoji is {a}.", "The {a} character represents {name}."],
        "P509": ["{name} died of {a}.", "The cause of {pos} death was {a}."],
        "P527": ["{name} are made of {a}.", "They are made of {a}."],
        "P569": ["{name} was born on {a}.", "{pos} birthday is on the {a}."],
        "P570": ["{name} died at {a}", "{sub} died in {a}."],
        "P571": [
            "{name} was created in {a}.",
            "The date of {pos} inception is {a}.",
            "{name} was first released in {a}.",
        ],
        "P575": ["{name} was invented at {a}.", "{name} was discovered in {a}."],
        "P576": [
            "{name} was discontinued after {a}.",
            "{name} was demolished by {a}.",
            "{sub} got dissolved at {a}.",
        ],
        "P580": ["{name} started in {a}.", "{name} first started at {a}."],
        "P582": ["{name} ended in {a}.", "{name} lasted until {a}."],
        "P625": ["{name} is located at {a}.", "The coordinates for {name} are {a}.", "{pos} GPS location is {a}."],
        "P837": ["{name} is celebrated on {a}.", "{name} is on {a}."],
        "P856": [
            "The URL for {name} is: {a}",
            "See {a}",
            "The URL of {pos} website is {a}",
            "{pos} web address is: {a}",
        ],
        "P973": [
            "You can find out more at {a}",
            "Here's a link on {name}: {a}",
            "You can find out more about {obj} on {a}",
        ],
        "P1082": [
            "{name}'s population is {a}.",
            "Around {a} people live in {name}.",
            "{pos} population is estimated to be around {a}.",
        ],
        "P1120": [
            "The number of deaths was {a}.",
            "The number of fatalities was {a}.",
            "{a} died due to {name}.",
            "{name} has taken the lives of {a}.",
        ],
        "P2043": ["{name} is {a} long.", "{sub} has a length of {a}."],
        "P2044": ["{name} is {a} tall.", "{name} is {a} above sea level.", "{pos} elevation is {a}."],
        "P2046": ["{name}'s area is {a}", "{pos} area is {a}."],
        "P2049": ["{name}'s width is {a}.", "{name} is {a} wide."],
        "P2250": ["{name} have a life expectancy of {a}.", "{pos} life expectancy is about {a}."],
        "P2283": [
            "{name} uses {a} to work.",
            "{sub} works via {a}.",
            "{name} works through {a}.",
            "{sub} makes use of {a}.",
        ],
        "P3063": [
            "The gestation period for {name}s is {a}.",
            "The amount of time needed for their gestation period is known to be {a}.",
        ],
        "P3373": ["{name} has a sibling called {a}.", "{sub} has a sibling named {a}."],
        "P4511": ["{name} has a depth of {a}.", "{name} can be as deep as {a}.", "{pos} vertical depth is {a}."],
        "P4733": ["{name} makes the following sound: {a}", "{name} makes a {a} sound.", "The {name} says {a}."],
        "P7767": ["{name}s are served {a}.", "{name} is usually served {a}."],
    }

    # plural / multiple answers
    _AP_EN = {
        "P6": ["The governors of {name} are {a}.", "The ministers of {name} are {a}."],
        "P37": ["The official languages of {name} are {a}.", "They speak {a}."],
        "P38": [
            "{name} accepts {a}.",
            "{name} uses {a} as their country's currencies.",
            "The currencies of {name} are {a}.",
        ],
        "P40": [
            "{name} has {l} children: {a}.",
            "The number of children {name} has is {l}. Their names are {a}.",
            "{pos} {l} children are {a}.",
        ],
        "P50": ["{name} was co-written by {a}.", "The authors of {name} are {a}."],
        "P57": ["{name} was directed by the following people: {a}.", "{a} were the directors of {name}."],
        "P61": ["{pos} inventors are {a}.", "{name} was discovered by {a}."],
        "P106": ["{name} has multiple occupations: {a}.", "{name}'s job titles are: {a}."],
        "P169": ["{name} is the CEO of multiple companies, such as {a}.", "{sub} is the CEO at {a}."],
        "P225": ["The taxon names for {name} are {a}.", "The proper scientific terms for {name} are {a}."],
        "P246": ["The elements of {name} are {a}.", "The symbols for {name} are {a}."],
        "P274": ["The formulas for {name} are {a}.", "The chemical formulas of the compound {name} are {a}."],
        "P487": ["The {name} emojis are {a}.", "The characters {a} represent {name}."],
        "P527": ["The ingredients of {name} are {a}.", "{a} are all parts needed for {name}."],
        "P575": [
            "Sources disagree on the exact date, it is said that {name} was invented in {a}.",
            "{name} was discovered multiple times at {a}.",
        ],
        "P856": ["The URLs for {name} are: {a}", "See {a}", "The URLs of {pos} website are {a}"],
        "P625": [
            "{name} can be found under the following GPS locations: {a}.",
            "The coordinates for {name} are {a}.",
        ],
        "P973": ["You can find out more at {a}", "You can find out more about {obj} at {a}"],
        "P1120": [
            "There are multiple sources on the number of fatalities: {a}",
            "{name} is known to take the lives of somewhere between {a}.",
        ],
        "P1082": [
            "There are multiple sources on {pos} population: {a}.",
            "There are different sources on {name}'s population: {a}.",
        ],
        "P2046": ["{name}'s area has changed over time: {a}", "{pos} area has altered over the ages to {a}."],
        "P3373": [
            "{name} has {l} siblings: {a}.",
            "The number of brothers and sisters {name} has is {l}. Their names are {a}.",
        ],
        "P4733": ["{name} makes sounds like {a}.", "The sounds {sub} often makes are {a}."],
    }
# Demo: build a crawler whose fetched graph is cached in cache.tmp.
wg = WikiGraph(file="cache.tmp")  # will save cached graph to cache.tmp
# search for QID
wg.search("chatgpt")
# Each example pairs an entity id with the extra options passed to generate().
examples = (
    ("Q115564437", {}),  # chatgpt
    ("Q5284", {"pronoun": "he"}),  # bill gates
    ("Q1781", {"zalgo": True}),  # budapest
    ("Q6663", {"proper": False}),  # hamburger
)
for example_qid, options in examples:
    print(wg.generate(qid=example_qid, **options))