examples/01_prepare_data/wikidata_knowledge_graph.ipynb
Many recommendation algorithms (DKN, RippleNet, KGCN) use Knowledge Graphs (KGs) as an external source of information. One of the bottlenecks when benchmarking these algorithms is that the original papers used Microsoft Satori as the KG. Since Satori is not open source, the published results cannot be replicated. The solution is to use an open source KG instead.
The goal of this notebook is to show how to query Wikipedia and Wikidata to extract a Knowledge Graph that can be used with the algorithms mentioned above.
The steps covered are:
- Find the Wikidata entity ID for each name in a list of entities.
- Query the entities linked to each ID and collect the results as an edge list.
- Build and visualize the Knowledge Graph with networkx.
- Apply the same pipeline to MovieLens movie titles using the search_wikidata() helper.
import sys
import logging
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from recommenders.datasets import movielens
from recommenders.datasets.wikidata import (search_wikidata,
find_wikidata_id,
query_entity_links,
read_linked_entities,
query_entity_description)
from recommenders.utils.notebook_utils import store_metadata
print(f"System version: {sys.version}")
# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = "100k"
MOVIELENS_SAMPLE = True
MOVIELENS_SAMPLE_SIZE = 10
logging.basicConfig(format="%(asctime)s %(levelname)-8s %(message)s",
level=logging.WARNING, # Set to logging.ERROR for less verbose
datefmt="%Y-%m-%d %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)]
)
names = ["The Godfather", "Al Pacino", "Tom Hanks", "Forrest Gump", "Julia Roberts", "", "My Best Friend's Wedding"]
%%time
# the following code has been wrapped in a helper function called search_wikidata()
# it is shown here to illustrate which calls are being made to the Wikipedia and Wikidata APIs
# capture results as a list of dicts and convert to a DataFrame at the end (faster than appending to a DataFrame row by row)
results_list = []
for idx, name in enumerate(names):
    # first get the Wikidata entity_id for each name
    entity_id = find_wikidata_id(name)
    if entity_id == "entityNotFound":
        continue

    # next we query Wikidata to get the links of that entity
    json_links = query_entity_links(entity_id)

    # the following function extracts the related entities from the links
    related_links = read_linked_entities(json_links)

    # now we can construct a connection in our graph between two entities
    for related_entity, related_name in related_links:
        result = dict(
            name=name,
            original_entity=entity_id,
            linked_entities=related_entity,
            name_linked_entities=related_name,
        )
        results_list.append(result)
results_list = pd.DataFrame(results_list)
results_list.head()
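query_entity_description() is imported above but not used in this loop. A minimal sketch of how it might be used to attach a short text description to each original entity, assuming it takes a Wikidata entity ID and returns a description string (or a not-found sentinel):
# Sketch: fetch a short description for each entity found above.
# Assumes query_entity_description(entity_id) returns a text description for a Wikidata ID.
descriptions = {
    entity_id: query_entity_description(entity_id)
    for entity_id in results_list["original_entity"].unique()
}
descriptions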
# build an undirected graph from the entity pairs
G = nx.from_pandas_edgelist(results_list, "original_entity", "linked_entities")

# map each entity ID to a human-readable name for plotting
target_names = results_list[["linked_entities", "name_linked_entities"]].drop_duplicates().rename(
    columns={"linked_entities": "labels", "name_linked_entities": "name"}
)
source_names = results_list[["original_entity", "name"]].drop_duplicates().rename(columns={"original_entity": "labels"})
names = pd.concat([target_names, source_names])
names = names.set_index("labels")
names = names.to_dict()["name"]
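As a quick check, the mapping can be used to recover the readable name of any node in the graph, for example the first original entity:
# Example: look up the readable name of the first original entity
example_id = results_list["original_entity"].iloc[0]
print(example_id, "->", names[example_id])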
plt.figure(figsize=(12,12))
pos = nx.spring_layout(G)
nx.draw(G, pos, node_size=60, font_size=9, width=0.2)
nx.draw_networkx_labels(G, pos, names, font_size=9)
plt.show()
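If the graph is to be fed into one of the KG-based recommenders mentioned above, a common starting point is a file of (head, relation, tail) triples. A minimal sketch follows; the exact format expected by DKN, RippleNet or KGCN may differ, and both the relation label and the file name are made-up placeholders.
# Sketch: export the edges as (head, relation, tail) triples with a single
# placeholder relation; "wikidata_triples.tsv" is a hypothetical file name.
triples = results_list[["original_entity", "linked_entities"]].copy()
triples.insert(1, "relation", "linked_to")
triples.to_csv("wikidata_triples.tsv", sep="\t", header=False, index=False)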
# Obtain Movie Title - ItemId pairs from MovieLens
df = movielens.load_pandas_df(
MOVIELENS_DATA_SIZE,
("UserId", "ItemId", "Rating", "Timestamp"),
title_col="Title",
genres_col="Genres",
year_col="Year",
)
movies = df[["Title", "ItemId"]].drop_duplicates().reset_index()
movies["Title"][0:5]
# For notebook testing
if MOVIELENS_SAMPLE:
movies = movies.head(MOVIELENS_SAMPLE_SIZE)
movies.shape
%%time
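# appending " film" to each title helps disambiguate the Wikipedia search
# (e.g. a movie vs. a novel or a band with the same name)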
names = [t + " film" for t in movies["Title"]]
result = search_wikidata(names, extras=movies[["Title", "ItemId"]].to_dict())
result.head()
result["Title"].value_counts()
# result.to_csv("movielens_" + MOVIELENS_DATA_SIZE + '_wikidata.csv', index = False)
number_movies = len(result["Title"].unique())
print(f"Number of movies: {number_movies}")
# Record results for tests - ignore this cell
store_metadata("length_result", number_movies)