examples/00_quick_start/embdotbias_movielens.ipynb
<i>Copyright (c) Recommenders contributors.</i>
<i>Licensed under the MIT License.</i>
This notebook shows how to use the EmbeddingDotBias model — similar to FastAI's EmbeddingDotBias, but implemented directly in PyTorch. It creates an embedding for the users and the items.
# Suppress all warnings
import warnings
warnings.filterwarnings("ignore")
import os
import sys
import logging
import numpy as np
import pandas as pd
import torch
from tempfile import TemporaryDirectory
from recommenders.utils.constants import (
DEFAULT_USER_COL as USER,
DEFAULT_ITEM_COL as ITEM,
DEFAULT_RATING_COL as RATING,
DEFAULT_TIMESTAMP_COL as TIMESTAMP,
DEFAULT_PREDICTION_COL as PREDICTION
)
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import (exp_var, mae, map,
ndcg_at_k,
precision_at_k,
recall_at_k, rmse,
rsquared)
from recommenders.models.embdotbias.data_loader import RecoDataLoader
from recommenders.models.embdotbias.model import EmbeddingDotBias
from recommenders.models.embdotbias.training_utils import (Trainer,
predict_rating)
from recommenders.models.embdotbias.utils import cartesian_product, score
from recommenders.utils.notebook_utils import store_metadata
from recommenders.utils.timer import Timer
# Emit INFO-level log records in a compact "LEVEL - message" format.
logging.basicConfig(level=logging.INFO, format="%(levelname)s - %(message)s")

# Report the runtime environment (useful when comparing notebook runs).
environment_report = (
    f"System version: {sys.version}",
    f"Pandas version: {pd.__version__}",
    f"PyTorch version: {torch.__version__}",
    f"CUDA Available: {torch.cuda.is_available()}",
    f"CuDNN Enabled: {torch.backends.cudnn.enabled}",
)
print("\n".join(environment_report))
Defining some constants to refer to the different columns of our dataset.
# top k items to recommend
TOP_K = 10
# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = "100k"
# Model parameters
N_FACTORS = 40  # dimensionality of the user/item embedding vectors
EPOCHS = 7  # number of full passes over the training data
SEED = 101  # random seed for reproducible splits and initialization
# Download the MovieLens ratings and load them as a DataFrame with the
# canonical column names.
ratings_df = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE,
    header=[USER, ITEM, RATING, TIMESTAMP],
)

# Cast the ID columns to strings so raw IDs are not confused with the
# integer embedding indices used internally by the model.
for id_col in (USER, ITEM):
    ratings_df[id_col] = ratings_df[id_col].astype("str")

ratings_df.head()
# Stratified split by item: 75% of each item's ratings go to train/validation,
# the remaining 25% to test.
train_valid_df, test_df = python_stratified_split(
    ratings_df,
    ratio=0.75,
    min_rating=1,
    filter_by="item",
    col_user=USER,
    col_item=ITEM,
    seed=SEED,
)
train_valid_df

# Drop test rows for "cold" users the model never sees during training.
known_users = train_valid_df[USER]
test_df = test_df[test_df[USER].isin(known_users)]
# Seed numpy and torch (CPU and all CUDA devices) so runs are reproducible.
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Build the training/validation data loaders, holding out 10% of the
# train_valid rows for validation.
data = RecoDataLoader.from_df(
    train_valid_df,
    user_name=USER,
    item_name=ITEM,
    rating_name=RATING,
    valid_pct=0.1,
)
data.show_batch()
We will be using 40 latent factors. This will create an embedding for the users and the items that will map each of these to 40 floats as can be seen below. Note that the embedding parameters are not predefined, but are learned by the model.
Although ratings only range from 1 to 5, we set the model's output range to 0–5.5 — the extra headroom lets the model predict values near the extremes of 1 and 5, which improves accuracy. Lastly, we set a value for weight-decay for regularization.
# Construct the dot-product-with-bias embedding model. The y_range of
# [0, 5.5] gives the sigmoid output headroom around the 1-5 rating scale.
model = EmbeddingDotBias.from_classes(
    n_factors=N_FACTORS,
    classes=data.classes,
    user=USER,
    item=ITEM,
    y_range=[0, 5.5],
)
Now train the model for 7 epochs, setting the maximal learning rate. The trainer reduces the learning rate with each epoch using cosine annealing.
# Fit the model, timing the whole training run.
trainer = Trainer(model=model)
with Timer() as train_time:
    trainer.fit(data.train, data.valid, EPOCHS)
print(f"Took {train_time} seconds for training.")
Save the trained model so it can be loaded back later for inference / generating recommendations.
# Persist the trained weights (state dict only) into a temporary directory.
tmp = TemporaryDirectory()
model_path = os.path.join(tmp.name, "embdotbias_model.pth")
torch.save(model.state_dict(), model_path)
print(f"Model saved to: {model_path}")
Load the model back from disk.
# Re-create the model architecture; the weights are filled in from the
# checkpoint below. The constructor arguments must match the saved model.
loaded_model = EmbeddingDotBias.from_classes(
    n_factors=N_FACTORS,
    classes=data.classes,
    user=USER,
    item=ITEM,
    y_range=[0, 5.5],
)
# Load the state dictionary. weights_only=True (torch >= 1.13) restricts
# unpickling to tensors/primitives, preventing arbitrary code execution
# from a tampered checkpoint file; state dicts load fine under it.
loaded_model.load_state_dict(torch.load(model_path, weights_only=True))
# Set the model to evaluation mode (disables training-only behavior such
# as dropout, if the model uses any).
loaded_model.eval()
print("Model loaded successfully.")
Get all users and items that the model knows
# All user and item IDs the model was trained on. Index 0 is skipped —
# it appears to be a reserved placeholder entry for unknown IDs
# (NOTE(review): confirm against RecoDataLoader's class encoding).
total_users = loaded_model.classes[USER][1:]
total_items = loaded_model.classes[ITEM][1:]
Get all users from the test set and remove any users that were not known in the training set
# Restrict evaluation to test users the model has actually seen;
# intersect1d also deduplicates, so no separate unique() pass is needed.
test_users = np.intersect1d(test_df[USER].unique(), total_users)
Example prediction
# Take the first (user, item) pair from the first training batch and map
# the integer embedding indices back to the original string IDs.
first_batch = next(iter(data.train))
user_idx = first_batch[0][0, 0].item()
user_id = data.classes[USER][user_idx]
item_idx = first_batch[0][0, 1].item()
item_id = data.classes[ITEM][item_idx]
print(f"User ID: {user_id}, Item ID: {item_id}")

# Predict a single rating. (A previous, unused `loaded_model.weight(...)`
# lookup was removed: its result was never read, and it passed an item ID
# into a user-weight lookup.)
try:
    predicted_rating = predict_rating(loaded_model, user_id, item_id)
    print(f"Predicted rating for user {user_id} and item {item_id}: {predicted_rating}")
except KeyError as e:
    # predict_rating raises KeyError for IDs the model does not know.
    print(f"Error: {e}")
Build the cartesian product of test set users and all items known to the model
# Every (test user, known item) combination as a two-column DataFrame.
users_items = pd.DataFrame(
    cartesian_product(np.array(test_users), np.array(total_items)),
    columns=[USER, ITEM],
)
users_items

# Keep only pairs absent from the training set: after a left merge the
# rating column is NaN exactly when the pair was never seen in training.
merged = pd.merge(users_items, train_valid_df.astype(str), on=[USER, ITEM], how="left")
users_items_candidates = merged[merged[RATING].isna()][[USER, ITEM]]
users_items_candidates
# Predict a rating for every remaining candidate pair.
top_k_scores = score(
    loaded_model,
    test_df=users_items_candidates,
    user_col=USER,
    item_col=ITEM,
    prediction_col=PREDICTION,
)
top_k_scores
Calculate some metrics for our model
# Ranking metrics at k=TOP_K. All four metrics share the same column and
# relevancy configuration, so define it once.
ranking_kwargs = dict(
    col_user=USER,
    col_item=ITEM,
    col_rating=RATING,
    col_prediction=PREDICTION,
    relevancy_method="top_k",
    k=TOP_K,
)
# NOTE: `map` here is the recommenders MAP metric, which shadows the builtin.
eval_map = map(test_df, top_k_scores, **ranking_kwargs)
eval_ndcg = ndcg_at_k(test_df, top_k_scores, **ranking_kwargs)
eval_precision = precision_at_k(test_df, top_k_scores, **ranking_kwargs)
eval_recall = recall_at_k(test_df, top_k_scores, **ranking_kwargs)

print(
    "Model:\t\t" + model.__class__.__name__,
    "Top K:\t\t%d" % TOP_K,
    "MAP:\t\t%f" % eval_map,
    "NDCG:\t\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
    sep='\n',
)
The above numbers are lower than SAR's, but that is expected, since the model explicitly tries to generalize users and items through the latent factors. Next, look at how well the model predicts the rating a user would give a movie; for that, we only need to score the user-item pairs in test_df.
# Score only the actual test-set user-item pairs (rating prediction).
scores = score(
    model,
    test_df=test_df,
    user_col=USER,
    item_col=ITEM,
    prediction_col=PREDICTION,
)
Now calculate some regression metrics
# Regression metrics comparing predicted and actual test ratings; the
# column configuration is shared by all four metrics.
rating_kwargs = dict(col_user=USER, col_item=ITEM, col_rating=RATING, col_prediction=PREDICTION)
eval_r2 = rsquared(test_df, scores, **rating_kwargs)
eval_rmse = rmse(test_df, scores, **rating_kwargs)
eval_mae = mae(test_df, scores, **rating_kwargs)
eval_exp_var = exp_var(test_df, scores, **rating_kwargs)

print(
    "Model:\t\t\t" + model.__class__.__name__,
    "RMSE:\t\t\t%f" % eval_rmse,
    "MAE:\t\t\t%f" % eval_mae,
    "Explained variance:\t%f" % eval_exp_var,
    "R squared:\t\t%f" % eval_r2,
    sep='\n',
)
That RMSE is competitive in comparison with other models.
# Record results for tests - ignore this cell
for metric_name, metric_value in (
    ("map", eval_map),
    ("ndcg", eval_ndcg),
    ("precision", eval_precision),
    ("recall", eval_recall),
    ("rmse", eval_rmse),
    ("mae", eval_mae),
    ("exp_var", eval_exp_var),
    ("rsquared", eval_r2),
    ("train_time", train_time.interval),
):
    store_metadata(metric_name, metric_value)