example/recommenders/demo1-MF.ipynb
Demonstrates matrix factorization with MXNet on the MovieLens 100k dataset. We perform collaborative filtering, where the recommendations are based on users' previous ratings.
We try to learn embeddings for users and movies, based on users' partial ratings of movies, in order to estimate future movie ratings.
For more deep learning based architecture for recommendation, refer to this survey: Deep Learning based Recommender System: A Survey and New Perspectives
import matplotlib.pyplot as plt
import mxnet as mx
from mxnet import gluon, np, npx, autograd
import numpy as onp
from matrix_fact import train
from movielens_data import get_dataset, max_id
# Use the first GPU when one is available, otherwise fall back to CPU.
ctx = [mx.gpu(0)] if mx.device.num_gpus() > 0 else [mx.cpu()]
batch_size = 128
# Train/test splits come from the project-local movielens_data helper.
train_dataset, test_dataset = get_dataset()
# Highest user id and item id in the raw ratings file — used below to size the
# embedding tables.
max_user, max_item = max_id('./ml-100k/u.data')
# Notebook cell output: show the dataset dimensions.
(max_user, max_item)
# 'rollover' carries an epoch's leftover samples into the next epoch so every
# training batch has exactly batch_size rows.
train_data = gluon.data.DataLoader(train_dataset, shuffle=True, last_batch='rollover', batch_size=batch_size, num_workers=0)
test_data = gluon.data.DataLoader(test_dataset, shuffle=True, batch_size=batch_size, num_workers=0)
# Peek at one (user, item, rating) triple from the test loader.
# NOTE: `user` and `item` keep the batch drawn here; they are reused further
# down as example inputs for the net.summary() calls.
for user, item, score in test_data:
    print(user[0], item[0], score[0])
    break
class LinearMatrixFactorization(gluon.HybridBlock):
    """Plain matrix factorization.

    One k-dimensional embedding per user and per item; the predicted rating is
    the inner product of the two ReLU-rectified latent vectors.
    """

    def __init__(self, k, max_user=max_user, max_item=max_item):
        super().__init__()
        # Lookup tables mapping integer ids to k-dimensional latent factors.
        self.user_embedding = gluon.nn.Embedding(input_dim=max_user, output_dim=k)
        self.item_embedding = gluon.nn.Embedding(input_dim=max_item, output_dim=k)

    def forward(self, user, item):
        u = npx.relu(self.user_embedding(user))
        v = npx.relu(self.item_embedding(item))
        # Row-wise inner product: elementwise multiply, then reduce over the
        # factor axis.
        return (u * v).sum(axis=1).flatten()
net1 = LinearMatrixFactorization(64)
net1.initialize(mx.init.Xavier(), ctx=ctx)
# Visualize the graph. NOTE(review): feeding mx.sym symbols into a numpy-mode
# Gluon block looks fragile under MXNet 2.x — confirm this cell still runs.
mx.viz.plot_network(net1(mx.sym.var('user'), mx.sym.var('item')), node_attrs={"fixedsize":"false"})
# `user`/`item` are the sample batch captured from the test loader above.
net1.summary(user.to_device(ctx[0]), item.to_device(ctx[0]))
# Train with the project-local train() helper (default optimizer, i.e. no Adam).
losses_1 = train(net1, train_data, test_data, epochs=15, learning_rate=1, ctx=ctx)
# Notebook cell output: per-epoch (train_loss, test_loss) pairs (see the
# zip(*losses_1) unpacking at the bottom of the notebook).
losses_1
The optimizer used for training and its hyper-parameters greatly influence how fast the model converges. We can try the Adam optimizer, which will often converge much faster than SGD without momentum as we used before. You should see this model over-fitting quickly.
# Re-initialize and train the same linear architecture with Adam instead.
net1 = LinearMatrixFactorization(64)
net1.initialize(mx.init.Xavier(), ctx=ctx)
losses_1_adam = train(net1, train_data, test_data, epochs=15, optimizer='adam', learning_rate=0.01, ctx=ctx)
# For the linear model the full rating matrix is just the product of the two
# embedding tables: (users x k) . (k x items).
# NOTE(review): forward() applies relu to the embeddings before the dot
# product; this reconstruction skips the relu — confirm that is intended.
ratings = np.dot(net1.user_embedding.weight.data(ctx=ctx[0]), net1.item_embedding.weight.data(ctx=ctx[0]).T).asnumpy()
ratings.shape
def evaluate_embeddings(ratings):
    """Visualize a (users x items) matrix of predicted ratings and print the
    top-5 best, worst and most controversial movies.

    Parameters
    ----------
    ratings : numpy.ndarray
        2-D array of predicted scores, one row per user, one column per item.
        Column index is assumed to map to movie id (1-based, hence the -1 when
        indexing the title list).

    Side effects: draws a matplotlib heatmap, prints to stdout, and reads
    movie titles from 'ml-100k/u.item'.
    """
    # Hoisted: the per-item mean is reused for sorting and for every print below.
    mean_ratings = ratings.mean(axis=0)

    plt.figure(figsize=(15, 15))
    plt.xlabel('items')
    plt.ylabel('users')
    plt.title('Users estimated ratings of items sorted by mean ratings across users')
    # Columns reordered by decreasing mean rating so the structure is visible.
    im = plt.imshow(ratings[:, mean_ratings.argsort()[::-1]])
    plt.colorbar(im, fraction=0.026, pad=0.04, label="score")

    top_5_movies = mean_ratings.argsort()[::-1][:5]                 # highest mean projected rating
    worst_5_movies = mean_ratings.argsort()[:5]                     # lowest mean projected rating
    top_5_controversial = ratings.std(axis=0).argsort()[::-1][:5]   # largest variance across users

    # u.item is pipe-separated and latin-1 encoded; field 1 is the movie title.
    # (Fix: the original read raw bytes and str()-ed them, which mangles
    # accented titles into escape sequences.)
    with open('ml-100k/u.item', encoding='latin-1') as f:
        movies = f.readlines()

    def _print_movies(header, indices):
        # One line per movie: "<title>, average rating <mean>".
        print(header)
        for movie in indices:
            title = movies[int(movie) - 1].split("|")[1]
            print("{}, average rating {:.2f}".format(title, mean_ratings[movie]))

    _print_movies("Top 5 movies:", top_5_movies)
    _print_movies("\nWorst 5 movies:", worst_5_movies)
    _print_movies("\n5 most controversial movies:", top_5_controversial)
# Render the heatmap and ranking lists for the linear model's ratings.
evaluate_embeddings(ratings)
We can observe that some movies tend to be widely recommended or not recommended, while some others have more variance in their predicted scores.
We don't have to limit ourselves to the weights of the linear embedding layer for our user or item embeddings. We can have a more complex pipeline combining fully connected layers and non-linear activations.
class MLPMatrixFactorization(gluon.HybridBlock):
    """Matrix factorization with a non-linear twist.

    Each id embedding is rectified and passed through one dense layer before
    the inner product, so user and item representations are learned
    transformations rather than raw embedding rows.
    """

    def __init__(self, k, hidden, max_user=max_user, max_item=max_item):
        super().__init__()
        # User tower: id -> k-dim latent factor -> hidden-dim projection.
        self.user_embedding = gluon.nn.Embedding(input_dim=max_user, output_dim=k)
        self.user_mlp = gluon.nn.Dense(hidden)
        # Item tower, same shape.
        self.item_embedding = gluon.nn.Embedding(input_dim=max_item, output_dim=k)
        self.item_mlp = gluon.nn.Dense(hidden)

    def forward(self, user, item):
        # Each tower: embedding lookup -> ReLU -> dense projection.
        u = self.user_mlp(npx.relu(self.user_embedding(user)))
        v = self.item_mlp(npx.relu(self.item_embedding(item)))
        # Row-wise inner product gives the predicted score.
        return (u * v).sum(axis=1).flatten()
net2 = MLPMatrixFactorization(64, 64)
net2.initialize(mx.init.Xavier(), ctx=ctx)
# Graph visualization — NOTE(review): feeding mx.sym symbols into a numpy-mode
# Gluon block may not work on MXNet 2.x; confirm this cell runs.
mx.viz.plot_network(net2(mx.sym.var('user'), mx.sym.var('item')), node_attrs={"fixedsize":"false"})
net2.summary(user.to_device(ctx[0]), item.to_device(ctx[0]))
# Default optimizer run, for comparison with the Adam run below.
losses_2 = train(net2, train_data, test_data, epochs=15, ctx=ctx)
We can try training with the Adam optimizer instead
# Same MLP architecture, re-initialized and trained with Adam.
net2 = MLPMatrixFactorization(64, 64)
net2.initialize(mx.init.Xavier(), ctx=ctx)
losses_2_adam = train(net2, train_data, test_data, epochs=15, optimizer='adam', learning_rate=0.01, ctx=ctx)
Borrowing ideas from Deep Residual Learning for Image Recognition (He, et al.) to build a complex deep network that is aggressively regularized, thanks to the dropout layers, to avoid over-fitting, but still achieves good performance.
def get_residual_block(hidden=64):
    """Return the residual branch used by the ResNet-style towers below:
    Dense(hidden, relu) -> Dropout(0.5) -> Dense(hidden)."""
    layers = [
        gluon.nn.Dense(hidden, activation='relu'),
        gluon.nn.Dropout(0.5),
        gluon.nn.Dense(hidden),
    ]
    block = gluon.nn.HybridSequential()
    block.add(*layers)
    return block
class ResNetMatrixFactorization(gluon.HybridBlock):
    """Matrix factorization with two residual stages per tower.

    Each tower is: embedding -> (residual branch added back to input, ReLU)
    -> dropout -> (residual branch added back, ReLU). The predicted score is
    the row-wise inner product of the user and item tower outputs.

    NOTE: the residual add `x + block(x)` requires k == hidden (as used by
    net3 below, which passes 128 for both).
    """

    def __init__(self, k, hidden, max_user=max_user, max_item=max_item):
        super().__init__()
        # User tower.
        self.user_embedding = gluon.nn.Embedding(input_dim=max_user, output_dim=k)
        self.user_block1 = get_residual_block(hidden)
        self.user_dropout = gluon.nn.Dropout(0.5)
        self.user_block2 = get_residual_block(hidden)
        # Item tower, mirror of the user tower.
        self.item_embedding = gluon.nn.Embedding(input_dim=max_item, output_dim=k)
        self.item_block1 = get_residual_block(hidden)
        self.item_dropout = gluon.nn.Dropout(0.5)
        self.item_block2 = get_residual_block(hidden)

    def _tower(self, x, block1, dropout, block2):
        # Residual stage: branch output added back to its input, then ReLU;
        # dropout regularizes between the two stages.
        h = dropout(npx.relu(x + block1(x)))
        return npx.relu(h + block2(h))

    def forward(self, user, item):
        u = self._tower(self.user_embedding(user),
                        self.user_block1, self.user_dropout, self.user_block2)
        v = self._tower(self.item_embedding(item),
                        self.item_block1, self.item_dropout, self.item_block2)
        # Row-wise inner product.
        return (u * v).sum(axis=1).flatten()
net3 = ResNetMatrixFactorization(128, 128)
net3.initialize(mx.init.Xavier(), ctx=ctx)
# Graph visualization — NOTE(review): feeding mx.sym symbols into a numpy-mode
# Gluon block may not work on MXNet 2.x; confirm this cell runs.
mx.viz.plot_network(net3(mx.sym.var('user'), mx.sym.var('item')), node_attrs={"fixedsize":"false"})
# Fix: use to_device() like every other cell in this notebook — as_in_context()
# is the legacy MXNet 1.x name and is not part of the 2.x numpy-array API.
net3.summary(user.to_device(ctx[0]), item.to_device(ctx[0]))
# num_epoch_lr presumably drives a learning-rate schedule — see matrix_fact.train.
losses_3 = train(net3, train_data, test_data, epochs=15, optimizer='adam', learning_rate=0.001, ctx=ctx, num_epoch_lr=10)
Unlike the linear model, where we can use the embedding weights directly, here we compute a prediction for every user/item combination and store the predicted ratings.
%%time
users = []
items = []
for i in range(max_user):
for j in range(max_item):
users.append(i+1)
items.append(j+1)
dataset = gluon.data.ArrayDataset(onp.array(users).astype('float32'), onp.array(items).astype('float32'))
dataloader = gluon.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)
ratings = onp.zeros((max_user+1, max_item+1))
for users, items in dataloader:
users = users.to_device(ctx[0])
items = items.to_device(ctx[0])
scores = net3(users, items).asnumpy()
ratings[users.asnumpy().astype('int32'), items.asnumpy().astype('int32')] = scores.reshape(-1)
evaluate_embeddings(ratings)
Now let's draw a single chart that compares the learning curves of the different models and optimizers.
# Unzip the per-epoch (train_loss, test_loss) pairs recorded by train().
train_1, test_1 = list(zip(*losses_1))
train_1a, test_1a = list(zip(*losses_1_adam))
train_2, test_2 = list(zip(*losses_2))
train_2a, test_2a = list(zip(*losses_2_adam))
train_3a, test_3a = list(zip(*losses_3))
# Notebook cell output: inspect the raw pairs of the linear Adam run.
losses_1_adam
plt.figure(figsize=(20,20))
plt.xlabel('epochs')
plt.ylabel('loss')
plt.title('Evolution of training and testing losses')
# All runs used epochs=15, so every curve shares this x axis.
x = range(15)
# Solid line = test loss, dashed = train loss; one color per model/optimizer.
h1, = plt.plot(x, test_1, 'c', label='test loss Linear')
h2, = plt.plot(x, train_1, 'c--', label='train loss Linear')
h3, = plt.plot(x, test_1a, 'b', label='test loss Linear Adam')
h4, = plt.plot(x, train_1a, 'b--', label='train loss Linear Adam')
h5, = plt.plot(x, test_2, 'r', label='test loss MLP')
h6, = plt.plot(x, train_2, 'r--', label='train loss MLP')
h7, = plt.plot(x, test_2a, 'm', label='test loss MLP Adam')
h8, = plt.plot(x, train_2a, 'm--', label='train loss MLP Adam')
h9, = plt.plot(x, test_3a, 'g', label='test loss ResNet Adam')
h10, = plt.plot(x, train_3a, 'g--', label='train loss ResNet Adam')
l = plt.legend(handles=[h1, h2, h3, h4, h5, h6, h7, h8, h9, h10])
This tutorial is inspired by some examples from xlvector/github.