example/recommenders/demo1-MF.ipynb
Demonstrates matrix factorization with MXNet on the MovieLens 100k dataset. We perform collaborative filtering, where the recommendations are based on users' previous ratings.
We try to learn embeddings for users and movies, based on users' partial ratings of movies, in order to estimate future movie ratings.
For more deep learning based architecture for recommendation, refer to this survey: Deep Learning based Recommender System: A Survey and New Perspectives
import matplotlib.pyplot as plt
import mxnet as mx
from mxnet import gluon, np, npx, autograd
import numpy as onp
from matrix_fact import train
from movielens_data import get_dataset, max_id
# Use the first GPU when one is available, otherwise fall back to CPU.
ctx = [mx.gpu(0)] if mx.device.num_gpus() > 0 else [mx.cpu()]
batch_size = 128
# Train/test splits come from the project-local movielens_data helper.
train_dataset, test_dataset = get_dataset()
# Highest user id and item id in the raw ratings file — used below to size the
# embedding tables.
max_user, max_item = max_id('./ml-100k/u.data')
# Notebook cell output: show the dataset dimensions.
(max_user, max_item)
# 'rollover' carries an epoch's leftover samples into the next epoch so every
# training batch has exactly batch_size rows.
train_data = gluon.data.DataLoader(train_dataset, shuffle=True, last_batch='rollover', batch_size=batch_size, num_workers=0)
test_data = gluon.data.DataLoader(test_dataset, shuffle=True, batch_size=batch_size, num_workers=0)
# Peek at one (user, item, rating) triple from the test loader.
# NOTE: `user` and `item` keep the batch drawn here; they are reused further
# down as example inputs for the net.summary() calls.
for user, item, score in test_data:
    print(user[0], item[0], score[0])
    break
class LinearMatrixFactorization(gluon.HybridBlock):
    """Plain matrix factorization.

    One k-dimensional embedding per user and per item; the predicted rating is
    the inner product of the two ReLU-rectified latent vectors.
    """

    def __init__(self, k, max_user=max_user, max_item=max_item):
        super().__init__()
        # Lookup tables mapping integer ids to k-dimensional latent factors.
        self.user_embedding = gluon.nn.Embedding(input_dim=max_user, output_dim=k)
        self.item_embedding = gluon.nn.Embedding(input_dim=max_item, output_dim=k)

    def forward(self, user, item):
        u = npx.relu(self.user_embedding(user))
        v = npx.relu(self.item_embedding(item))
        # Row-wise inner product: elementwise multiply, then reduce over the
        # factor axis.
        return (u * v).sum(axis=1).flatten()
net1 = LinearMatrixFactorization(64)
net1.initialize(mx.init.Xavier(), ctx=ctx)
# Visualize the graph. NOTE(review): feeding mx.sym symbols into a numpy-mode
# Gluon block looks fragile under MXNet 2.x — confirm this cell still runs.
mx.viz.plot_network(net1(mx.sym.var('user'), mx.sym.var('item')), node_attrs={"fixedsize":"false"})
# `user`/`item` are the sample batch captured from the test loader above.
net1.summary(user.to_device(ctx[0]), item.to_device(ctx[0]))
# Train with the project-local train() helper (default optimizer, i.e. no Adam).
losses_1 = train(net1, train_data, test_data, epochs=15, learning_rate=1, ctx=ctx)
# Notebook cell output: per-epoch (train_loss, test_loss) pairs (see the
# zip(*losses_1) unpacking at the bottom of the notebook).
losses_1
The optimizer used for training and its hyper-parameters greatly influence how fast the model converges. We can try the Adam optimizer, which will often converge much faster than SGD without momentum as we used before. You should see this model over-fitting quickly.
# Re-initialize and train the same linear architecture with Adam instead.
net1 = LinearMatrixFactorization(64)
net1.initialize(mx.init.Xavier(), ctx=ctx)
losses_1_adam = train(net1, train_data, test_data, epochs=15, optimizer='adam', learning_rate=0.01, ctx=ctx)
# For the linear model the full rating matrix is just the product of the two
# embedding tables: (users x k) . (k x items).
# NOTE(review): forward() applies relu to the embeddings before the dot
# product; this reconstruction skips the relu — confirm that is intended.
ratings = np.dot(net1.user_embedding.weight.data(ctx=ctx[0]), net1.item_embedding.weight.data(ctx=ctx[0]).T).asnumpy()
ratings.shape
def evaluate_embeddings(ratings):
    """Visualize a (users x items) matrix of predicted ratings and print the
    top-5 best, worst and most controversial movies.

    Parameters
    ----------
    ratings : numpy.ndarray
        2-D array of predicted scores, one row per user, one column per item.
        Column index is assumed to map to movie id (1-based, hence the -1 when
        indexing the title list).

    Side effects: draws a matplotlib heatmap, prints to stdout, and reads
    movie titles from 'ml-100k/u.item'.
    """
    # Hoisted: the per-item mean is reused for sorting and for every print below.
    mean_ratings = ratings.mean(axis=0)

    plt.figure(figsize=(15, 15))
    plt.xlabel('items')
    plt.ylabel('users')
    plt.title('Users estimated ratings of items sorted by mean ratings across users')
    # Columns reordered by decreasing mean rating so the structure is visible.
    im = plt.imshow(ratings[:, mean_ratings.argsort()[::-1]])
    plt.colorbar(im, fraction=0.026, pad=0.04, label="score")

    top_5_movies = mean_ratings.argsort()[::-1][:5]                 # highest mean projected rating
    worst_5_movies = mean_ratings.argsort()[:5]                     # lowest mean projected rating
    top_5_controversial = ratings.std(axis=0).argsort()[::-1][:5]   # largest variance across users

    # u.item is pipe-separated and latin-1 encoded; field 1 is the movie title.
    # (Fix: the original read raw bytes and str()-ed them, which mangles
    # accented titles into escape sequences.)
    with open('ml-100k/u.item', encoding='latin-1') as f:
        movies = f.readlines()

    def _print_movies(header, indices):
        # One line per movie: "<title>, average rating <mean>".
        print(header)
        for movie in indices:
            title = movies[int(movie) - 1].split("|")[1]
            print("{}, average rating {:.2f}".format(title, mean_ratings[movie]))

    _print_movies("Top 5 movies:", top_5_movies)
    _print_movies("\nWorst 5 movies:", worst_5_movies)
    _print_movies("\n5 most controversial movies:", top_5_controversial)
# Render the heatmap and ranking lists for the linear model's ratings.
evaluate_embeddings(ratings)
We can observe that some movies tend to be widely recommended or not recommended, while some others have more variance in their predicted scores.
We don't have to limit ourselves to the weights of the linear embedding layer for our user or item embeddings. We can have a more complex pipeline combining fully connected layers and non-linear activations.
class MLPMatrixFactorization(gluon.HybridBlock):
    """Matrix factorization with a non-linear twist.

    Each id embedding is rectified and passed through one dense layer before
    the inner product, so user and item representations are learned
    transformations rather than raw embedding rows.
    """

    def __init__(self, k, hidden, max_user=max_user, max_item=max_item):
        super().__init__()
        # User tower: id -> k-dim latent factor -> hidden-dim projection.
        self.user_embedding = gluon.nn.Embedding(input_dim=max_user, output_dim=k)
        self.user_mlp = gluon.nn.Dense(hidden)
        # Item tower, same shape.
        self.item_embedding = gluon.nn.Embedding(input_dim=max_item, output_dim=k)
        self.item_mlp = gluon.nn.Dense(hidden)

    def forward(self, user, item):
        # Each tower: embedding lookup -> ReLU -> dense projection.
        u = self.user_mlp(npx.relu(self.user_embedding(user)))
        v = self.item_mlp(npx.relu(self.item_embedding(item)))
        # Row-wise inner product gives the predicted score.
        return (u * v).sum(axis=1).flatten()
net2 = MLPMatrixFactorization(64, 64)
net2.initialize(mx.init.Xavier(), ctx=ctx)
# Graph visualization — NOTE(review): feeding mx.sym symbols into a numpy-mode
# Gluon block may not work on MXNet 2.x; confirm this cell runs.
mx.viz.plot_network(net2(mx.sym.var('user'), mx.sym.var('item')), node_attrs={"fixedsize":"false"})
net2.summary(user.to_device(ctx[0]), item.to_device(ctx[0]))
# Default optimizer run, for comparison with the Adam run below.
losses_2 = train(net2, train_data, test_data, epochs=15, ctx=ctx)
We can try training with the Adam optimizer instead
# Same MLP architecture, re-initialized and trained with Adam.
net2 = MLPMatrixFactorization(64, 64)
net2.initialize(mx.init.Xavier(), ctx=ctx)
losses_2_adam = train(net2, train_data, test_data, epochs=15, optimizer='adam', learning_rate=0.01, ctx=ctx)
Borrowing ideas from Deep Residual Learning for Image Recognition (He, et al.) to build a complex deep network that is aggressively regularized, thanks to the dropout layers, to avoid over-fitting, but still achieves good performance.
def get_residual_block(hidden=64):
    """Return the residual branch used by the ResNet-style towers below:
    Dense(hidden, relu) -> Dropout(0.5) -> Dense(hidden)."""
    layers = [
        gluon.nn.Dense(hidden, activation='relu'),
        gluon.nn.Dropout(0.5),
        gluon.nn.Dense(hidden),
    ]
    block = gluon.nn.HybridSequential()
    block.add(*layers)
    return block
class ResNetMatrixFactorization(gluon.HybridBlock):
    """Matrix factorization with two residual stages per tower.

    Each tower is: embedding -> (residual branch added back to input, ReLU)
    -> dropout -> (residual branch added back, ReLU). The predicted score is
    the row-wise inner product of the user and item tower outputs.

    NOTE: the residual add `x + block(x)` requires k == hidden (as used by
    net3 below, which passes 128 for both).
    """

    def __init__(self, k, hidden, max_user=max_user, max_item=max_item):
        super().__init__()
        # User tower.
        self.user_embedding = gluon.nn.Embedding(input_dim=max_user, output_dim=k)
        self.user_block1 = get_residual_block(hidden)
        self.user_dropout = gluon.nn.Dropout(0.5)
        self.user_block2 = get_residual_block(hidden)
        # Item tower, mirror of the user tower.
        self.item_embedding = gluon.nn.Embedding(input_dim=max_item, output_dim=k)
        self.item_block1 = get_residual_block(hidden)
        self.item_dropout = gluon.nn.Dropout(0.5)
        self.item_block2 = get_residual_block(hidden)

    def _tower(self, x, block1, dropout, block2):
        # Residual stage: branch output added back to its input, then ReLU;
        # dropout regularizes between the two stages.
        h = dropout(npx.relu(x + block1(x)))
        return npx.relu(h + block2(h))

    def forward(self, user, item):
        u = self._tower(self.user_embedding(user),
                        self.user_block1, self.user_dropout, self.user_block2)
        v = self._tower(self.item_embedding(item),
                        self.item_block1, self.item_dropout, self.item_block2)
        # Row-wise inner product.
        return (u * v).sum(axis=1).flatten()
net3 = ResNetMatrixFactorization(128, 128)
net3.initialize(mx.init.Xavier(), ctx=ctx)
# Graph visualization — NOTE(review): feeding mx.sym symbols into a numpy-mode
# Gluon block may not work on MXNet 2.x; confirm this cell runs.
mx.viz.plot_network(net3(mx.sym.var('user'), mx.sym.var('item')), node_attrs={"fixedsize":"false"})
# Fix: use to_device() like every other cell in this notebook — as_in_context()
# is the legacy MXNet 1.x name and is not part of the 2.x numpy-array API.
net3.summary(user.to_device(ctx[0]), item.to_device(ctx[0]))
# num_epoch_lr presumably drives a learning-rate schedule — see matrix_fact.train.
losses_3 = train(net3, train_data, test_data, epochs=15, optimizer='adam', learning_rate=0.001, ctx=ctx, num_epoch_lr=10)
Unlike the linear model, where we can use the embedding weights directly, here we compute a prediction for every user/item combination and store the predicted ratings.
%%time
users = []
items = []
for i in range(max_user):
for j in range(max_item):
users.append(i+1)
items.append(j+1)
dataset = gluon.data.ArrayDataset(onp.array(users).astype('float32'), onp.array(items).astype('float32'))
dataloader = gluon.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)
ratings = onp.zeros((max_user+1, max_item+1))
for users, items in dataloader:
users = users.to_device(ctx[0])
items = items.to_device(ctx[0])
scores = net3(users, items).asnumpy()
ratings[users.asnumpy().astype('int32'), items.asnumpy().astype('int32')] = scores.reshape(-1)
evaluate_embeddings(ratings)
Now let's draw a single chart that compares the learning curves of the different models and optimizers.
# Unzip the per-epoch (train_loss, test_loss) pairs recorded by train().
train_1, test_1 = list(zip(*losses_1))
train_1a, test_1a = list(zip(*losses_1_adam))
train_2, test_2 = list(zip(*losses_2))
train_2a, test_2a = list(zip(*losses_2_adam))
train_3a, test_3a = list(zip(*losses_3))
# Notebook cell output: inspect the raw pairs of the linear Adam run.
losses_1_adam
plt.figure(figsize=(20,20))
plt.xlabel('epochs')
plt.ylabel('loss')
plt.title('Evolution of training and testing losses')
# All runs used epochs=15, so every curve shares this x axis.
x = range(15)
# Solid line = test loss, dashed = train loss; one color per model/optimizer.
h1, = plt.plot(x, test_1, 'c', label='test loss Linear')
h2, = plt.plot(x, train_1, 'c--', label='train loss Linear')
h3, = plt.plot(x, test_1a, 'b', label='test loss Linear Adam')
h4, = plt.plot(x, train_1a, 'b--', label='train loss Linear Adam')
h5, = plt.plot(x, test_2, 'r', label='test loss MLP')
h6, = plt.plot(x, train_2, 'r--', label='train loss MLP')
h7, = plt.plot(x, test_2a, 'm', label='test loss MLP Adam')
h8, = plt.plot(x, train_2a, 'm--', label='train loss MLP Adam')
h9, = plt.plot(x, test_3a, 'g', label='test loss ResNet Adam')
h10, = plt.plot(x, train_3a, 'g--', label='train loss ResNet Adam')
l = plt.legend(handles=[h1, h2, h3, h4, h5, h6, h7, h8, h9, h10])
This tutorial is inspired by some examples from xlvector/github.