Back to Recommenders

MIND Utils Generation

examples/01_prepare_data/mind_utils.ipynb

1.2.16.3 KB
Original Source

<i>Copyright (c) Recommenders contributors.</i>

<i>Licensed under the MIT License.</i>

MIND Utils Generation

The MIND dataset [1] is a large-scale English news dataset. It was collected from anonymized behavior logs of the Microsoft News website. MIND contains 1,000,000 users, 161,013 news articles and 15,777,377 impression logs. Every news article contains rich textual content including title, abstract, body, category and entities. Each impression log contains the click events, non-clicked events and historical news click behaviors of the user before that impression.

Many news recommendation methods use word embeddings, news vertical embeddings, news subvertical embeddings and user id embeddings. Therefore, it is necessary to generate a word dictionary, a vertical dictionary, a subvertical dictionary and a user id dictionary to convert words, news verticals, subverticals and user ids from strings to indexes. To use pretrained word embeddings, an embedding matrix is generated as the initial weight of the word embedding layer.

This notebook shows how to generate:

  • word_dict.pkl: convert the words in news titles into indexes.
  • word_dict_all.pkl: convert the words in news titles and abstracts into indexes.
  • embedding.npy: pretrained word embedding matrix of words in word_dict.pkl
  • embedding_all.npy: pretrained embedding matrix of words in word_dict_all.pkl
  • vert_dict.pkl: convert news verticals into indexes.
  • subvert_dict.pkl: convert news subverticals into indexes.
  • uid2index.pkl: convert user ids into indexes.
python
import os
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle
from collections import Counter
from tempfile import TemporaryDirectory

from recommenders.datasets.mind import (download_mind,
                                     extract_mind,
                                     download_and_extract_glove,
                                     load_glove_matrix,
                                     word_tokenize
                                    )
from recommenders.datasets.download_utils import unzip_file
from recommenders.utils.notebook_utils import store_metadata

# Report the interpreter version so a run of this notebook can be reproduced
print(f"System version: {sys.version}")

python
# Which MIND release to download: "demo", "small" or "large"
mind_type = "demo"
# Dimension of the pretrained word vectors; must be one of 50, 100, 200 or 300
word_embedding_dim = 300
python
# Work inside a throwaway directory; it is removed by tmpdir.cleanup() at the end
tmpdir = TemporaryDirectory()
data_path = tmpdir.name

# Fetch the train/validation archives and unpack each into its own subfolder,
# keeping the zip files on disk (clean_zip_file=False)
train_zip, valid_zip = download_mind(size=mind_type, dest_path=data_path)
for archive, split in ((train_zip, 'train'), (valid_zip, 'valid')):
    unzip_file(archive, os.path.join(data_path, split), clean_zip_file=False)

# Folder that collects every generated dictionary and embedding file
output_path = os.path.join(data_path, 'utils')
os.makedirs(output_path, exist_ok=True)

Prepare utils of news

  • word dictionary
  • vertical dictionary
  • subvertical dictionary
python
# Load the training news table. Column names are supplied explicitly and only
# the four columns used by the cells below are kept.
news = pd.read_csv(
    os.path.join(data_path, 'train', 'news.tsv'),
    sep='\t',
    names=['newid', 'vertical', 'subvertical', 'title',
           'abstract', 'url', 'entities in title', 'entities in abstract'],
    usecols=['vertical', 'subvertical', 'title', 'abstract'],
)
python
# Preview the first five rows of the loaded news table
news.head(n=5)
python
# Vertical dictionary: each distinct vertical gets a 1-based index in order of
# first appearance (index 0 is left unused — presumably reserved for
# padding/unknown; confirm against the consumers of vert_dict.pkl).
news_vertical = news.vertical.drop_duplicates().reset_index(drop=True)
vert_dict_inv = news_vertical.to_dict()
vert_dict = {v: k + 1 for k, v in vert_dict_inv.items()}

# Subvertical dictionary, built the same way from the subvertical column.
# It must be derived from subvert_dict_inv; building it from vert_dict_inv
# would just produce a second copy of the vertical dictionary.
news_subvertical = news.subvertical.drop_duplicates().reset_index(drop=True)
subvert_dict_inv = news_subvertical.to_dict()
subvert_dict = {v: k + 1 for k, v in subvert_dict_inv.items()}
python
# Replace the raw title and abstract text with the output of word_tokenize
news['title'] = news['title'].apply(word_tokenize)
news['abstract'] = news['abstract'].apply(word_tokenize)
python
# word_cnt counts tokens from titles only; word_cnt_all counts tokens from
# titles and abstracts.
word_cnt = Counter()
word_cnt_all = Counter()

# Walk the two token columns in lockstep instead of calling news.loc[i] three
# times per row: each .loc lookup builds a full row Series, which is slow on
# the larger MIND releases. The update order (title, then abstract, row by
# row) is unchanged, so the Counters' key order is the same as before.
for title_tokens, abstract_tokens in tqdm(zip(news['title'], news['abstract']),
                                          total=len(news)):
    word_cnt.update(title_tokens)
    word_cnt_all.update(title_tokens)
    word_cnt_all.update(abstract_tokens)
python
# Number the words from 1 in the order the Counters first saw them
word_dict = {word: idx for idx, word in enumerate(word_cnt, start=1)}
word_dict_all = {word: idx for idx, word in enumerate(word_cnt_all, start=1)}
python
# Pickle each news-side dictionary into output_path under its own file name
for pkl_name, mapping in (
    ('vert_dict.pkl', vert_dict),
    ('subvert_dict.pkl', subvert_dict),
    ('word_dict.pkl', word_dict),
    ('word_dict_all.pkl', word_dict_all),
):
    with open(os.path.join(output_path, pkl_name), 'wb') as f:
        pickle.dump(mapping, f)

Prepare embedding matrices

  • embedding.npy
  • embedding_all.npy
python
# Obtain the GloVe vectors under the working directory; the returned path is
# what load_glove_matrix receives in the next cell.
glove_path = download_and_extract_glove(data_path)
python
# Build one embedding matrix per vocabulary (titles only, and titles + abstracts)
# using vectors of size word_embedding_dim.
# NOTE(review): the second return value is presumably the vocabulary words that
# were found in GloVe — confirm against load_glove_matrix; only its length is
# used later in this notebook.
embedding_matrix, exist_word = load_glove_matrix(glove_path, word_dict, word_embedding_dim)
embedding_all_matrix, exist_all_word = load_glove_matrix(glove_path, word_dict_all, word_embedding_dim)
python
# Write both embedding matrices next to the pickled dictionaries
for npy_name, matrix in (
    ('embedding.npy', embedding_matrix),
    ('embedding_all.npy', embedding_all_matrix),
):
    np.save(os.path.join(output_path, npy_name), matrix)

Prepare uid2index.pkl

python
# Map every user id seen in the training behaviors to a 1-based index,
# assigned in order of first appearance.
uid2index = {}

behaviors_file = os.path.join(data_path, 'train', 'behaviors.tsv')
with open(behaviors_file, 'r') as f:
    for line in tqdm(f):
        # The user id is the second tab-separated field of each line
        fields = line.strip('\n').split('\t')
        uid = fields[1]
        if uid in uid2index:
            continue
        uid2index[uid] = len(uid2index) + 1
python
# Persist the user-id dictionary alongside the other generated files
uid2index_file = os.path.join(output_path, 'uid2index.pkl')
with open(uid2index_file, 'wb') as f:
    pickle.dump(uid2index, f)
python
# Summary of the sizes of everything generated above; shown as the cell output
utils_state = dict(
    vert_num=len(vert_dict),
    subvert_num=len(subvert_dict),
    word_num=len(word_dict),
    word_num_all=len(word_dict_all),
    embedding_exist_num=len(exist_word),
    embedding_exist_num_all=len(exist_all_word),
    uid2index=len(uid2index),
)
utils_state
python
# Record results for tests - ignore this cell
store_metadata("vert_num", len(vert_dict))
store_metadata("subvert_num", len(subvert_dict))
store_metadata("word_num", len(word_dict))
store_metadata("word_num_all", len(word_dict_all))
store_metadata("embedding_exist_num", len(exist_word))
store_metadata("embedding_exist_num_all", len(exist_all_word))
store_metadata("uid2index", len(uid2index))
python
# Remove the temporary working directory, including the downloaded data and
# every file generated under output_path.
tmpdir.cleanup()

References

[1] Wu, Fangzhao, et al. "MIND: A Large-scale Dataset for News Recommendation" Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics. https://msnews.github.io/competition.html