Unverified Commit 9737b270 authored by Minjie Wang's avatar Minjie Wang Committed by GitHub

[Model][Hetero] GCMC using new hetero APIs (#860)

* init

* rm data

* README

* fix rating values that are not decimal

* rm stale codes

* small fix

* upd

* rewrite decoder

* fix many

* many fix; performance matched

* upd; handle sparse input

* upd

* address comments

* more docstring; download data automatically

* shared param mode
# Graph Convolutional Matrix Completion
Paper link: [https://arxiv.org/abs/1706.02263](https://arxiv.org/abs/1706.02263)
Author's code: [https://github.com/riannevdberg/gc-mc](https://github.com/riannevdberg/gc-mc)
This implementation does not handle side-channel features or mini-batching, and thus
achieves slightly worse performance when using node features.
Credit: Jiani Zhang ([@jennyzhang0215](https://github.com/jennyzhang0215))
## Dependencies
* MXNet 1.5.0+
* pandas
* gluonnlp
## Data
Supported datasets: ml-100k, ml-1m, ml-10m
## How to run
ml-100k, no feature
```bash
DGLBACKEND=mxnet python train.py --data_name=ml-100k --use_one_hot_fea --gcn_agg_accum=stack
```
Results: RMSE=0.9077 (0.910 reported)
Speed: 0.0246s/epoch (vanilla implementation: 0.1008s/epoch)
ml-100k, with feature
```bash
DGLBACKEND=mxnet python train.py --data_name=ml-100k --gcn_agg_accum=stack
```
Results: RMSE=0.9495 (0.905 reported)
ml-1m, no feature
```bash
DGLBACKEND=mxnet python train.py --data_name=ml-1m --gcn_agg_accum=sum --use_one_hot_fea
```
Results: RMSE=0.8377 (0.832 reported)
Speed: 0.0695s/epoch (vanilla implementation: 1.538s/epoch)
ml-10m, no feature
```bash
DGLBACKEND=mxnet python train.py --data_name=ml-10m --gcn_agg_accum=stack --gcn_dropout=0.3 \
--train_lr=0.001 --train_min_lr=0.0001 --train_max_iter=15000 \
--use_one_hot_fea --gen_r_num_basis_func=4
```
Results: RMSE=0.7875 (0.777 reported)
Speed: 0.6480s/epoch (vanilla implementation: OOM)
Testbed: EC2 p3.2xlarge
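As a quick sanity check, the `MovieLens` class in `data.py` (below) can also be used standalone. A minimal sketch, assuming the files in this example are importable and the MovieLens archive can be downloaded:
```python
# Minimal sketch: build the dataset and inspect the graphs constructed below.
import mxnet as mx
from data import MovieLens

dataset = MovieLens("ml-100k", ctx=mx.cpu(), use_one_hot_fea=True, symm=True)
print(dataset.possible_rating_values)              # e.g. [1 2 3 4 5]
print(dataset.train_enc_graph.etypes)              # one forward + one reverse etype per rating
print(dataset.train_dec_graph.number_of_edges())   # number of training pairs
```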
"""MovieLens dataset"""
import numpy as np
import os
import re
import pandas as pd
import scipy.sparse as sp
import gluonnlp as nlp
import mxnet as mx
import dgl
from dgl.data.utils import download, extract_archive, get_download_dir
_urls = {
'ml-100k' : 'http://files.grouplens.org/datasets/movielens/ml-100k.zip',
'ml-1m' : 'http://files.grouplens.org/datasets/movielens/ml-1m.zip',
'ml-10m' : 'http://files.grouplens.org/datasets/movielens/ml-10m.zip',
}
READ_DATASET_PATH = get_download_dir()
GENRES_ML_100K =\
['unknown', 'Action', 'Adventure', 'Animation',
'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
'Thriller', 'War', 'Western']
GENRES_ML_1M = GENRES_ML_100K[1:]
GENRES_ML_10M = GENRES_ML_100K + ['IMAX']
class MovieLens(object):
"""MovieLens dataset used by GCMC model
TODO(minjie): make this dataset more general
The dataset stores MovieLens ratings in two types of graphs. The encoder graph
contains rating value information in the form of edge types. The decoder graph
stores plain user-movie pairs in the form of a bipartite graph with no rating
information. All graphs have two types of nodes: "user" and "movie".
The training, validation and test set can be summarized as follows:
training_enc_graph : training user-movie pairs + rating info
training_dec_graph : training user-movie pairs
valid_enc_graph : training user-movie pairs + rating info
valid_dec_graph : validation user-movie pairs
test_enc_graph : training user-movie pairs + validation user-movie pairs + rating info
test_dec_graph : test user-movie pairs
Attributes
----------
train_enc_graph : dgl.DGLHeteroGraph
Encoder graph for training.
train_dec_graph : dgl.DGLHeteroGraph
Decoder graph for training.
train_labels : mx.nd.NDArray
The categorical label of each user-movie pair
train_truths : mx.nd.NDArray
The actual rating values of each user-movie pair
valid_enc_graph : dgl.DGLHeteroGraph
Encoder graph for validation.
valid_dec_graph : dgl.DGLHeteroGraph
Decoder graph for validation.
valid_labels : mx.nd.NDArray
The categorical label of each user-movie pair
valid_truths : mx.nd.NDArray
The actual rating values of each user-movie pair
test_enc_graph : dgl.DGLHeteroGraph
Encoder graph for test.
test_dec_graph : dgl.DGLHeteroGraph
Decoder graph for test.
test_labels : mx.nd.NDArray
The categorical label of each user-movie pair
test_truths : mx.nd.NDArray
The actual rating values of each user-movie pair
user_feature : mx.nd.NDArray
User feature tensor. If None, an identity matrix is used.
movie_feature : mx.nd.NDArray
Movie feature tensor. If None, an identity matrix is used.
possible_rating_values : np.ndarray
Available rating values in the dataset
Parameters
----------
name : str
Dataset name. Could be "ml-100k", "ml-1m", "ml-10m"
ctx : mx.context.Context
Device context
use_one_hot_fea : bool, optional
If true, the ``user_feature`` attribute is None, representing a one-hot identity
matrix. (Default: False)
symm : bool, optional
If true, use the symmetric normalization constant. Otherwise, use the left
normalization constant. (Default: True)
test_ratio : float, optional
Ratio of test data. (Default: 0.1)
valid_ratio : float, optional
Ratio of validation data. (Default: 0.1)
"""
def __init__(self, name, ctx, use_one_hot_fea=False, symm=True,
test_ratio=0.1, valid_ratio=0.1):
self._name = name
self._ctx = ctx
self._symm = symm
self._test_ratio = test_ratio
self._valid_ratio = valid_ratio
# download and extract
download_dir = get_download_dir()
zip_file_path = '{}/{}.zip'.format(download_dir, name)
download(_urls[name], path=zip_file_path)
extract_archive(zip_file_path, '{}/{}'.format(download_dir, name))
if name == 'ml-10m':
root_folder = 'ml-10M100K'
else:
root_folder = name
self._dir = os.path.join(download_dir, name, root_folder)
print("Starting processing {} ...".format(self._name))
self._load_raw_user_info()
self._load_raw_movie_info()
print('......')
if self._name == 'ml-100k':
self.all_train_rating_info = self._load_raw_rates(os.path.join(self._dir, 'u1.base'), '\t')
self.test_rating_info = self._load_raw_rates(os.path.join(self._dir, 'u1.test'), '\t')
self.all_rating_info = pd.concat([self.all_train_rating_info, self.test_rating_info])
elif self._name == 'ml-1m' or self._name == 'ml-10m':
self.all_rating_info = self._load_raw_rates(os.path.join(self._dir, 'ratings.dat'), '::')
num_test = int(np.ceil(self.all_rating_info.shape[0] * self._test_ratio))
shuffled_idx = np.random.permutation(self.all_rating_info.shape[0])
self.test_rating_info = self.all_rating_info.iloc[shuffled_idx[: num_test]]
self.all_train_rating_info = self.all_rating_info.iloc[shuffled_idx[num_test: ]]
else:
raise NotImplementedError
print('......')
num_valid = int(np.ceil(self.all_train_rating_info.shape[0] * self._valid_ratio))
shuffled_idx = np.random.permutation(self.all_train_rating_info.shape[0])
self.valid_rating_info = self.all_train_rating_info.iloc[shuffled_idx[: num_valid]]
self.train_rating_info = self.all_train_rating_info.iloc[shuffled_idx[num_valid: ]]
self.possible_rating_values = np.unique(self.train_rating_info["rating"].values)
print("All rating pairs : {}".format(self.all_rating_info.shape[0]))
print("\tAll train rating pairs : {}".format(self.all_train_rating_info.shape[0]))
print("\t\tTrain rating pairs : {}".format(self.train_rating_info.shape[0]))
print("\t\tValid rating pairs : {}".format(self.valid_rating_info.shape[0]))
print("\tTest rating pairs : {}".format(self.test_rating_info.shape[0]))
self.user_info = self._drop_unseen_nodes(orign_info=self.user_info,
cmp_col_name="id",
reserved_ids_set=set(self.all_rating_info["user_id"].values),
label="user")
self.movie_info = self._drop_unseen_nodes(orign_info=self.movie_info,
cmp_col_name="id",
reserved_ids_set=set(self.all_rating_info["movie_id"].values),
label="movie")
# Map user/movie to the global id
self.global_user_id_map = {ele: i for i, ele in enumerate(self.user_info['id'])}
self.global_movie_id_map = {ele: i for i, ele in enumerate(self.movie_info['id'])}
print('Total user number = {}, movie number = {}'.format(len(self.global_user_id_map),
len(self.global_movie_id_map)))
self._num_user = len(self.global_user_id_map)
self._num_movie = len(self.global_movie_id_map)
### Generate features
if use_one_hot_fea:
self.user_feature = None
self.movie_feature = None
else:
self.user_feature = mx.nd.array(self._process_user_fea(), ctx=ctx, dtype=np.float32)
self.movie_feature = mx.nd.array(self._process_movie_fea(), ctx=ctx, dtype=np.float32)
if self.user_feature is None:
self.user_feature_shape = (self.num_user, self.num_user)
self.movie_feature_shape = (self.num_movie, self.num_movie)
else:
self.user_feature_shape = self.user_feature.shape
self.movie_feature_shape = self.movie_feature.shape
info_line = "Feature dim: "
info_line += "\nuser: {}".format(self.user_feature_shape)
info_line += "\nmovie: {}".format(self.movie_feature_shape)
print(info_line)
all_train_rating_pairs, all_train_rating_values = self._generate_pair_value(self.all_train_rating_info)
train_rating_pairs, train_rating_values = self._generate_pair_value(self.train_rating_info)
valid_rating_pairs, valid_rating_values = self._generate_pair_value(self.valid_rating_info)
test_rating_pairs, test_rating_values = self._generate_pair_value(self.test_rating_info)
def _make_labels(ratings):
labels = mx.nd.array(np.searchsorted(self.possible_rating_values, ratings),
ctx=ctx, dtype=np.int32)
return labels
self.train_enc_graph = self._generate_enc_graph(train_rating_pairs, train_rating_values, add_support=True)
self.train_dec_graph = self._generate_dec_graph(train_rating_pairs)
self.train_labels = _make_labels(train_rating_values)
self.train_truths = mx.nd.array(train_rating_values, ctx=ctx, dtype=np.float32)
self.valid_enc_graph = self.train_enc_graph
self.valid_dec_graph = self._generate_dec_graph(valid_rating_pairs)
self.valid_labels = _make_labels(valid_rating_values)
self.valid_truths = mx.nd.array(valid_rating_values, ctx=ctx, dtype=np.float32)
self.test_enc_graph = self._generate_enc_graph(all_train_rating_pairs, all_train_rating_values, add_support=True)
self.test_dec_graph = self._generate_dec_graph(test_rating_pairs)
self.test_labels = _make_labels(test_rating_values)
self.test_truths = mx.nd.array(test_rating_values, ctx=ctx, dtype=np.float32)
def _npairs(graph):
rst = 0
for r in self.possible_rating_values:
rst += graph.number_of_edges(str(r))
return rst
print("Train enc graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
self.train_enc_graph.number_of_nodes('user'), self.train_enc_graph.number_of_nodes('movie'),
_npairs(self.train_enc_graph)))
print("Train dec graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
self.train_dec_graph.number_of_nodes('user'), self.train_dec_graph.number_of_nodes('movie'),
self.train_dec_graph.number_of_edges()))
print("Valid enc graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
self.valid_enc_graph.number_of_nodes('user'), self.valid_enc_graph.number_of_nodes('movie'),
_npairs(self.valid_enc_graph)))
print("Valid dec graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
self.valid_dec_graph.number_of_nodes('user'), self.valid_dec_graph.number_of_nodes('movie'),
self.valid_dec_graph.number_of_edges()))
print("Test enc graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
self.test_enc_graph.number_of_nodes('user'), self.test_enc_graph.number_of_nodes('movie'),
_npairs(self.test_enc_graph)))
print("Test dec graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
self.test_dec_graph.number_of_nodes('user'), self.test_dec_graph.number_of_nodes('movie'),
self.test_dec_graph.number_of_edges()))
def _generate_pair_value(self, rating_info):
rating_pairs = (np.array([self.global_user_id_map[ele] for ele in rating_info["user_id"]],
dtype=np.int64),
np.array([self.global_movie_id_map[ele] for ele in rating_info["movie_id"]],
dtype=np.int64))
rating_values = rating_info["rating"].values.astype(np.float32)
return rating_pairs, rating_values
def _generate_enc_graph(self, rating_pairs, rating_values, add_support=False):
user_movie_R = np.zeros((self._num_user, self._num_movie), dtype=np.float32)
user_movie_R[rating_pairs] = rating_values
movie_user_R = user_movie_R.transpose()
rating_graphs = []
rating_row, rating_col = rating_pairs
for rating in self.possible_rating_values:
ridx = np.where(rating_values == rating)
rrow = rating_row[ridx]
rcol = rating_col[ridx]
bg = dgl.bipartite((rrow, rcol), 'user', str(rating), 'movie',
card=(self._num_user, self._num_movie))
rev_bg = dgl.bipartite((rcol, rrow), 'movie', 'rev-%s' % str(rating), 'user',
card=(self._num_movie, self._num_user))
rating_graphs.append(bg)
rating_graphs.append(rev_bg)
graph = dgl.hetero_from_relations(rating_graphs)
# sanity check
assert len(rating_pairs[0]) == sum([graph.number_of_edges(et) for et in graph.etypes]) // 2
if add_support:
def _calc_norm(x):
x = x.asnumpy().astype('float32')
x[x == 0.] = np.inf
x = mx.nd.array(1. / np.sqrt(x))
return x.as_in_context(self._ctx).expand_dims(1)
user_ci = []
user_cj = []
movie_ci = []
movie_cj = []
for r in self.possible_rating_values:
r = str(r)
user_ci.append(graph['rev-%s' % r].in_degrees())
movie_ci.append(graph[r].in_degrees())
if self._symm:
user_cj.append(graph[r].out_degrees())
movie_cj.append(graph['rev-%s' % r].out_degrees())
else:
user_cj.append(mx.nd.zeros((self.num_user,)))
movie_cj.append(mx.nd.zeros((self.num_movie,)))
user_ci = _calc_norm(mx.nd.add_n(*user_ci))
movie_ci = _calc_norm(mx.nd.add_n(*movie_ci))
if self._symm:
user_cj = _calc_norm(mx.nd.add_n(*user_cj))
movie_cj = _calc_norm(mx.nd.add_n(*movie_cj))
else:
user_cj = mx.nd.ones((self.num_user,), ctx=self._ctx)
movie_cj = mx.nd.ones((self.num_movie,), ctx=self._ctx)
graph.nodes['user'].data.update({'ci' : user_ci, 'cj' : user_cj})
graph.nodes['movie'].data.update({'ci' : movie_ci, 'cj' : movie_cj})
return graph
def _generate_dec_graph(self, rating_pairs):
ones = np.ones_like(rating_pairs[0])
user_movie_ratings_coo = sp.coo_matrix(
(ones, rating_pairs),
shape=(self.num_user, self.num_movie), dtype=np.float32)
return dgl.bipartite(user_movie_ratings_coo, 'user', 'rate', 'movie')
@property
def num_links(self):
return self.possible_rating_values.size
@property
def num_user(self):
return self._num_user
@property
def num_movie(self):
return self._num_movie
def _drop_unseen_nodes(self, orign_info, cmp_col_name, reserved_ids_set, label):
# print(" -----------------")
# print("{}: {}(reserved) v.s. {}(from info)".format(label, len(reserved_ids_set),
# len(set(orign_info[cmp_col_name].values))))
if reserved_ids_set != set(orign_info[cmp_col_name].values):
pd_rating_ids = pd.DataFrame(list(reserved_ids_set), columns=["id_graph"])
# print("\torign_info: ({}, {})".format(orign_info.shape[0], orign_info.shape[1]))
data_info = orign_info.merge(pd_rating_ids, left_on=cmp_col_name, right_on='id_graph', how='outer')
data_info = data_info.dropna(subset=[cmp_col_name, 'id_graph'])
data_info = data_info.drop(columns=["id_graph"])
data_info = data_info.reset_index(drop=True)
# print("\tAfter dropping, data shape: ({}, {})".format(data_info.shape[0], data_info.shape[1]))
return data_info
else:
orign_info = orign_info.reset_index(drop=True)
return orign_info
def _load_raw_rates(self, file_path, sep):
"""In MovieLens, the rates have the following format
ml-100k
user id \t movie id \t rating \t timestamp
ml-1m/10m
UserID::MovieID::Rating::Timestamp
timestamp is unix timestamp and can be converted by pd.to_datetime(X, unit='s')
Parameters
----------
file_path : str
sep : str
Returns
-------
rating_info : pd.DataFrame
"""
rating_info = pd.read_csv(
file_path, sep=sep, header=None,
names=['user_id', 'movie_id', 'rating', 'timestamp'],
dtype={'user_id': np.int32, 'movie_id' : np.int32,
'rating': np.float32, 'timestamp': np.int64}, engine='python')
return rating_info
def _load_raw_user_info(self):
"""In MovieLens, the user attributes file have the following formats:
ml-100k:
user id | age | gender | occupation | zip code
ml-1m:
UserID::Gender::Age::Occupation::Zip-code
For ml-10m, there is no user information. We read the user id from the rating file.
The result is stored in ``self.user_info`` (a pd.DataFrame).
"""
if self._name == 'ml-100k':
self.user_info = pd.read_csv(os.path.join(self._dir, 'u.user'), sep='|', header=None,
names=['id', 'age', 'gender', 'occupation', 'zip_code'], engine='python')
elif self._name == 'ml-1m':
self.user_info = pd.read_csv(os.path.join(self._dir, 'users.dat'), sep='::', header=None,
names=['id', 'gender', 'age', 'occupation', 'zip_code'], engine='python')
elif self._name == 'ml-10m':
rating_info = pd.read_csv(
os.path.join(self._dir, 'ratings.dat'), sep='::', header=None,
names=['user_id', 'movie_id', 'rating', 'timestamp'],
dtype={'user_id': np.int32, 'movie_id': np.int32, 'rating': np.float32,
'timestamp': np.int64}, engine='python')
self.user_info = pd.DataFrame(np.unique(rating_info['user_id'].values.astype(np.int32)),
columns=['id'])
else:
raise NotImplementedError
def _process_user_fea(self):
"""
Process user features from ``self.user_info``.
For ml-100k and ml-1m, we take the age, the gender, and the one-hot encoding of the
occupation as the user features. For ml-10m, there is no user feature and we set the
feature to be a single zero.
Returns
-------
user_features : np.ndarray
"""
if self._name == 'ml-100k' or self._name == 'ml-1m':
ages = self.user_info['age'].values.astype(np.float32)
gender = (self.user_info['gender'] == 'F').values.astype(np.float32)
all_occupations = set(self.user_info['occupation'])
occupation_map = {ele: i for i, ele in enumerate(all_occupations)}
occupation_one_hot = np.zeros(shape=(self.user_info.shape[0], len(all_occupations)),
dtype=np.float32)
occupation_one_hot[np.arange(self.user_info.shape[0]),
np.array([occupation_map[ele] for ele in self.user_info['occupation']])] = 1
user_features = np.concatenate([ages.reshape((self.user_info.shape[0], 1)) / 50.0,
gender.reshape((self.user_info.shape[0], 1)),
occupation_one_hot], axis=1)
elif self._name == 'ml-10m':
user_features = np.zeros(shape=(self.user_info.shape[0], 1), dtype=np.float32)
else:
raise NotImplementedError
return user_features
def _load_raw_movie_info(self):
"""In MovieLens, the movie attributes may have the following formats:
In ml_100k:
movie id | movie title | release date | video release date | IMDb URL | [genres]
In ml_1m, ml_10m:
MovieID::Title (Release Year)::Genres
Also, Genres are separated by |, e.g., Adventure|Animation|Children|Comedy|Fantasy
The result is stored in ``self.movie_info`` (a pd.DataFrame).
For ml-100k, the columns are ['id', 'title', 'release_date', 'video_release_date', 'url']
plus the 19 genre indicator columns.
For ml-1m and ml-10m, the columns are ['id', 'title'] plus the genre indicator columns
(18 and 20 genres, respectively).
"""
if self._name == 'ml-100k':
GENRES = GENRES_ML_100K
elif self._name == 'ml-1m':
GENRES = GENRES_ML_1M
elif self._name == 'ml-10m':
GENRES = GENRES_ML_10M
else:
raise NotImplementedError
if self._name == 'ml-100k':
file_path = os.path.join(self._dir, 'u.item')
self.movie_info = pd.read_csv(file_path, sep='|', header=None,
names=['id', 'title', 'release_date', 'video_release_date', 'url'] + GENRES,
engine='python')
elif self._name == 'ml-1m' or self._name == 'ml-10m':
file_path = os.path.join(self._dir, 'movies.dat')
movie_info = pd.read_csv(file_path, sep='::', header=None,
names=['id', 'title', 'genres'], engine='python')
genre_map = {ele: i for i, ele in enumerate(GENRES)}
genre_map['Children\'s'] = genre_map['Children']
genre_map['Childrens'] = genre_map['Children']
movie_genres = np.zeros(shape=(movie_info.shape[0], len(GENRES)), dtype=np.float32)
for i, genres in enumerate(movie_info['genres']):
for ele in genres.split('|'):
if ele in genre_map:
movie_genres[i, genre_map[ele]] = 1.0
else:
print('genres not found, filled with unknown: {}'.format(genres))
movie_genres[i, genre_map['unknown']] = 1.0
for idx, genre_name in enumerate(GENRES):
assert idx == genre_map[genre_name]
movie_info[genre_name] = movie_genres[:, idx]
self.movie_info = movie_info.drop(columns=["genres"])
else:
raise NotImplementedError
def _process_movie_fea(self):
"""
Process movie features from ``self.movie_info``.
Returns
-------
movie_features : np.ndarray
Movie features built by concatenating the averaged GloVe title embedding,
the normalized release year, and the genre indicators.
"""
if self._name == 'ml-100k':
GENRES = GENRES_ML_100K
elif self._name == 'ml-1m':
GENRES = GENRES_ML_1M
elif self._name == 'ml-10m':
GENRES = GENRES_ML_10M
else:
raise NotImplementedError
word_embedding = nlp.embedding.GloVe('glove.840B.300d')
tokenizer = nlp.data.transforms.SpacyTokenizer()
title_embedding = np.zeros(shape=(self.movie_info.shape[0], 300), dtype=np.float32)
release_years = np.zeros(shape=(self.movie_info.shape[0], 1), dtype=np.float32)
p = re.compile(r'(.+)\s*\((\d+)\)')
for i, title in enumerate(self.movie_info['title']):
match_res = p.match(title)
if match_res is None:
print('{} cannot be matched, index={}, name={}'.format(title, i, self._name))
title_context, year = title, 1950
else:
title_context, year = match_res.groups()
# Use the average of the GloVe embeddings of the title tokens
title_embedding[i, :] = word_embedding[tokenizer(title_context)].asnumpy().mean(axis=0)
release_years[i] = float(year)
movie_features = np.concatenate((title_embedding,
(release_years - 1950.0) / 100.0,
self.movie_info[GENRES]),
axis=1)
return movie_features
if __name__ == '__main__':
MovieLens("ml-100k", ctx=mx.cpu(), symm=True)
"""NN modules"""
import math
import numpy as np
import mxnet as mx
import mxnet.ndarray as F
from mxnet.gluon import nn, Block
import dgl.function as fn
from utils import get_activation
class GCMCLayer(Block):
r"""GCMC layer
.. math::
z_i^{(l+1)} = \sigma_{agg}\left[\mathrm{agg}\left(
\sum_{j\in\mathcal{N}_{i,1}}\frac{1}{c_{ij}}W_1 h_j, \ldots,
\sum_{j\in\mathcal{N}_{i,R}}\frac{1}{c_{ij}}W_R h_j
\right)\right]
After that, apply an extra output projection:
.. math::
h_i^{(l+1)} = \sigma_{out}\left(W_o z_i^{(l+1)}\right)
The equation is applied to both user nodes and movie nodes and the parameters
are not shared unless ``share_user_item_param`` is true.
Parameters
----------
rating_vals : list of int or float
Possible rating values.
user_in_units : int
Size of user input feature
movie_in_units : int
Size of movie input feature
msg_units : int
Size of message :math:`W_rh_j`
out_units : int
Size of of final output user and movie features
dropout_rate : float, optional
Dropout rate (Default: 0.0)
agg : str, optional
Function to aggregate messages of different ratings.
Could be any of the supported cross type reducers:
"sum", "max", "min", "mean", "stack".
(Default: "stack")
agg_act : callable, str, optional
Activation function :math:`\sigma_{agg}`. (Default: None)
out_act : callable, str, optional
Activation function :math:`\sigma_{out}`. (Default: None)
share_user_item_param : bool, optional
If true, user nodes and movie nodes share the same set of parameters.
Requires ``user_in_units`` and ``movie_in_units`` to be the same.
(Default: False)
"""
def __init__(self,
rating_vals,
user_in_units,
movie_in_units,
msg_units,
out_units,
dropout_rate=0.0,
agg='stack', # or 'sum'
agg_act=None,
out_act=None,
share_user_item_param=False):
super(GCMCLayer, self).__init__()
self.rating_vals = rating_vals
self.agg = agg
self.share_user_item_param = share_user_item_param
if agg == 'stack':
# divide the original msg unit size by number of ratings to keep
# the dimensionality
assert msg_units % len(rating_vals) == 0
msg_units = msg_units // len(rating_vals)
with self.name_scope():
self.dropout = nn.Dropout(dropout_rate)
self.W_r = {}
for rating in rating_vals:
rating = str(rating)
if share_user_item_param and user_in_units == movie_in_units:
self.W_r[rating] = self.params.get(
'W_r_%s' % rating, shape=(user_in_units, msg_units),
dtype=np.float32, allow_deferred_init=True)
self.W_r['rev-%s' % rating] = self.W_r[rating]
else:
self.W_r[rating] = self.params.get(
'W_r_%s' % rating, shape=(user_in_units, msg_units),
dtype=np.float32, allow_deferred_init=True)
self.W_r['rev-%s' % rating] = self.params.get(
'revW_r_%s' % rating, shape=(movie_in_units, msg_units),
dtype=np.float32, allow_deferred_init=True)
self.ufc = nn.Dense(out_units)
if share_user_item_param:
self.ifc = self.ufc
else:
self.ifc = nn.Dense(out_units)
self.agg_act = get_activation(agg_act)
self.out_act = get_activation(out_act)
def forward(self, graph, ufeat=None, ifeat=None):
"""Forward function
The normalizer constant :math:`c_{ij}` is stored as two node data fields, "ci"
and "cj".
Parameters
----------
graph : DGLHeteroGraph
User-movie rating graph. It should contain two node types, "user"
and "movie", and one edge type for each rating value.
ufeat : mx.nd.NDArray, optional
User features. If None, an identity matrix is used.
ifeat : mx.nd.NDArray, optional
Movie features. If None, an identity matrix is used.
Returns
-------
new_ufeat : mx.nd.NDArray
New user features
new_ifeat : mx.nd.NDArray
New movie features
"""
num_u = graph.number_of_nodes('user')
num_i = graph.number_of_nodes('movie')
funcs = {}
for i, rating in enumerate(self.rating_vals):
rating = str(rating)
# W_r * x
x_u = dot_or_identity(ufeat, self.W_r[rating].data())
x_i = dot_or_identity(ifeat, self.W_r['rev-%s' % rating].data())
# left norm and dropout
x_u = x_u * self.dropout(graph.nodes['user'].data['cj'])
x_i = x_i * self.dropout(graph.nodes['movie'].data['cj'])
graph.nodes['user'].data['h%d' % i] = x_u
graph.nodes['movie'].data['h%d' % i] = x_i
funcs[rating] = (fn.copy_u('h%d' % i, 'm'), fn.sum('m', 'h'))
funcs['rev-%s' % rating] = (fn.copy_u('h%d' % i, 'm'), fn.sum('m', 'h'))
# message passing
graph.multi_update_all(funcs, self.agg)
ufeat = graph.nodes['user'].data.pop('h').reshape((num_u, -1))
ifeat = graph.nodes['movie'].data.pop('h').reshape((num_i, -1))
# right norm
ufeat = ufeat * graph.nodes['user'].data['ci']
ifeat = ifeat * graph.nodes['movie'].data['ci']
# fc and non-linear
ufeat = self.agg_act(ufeat)
ifeat = self.agg_act(ifeat)
ufeat = self.dropout(ufeat)
ifeat = self.dropout(ifeat)
ufeat = self.ufc(ufeat)
ifeat = self.ifc(ifeat)
return self.out_act(ufeat), self.out_act(ifeat)
class BiDecoder(Block):
r"""Bilinear decoder.
.. math::
p(M_{ij}=r) = \text{softmax}(u_i^TQ_rv_j)
The trainable parameter :math:`Q_r` is further decomposed to a linear
combination of basis weight matrices :math:`P_s`:
.. math::
Q_r = \sum_{s=1}^{b} a_{rs}P_s
Parameters
----------
rating_vals : list of int or float
Possible rating values.
in_units : int
Size of input user and movie features
num_basis_functions : int, optional
Number of basis weight matrices. (Default: 2)
dropout_rate : float, optional
Dropout rate (Default: 0.0)
"""
def __init__(self,
rating_vals,
in_units,
num_basis_functions=2,
dropout_rate=0.0):
super(BiDecoder, self).__init__()
self.rating_vals = rating_vals
self._num_basis_functions = num_basis_functions
self.dropout = nn.Dropout(dropout_rate)
self.Ps = []
with self.name_scope():
for i in range(num_basis_functions):
self.Ps.append(self.params.get(
'Ps_%d' % i, shape=(in_units, in_units),
#init=mx.initializer.Orthogonal(scale=1.1, rand_type='normal'),
init=mx.initializer.Xavier(magnitude=math.sqrt(2.0)),
allow_deferred_init=True))
self.rate_out = nn.Dense(units=len(rating_vals), flatten=False, use_bias=False)
def forward(self, graph, ufeat, ifeat):
"""Forward function.
Parameters
----------
graph : DGLHeteroGraph
"Flattened" user-movie graph with only one edge type.
ufeat : mx.nd.NDArray
User embeddings. Shape: (|V_u|, D)
ifeat : mx.nd.NDArray
Movie embeddings. Shape: (|V_m|, D)
Returns
-------
mx.nd.NDArray
Predicted scores (one logit per rating class) for each user-movie edge.
"""
graph = graph.local_var()
ufeat = self.dropout(ufeat)
ifeat = self.dropout(ifeat)
graph.nodes['movie'].data['h'] = ifeat
basis_out = []
for i in range(self._num_basis_functions):
graph.nodes['user'].data['h'] = F.dot(ufeat, self.Ps[i].data())
graph.apply_edges(fn.u_dot_v('h', 'h', 'sr'))
basis_out.append(graph.edata['sr'].expand_dims(1))
out = F.concat(*basis_out, dim=1)
out = self.rate_out(out)
return out
def dot_or_identity(A, B):
# if A is None, treat as identity matrix
if A is None:
return B
else:
return mx.nd.dot(A, B)
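A minimal sketch of how the two blocks above fit together (the `Net` block in `train.py` below does the same; the unit sizes here are illustrative ml-100k values):
```python
# Minimal sketch wiring GCMCLayer and BiDecoder, mirroring Net in train.py below.
import mxnet as mx
from model import GCMCLayer, BiDecoder

rating_vals = [1, 2, 3, 4, 5]
encoder = GCMCLayer(rating_vals,
                    user_in_units=943, movie_in_units=1682,  # illustrative ml-100k sizes
                    msg_units=500, out_units=75,
                    agg='stack', agg_act='leaky')
decoder = BiDecoder(rating_vals, in_units=75, num_basis_functions=2)
encoder.initialize(ctx=mx.cpu())
decoder.initialize(ctx=mx.cpu())
# Given a MovieLens dataset (see data.py), one forward pass would be:
#   ufeat, ifeat = encoder(dataset.train_enc_graph,
#                          dataset.user_feature, dataset.movie_feature)
#   logits = decoder(dataset.train_dec_graph, ufeat, ifeat)  # (#pairs, len(rating_vals))
```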
"""Training script"""
import os, time
import argparse
import logging
import random
import string
import numpy as np
import mxnet as mx
from mxnet import gluon
from data import MovieLens
from model import GCMCLayer, BiDecoder
from utils import get_activation, parse_ctx, gluon_net_info, gluon_total_param_num, \
params_clip_global_norm, MetricLogger
from mxnet.gluon import Block
class Net(Block):
def __init__(self, args, **kwargs):
super(Net, self).__init__(**kwargs)
self._act = get_activation(args.model_activation)
with self.name_scope():
self.encoder = GCMCLayer(args.rating_vals,
args.src_in_units,
args.dst_in_units,
args.gcn_agg_units,
args.gcn_out_units,
args.gcn_dropout,
args.gcn_agg_accum,
agg_act=self._act,
share_user_item_param=args.share_param)
self.decoder = BiDecoder(args.rating_vals,
in_units=args.gcn_out_units,
num_basis_functions=args.gen_r_num_basis_func)
def forward(self, enc_graph, dec_graph, ufeat, ifeat):
user_out, movie_out = self.encoder(
enc_graph,
ufeat,
ifeat)
pred_ratings = self.decoder(dec_graph, user_out, movie_out)
return pred_ratings
def evaluate(args, net, dataset, segment='valid'):
possible_rating_values = dataset.possible_rating_values
nd_possible_rating_values = mx.nd.array(possible_rating_values, ctx=args.ctx, dtype=np.float32)
if segment == "valid":
rating_values = dataset.valid_truths
enc_graph = dataset.valid_enc_graph
dec_graph = dataset.valid_dec_graph
elif segment == "test":
rating_values = dataset.test_truths
enc_graph = dataset.test_enc_graph
dec_graph = dataset.test_dec_graph
else:
raise NotImplementedError
# Evaluate RMSE
with mx.autograd.predict_mode():
pred_ratings = net(enc_graph, dec_graph,
dataset.user_feature, dataset.movie_feature)
real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) *
nd_possible_rating_values.reshape((1, -1))).sum(axis=1)
rmse = mx.nd.square(real_pred_ratings - rating_values).mean().asscalar()
rmse = np.sqrt(rmse)
return rmse
def train(args):
print(args)
dataset = MovieLens(args.data_name, args.ctx, use_one_hot_fea=args.use_one_hot_fea, symm=args.gcn_agg_norm_symm,
test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio)
print("Loading data finished ...\n")
args.src_in_units = dataset.user_feature_shape[1]
args.dst_in_units = dataset.movie_feature_shape[1]
args.rating_vals = dataset.possible_rating_values
### build the net
net = Net(args=args)
net.initialize(init=mx.init.Xavier(factor_type='in'), ctx=args.ctx)
net.hybridize()
nd_possible_rating_values = mx.nd.array(dataset.possible_rating_values, ctx=args.ctx, dtype=np.float32)
rating_loss_net = gluon.loss.SoftmaxCELoss()
rating_loss_net.hybridize()
trainer = gluon.Trainer(net.collect_params(), args.train_optimizer, {'learning_rate': args.train_lr})
print("Loading network finished ...\n")
### prepare training data
train_gt_labels = dataset.train_labels
train_gt_ratings = dataset.train_truths
### prepare the logger
train_loss_logger = MetricLogger(['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))
### declare the loss information
best_valid_rmse = np.inf
best_test_rmse = np.inf  # keep the final summary defined even if validation never improves
no_better_valid = 0
best_iter = -1
avg_gnorm = 0
count_rmse = 0
count_num = 0
count_loss = 0
print("Start training ...")
dur = []
for iter_idx in range(1, args.train_max_iter):
if iter_idx > 3:
t0 = time.time()
with mx.autograd.record():
pred_ratings = net(dataset.train_enc_graph, dataset.train_dec_graph,
dataset.user_feature, dataset.movie_feature)
loss = rating_loss_net(pred_ratings, train_gt_labels).mean()
loss.backward()
count_loss += loss.asscalar()
gnorm = params_clip_global_norm(net.collect_params(), args.train_grad_clip, args.ctx)
avg_gnorm += gnorm
trainer.step(1.0)
if iter_idx > 3:
dur.append(time.time() - t0)
if iter_idx == 1:
print("Total #Param of net: %d" % (gluon_total_param_num(net)))
print(gluon_net_info(net, save_path=os.path.join(args.save_dir, 'net%d.txt' % args.save_id)))
real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) *
nd_possible_rating_values.reshape((1, -1))).sum(axis=1)
rmse = mx.nd.square(real_pred_ratings - train_gt_ratings).sum()
count_rmse += rmse.asscalar()
count_num += pred_ratings.shape[0]
if iter_idx % args.train_log_interval == 0:
train_loss_logger.log(iter=iter_idx,
loss=count_loss/(iter_idx+1), rmse=count_rmse/count_num)
logging_str = "Iter={}, gnorm={:.3f}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
iter_idx, avg_gnorm/args.train_log_interval,
count_loss/iter_idx, count_rmse/count_num,
np.average(dur))
avg_gnorm = 0
count_rmse = 0
count_num = 0
if iter_idx % args.train_valid_interval == 0:
valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid')
valid_loss_logger.log(iter = iter_idx, rmse = valid_rmse)
logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)
if valid_rmse < best_valid_rmse:
best_valid_rmse = valid_rmse
no_better_valid = 0
best_iter = iter_idx
#net.save_parameters(filename=os.path.join(args.save_dir, 'best_valid_net{}.params'.format(args.save_id)))
test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
best_test_rmse = test_rmse
test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
else:
no_better_valid += 1
if no_better_valid > args.train_early_stopping_patience\
and trainer.learning_rate <= args.train_min_lr:
logging.info("Early stopping threshold reached. Stop training.")
break
if no_better_valid > args.train_decay_patience:
new_lr = max(trainer.learning_rate * args.train_lr_decay_factor, args.train_min_lr)
if new_lr < trainer.learning_rate:
logging.info("\tChange the LR to %g" % new_lr)
trainer.set_learning_rate(new_lr)
no_better_valid = 0
if iter_idx % args.train_log_interval == 0:
print(logging_str)
print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
best_iter, best_valid_rmse, best_test_rmse))
train_loss_logger.close()
valid_loss_logger.close()
test_loss_logger.close()
def config():
parser = argparse.ArgumentParser(description='Run the baseline method.')
parser.add_argument('--seed', default=123, type=int)
parser.add_argument('--ctx', dest='ctx', default='gpu0', type=str,
help='Running context, e.g., `--ctx gpu`, `--ctx gpu0,gpu1`, or `--ctx cpu`')
parser.add_argument('--save_dir', type=str, help='The saving directory')
parser.add_argument('--save_id', type=int, help='The saving log id')
parser.add_argument('--silent', action='store_true')
parser.add_argument('--data_name', default='ml-1m', type=str,
help='The dataset name: ml-100k, ml-1m, ml-10m')
parser.add_argument('--data_test_ratio', type=float, default=0.1) ## for ml-100k the test ratio is 0.2
parser.add_argument('--data_valid_ratio', type=float, default=0.1)
parser.add_argument('--use_one_hot_fea', action='store_true', default=False)
#parser.add_argument('--model_remove_rating', type=bool, default=False)
parser.add_argument('--model_activation', type=str, default="leaky")
parser.add_argument('--gcn_dropout', type=float, default=0.7)
parser.add_argument('--gcn_agg_norm_symm', type=bool, default=True)
parser.add_argument('--gcn_agg_units', type=int, default=500)
parser.add_argument('--gcn_agg_accum', type=str, default="sum")
parser.add_argument('--gcn_out_units', type=int, default=75)
parser.add_argument('--gen_r_num_basis_func', type=int, default=2)
# parser.add_argument('--train_rating_batch_size', type=int, default=10000)
parser.add_argument('--train_max_iter', type=int, default=2000)
parser.add_argument('--train_log_interval', type=int, default=1)
parser.add_argument('--train_valid_interval', type=int, default=1)
parser.add_argument('--train_optimizer', type=str, default="adam")
parser.add_argument('--train_grad_clip', type=float, default=1.0)
parser.add_argument('--train_lr', type=float, default=0.01)
parser.add_argument('--train_min_lr', type=float, default=0.001)
parser.add_argument('--train_lr_decay_factor', type=float, default=0.5)
parser.add_argument('--train_decay_patience', type=int, default=50)
parser.add_argument('--train_early_stopping_patience', type=int, default=100)
parser.add_argument('--share_param', default=False, action='store_true')
args = parser.parse_args()
args.ctx = parse_ctx(args.ctx)[0]
### configure save_dir to save all the info
if args.save_dir is None:
args.save_dir = args.data_name+"_" + ''.join(random.choices(string.ascii_uppercase + string.digits, k=2))
if args.save_id is None:
args.save_id = np.random.randint(20)
args.save_dir = os.path.join("log", args.save_dir)
if not os.path.isdir(args.save_dir):
os.makedirs(args.save_dir)
return args
if __name__ == '__main__':
args = config()
np.random.seed(args.seed)
mx.random.seed(args.seed, args.ctx)
train(args)
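Both `train` and `evaluate` above turn the per-class logits into a scalar rating by taking the softmax-weighted expectation over `possible_rating_values`. A small numeric sketch of that step:
```python
# Numeric sketch of the expected-rating computation used in train()/evaluate() above.
import numpy as np
import mxnet as mx

possible = mx.nd.array([1, 2, 3, 4, 5], dtype=np.float32)
logits = mx.nd.array([[0.1, 0.2, 0.3, 2.0, 1.0]])  # one user-movie pair, 5 classes
probs = mx.nd.softmax(logits, axis=1)
expected = (probs * possible.reshape((1, -1))).sum(axis=1)
print(expected.asscalar())  # ~3.7, a continuous rating between 1 and 5
```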
import ast
import os
import csv
import inspect
import logging
import re
import mxnet.ndarray as nd
from mxnet import gluon
from mxnet.gluon import nn
import mxnet as mx
import numpy as np
from collections import OrderedDict
class MetricLogger(object):
def __init__(self, attr_names, parse_formats, save_path):
self._attr_format_dict = OrderedDict(zip(attr_names, parse_formats))
self._file = open(save_path, 'w')
self._csv = csv.writer(self._file)
self._csv.writerow(attr_names)
self._file.flush()
def log(self, **kwargs):
self._csv.writerow([parse_format % kwargs[attr_name]
for attr_name, parse_format in self._attr_format_dict.items()])
self._file.flush()
def close(self):
self._file.close()
def parse_ctx(ctx_args):
ctx = re.findall(r'([a-z]+)(\d*)', ctx_args)
ctx = [(device, int(num)) if len(num) > 0 else (device, 0) for device, num in ctx]
ctx = [mx.Context(*ele) for ele in ctx]
return ctx
def gluon_total_param_num(net):
return sum([np.prod(v.shape) for v in net.collect_params().values()])
def gluon_net_info(net, save_path=None):
info_str = 'Total Param Number: {}\n'.format(gluon_total_param_num(net)) +\
'Params:\n'
for k, v in net.collect_params().items():
info_str += '\t{}: {}, {}\n'.format(k, v.shape, np.prod(v.shape))
info_str += str(net)
if save_path is not None:
with open(save_path, 'w') as f:
f.write(info_str)
return info_str
def params_clip_global_norm(param_dict, clip, ctx):
grads = [p.grad(ctx) for p in param_dict.values()]
gnorm = gluon.utils.clip_global_norm(grads, clip)
return gnorm
def get_activation(act):
"""Get the activation based on the act string
Parameters
----------
act: str or HybridBlock
Returns
-------
ret: HybridBlock
"""
if act is None:
return lambda x: x
if isinstance(act, str):
if act == 'leaky':
return nn.LeakyReLU(0.1)
elif act in ['relu', 'sigmoid', 'tanh', 'softrelu', 'softsign']:
return nn.Activation(act)
else:
raise NotImplementedError
else:
return act
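A quick usage sketch for the two helpers most used by the scripts above:
```python
# Quick sketch of get_activation and parse_ctx defined above.
import mxnet as mx
from utils import get_activation, parse_ctx

act = get_activation('leaky')            # nn.LeakyReLU(0.1)
print(act(mx.nd.array([-1.0, 2.0])))     # [-0.1  2. ]
print(parse_ctx('gpu0,cpu'))             # [gpu(0), cpu(0)]
```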