# Graph Convolutional Matrix Completion
Paper link: [https://arxiv.org/abs/1706.02263](https://arxiv.org/abs/1706.02263)
Author's code: [https://github.com/riannevdberg/gc-mc](https://github.com/riannevdberg/gc-mc)
This implementation does not handle side features or mini-batch training, and thus achieves
slightly worse performance when node features are used.
Credit: Jiani Zhang ([@jennyzhang0215](https://github.com/jennyzhang0215))
## Dependencies
* PyTorch 1.2+
* pandas
* torchtext 0.4+
## Data
Supported datasets: ml-100k, ml-1m, ml-10m
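The `MovieLens` class in `data.py` downloads and preprocesses these datasets, and can also be used on its own. A minimal sketch (run from this example directory; the train/valid split is random):
```python
import torch as th
from data import MovieLens

# Download and preprocess ml-100k; with one-hot features, the user/movie
# feature tensors stay None and are treated as identity matrices downstream.
dataset = MovieLens("ml-100k", device=th.device('cpu'), use_one_hot_fea=True)
print(dataset.num_user, dataset.num_movie, dataset.possible_rating_values)
```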
## How to run
ml-100k, no feature
```bash
python train.py --data_name=ml-100k --use_one_hot_fea --gcn_agg_accum=stack
```
Results: RMSE=0.9088 (0.910 reported)
Speed: 0.0195s/epoch (vanilla implementation: 0.1008s/epoch)
ml-100k, with feature
```bash
python train.py --data_name=ml-100k --gcn_agg_accum=stack
```
Results: RMSE=0.9448 (0.905 reported)
ml-1m, no feature
```bash
python train.py --data_name=ml-1m --gcn_agg_accum=sum --use_one_hot_fea
```
Results: RMSE=0.8377 (0.832 reported)
Speed: 0.0557s/epoch (vanilla implementation: 1.538s/epoch)
ml-10m, no feature
```bash
python train.py --data_name=ml-10m --gcn_agg_accum=stack --gcn_dropout=0.3 \
--train_lr=0.001 --train_min_lr=0.0001 --train_max_iter=15000 \
--use_one_hot_fea --gen_r_num_basis_func=4
```
Results: RMSE=0.7800 (0.777 reported)
Speed: 0.9207s/epoch (vanilla implementation: OOM)
Testbed: EC2 p3.2xlarge instance (Amazon Linux 2)
"""MovieLens dataset"""
import numpy as np
import os
import re
import pandas as pd
import scipy.sparse as sp
import torch as th
from torchtext import data
from torchtext.vocab import GloVe
import dgl
from dgl.data.utils import download, extract_archive, get_download_dir
_urls = {
'ml-100k' : 'http://files.grouplens.org/datasets/movielens/ml-100k.zip',
'ml-1m' : 'http://files.grouplens.org/datasets/movielens/ml-1m.zip',
'ml-10m' : 'http://files.grouplens.org/datasets/movielens/ml-10m.zip',
}
READ_DATASET_PATH = get_download_dir()
GENRES_ML_100K =\
['unknown', 'Action', 'Adventure', 'Animation',
'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
'Thriller', 'War', 'Western']
GENRES_ML_1M = GENRES_ML_100K[1:]
GENRES_ML_10M = GENRES_ML_100K + ['IMAX']
class MovieLens(object):
"""MovieLens dataset used by GCMC model
TODO(minjie): make this dataset more general
The dataset stores MovieLens ratings in two types of graphs. The encoder graph
contains rating value information in the form of edge types. The decoder graph
stores plain user-movie pairs in the form of a bipartite graph with no rating
information. All graphs have two types of nodes: "user" and "movie".
The training, validation and test set can be summarized as follows:
training_enc_graph : training user-movie pairs + rating info
training_dec_graph : training user-movie pairs
valid_enc_graph : training user-movie pairs + rating info
valid_dec_graph : validation user-movie pairs
test_enc_graph : training user-movie pairs + validation user-movie pairs + rating info
test_dec_graph : test user-movie pairs
Attributes
----------
train_enc_graph : dgl.DGLHeteroGraph
Encoder graph for training.
train_dec_graph : dgl.DGLHeteroGraph
Decoder graph for training.
train_labels : torch.Tensor
The categorical label of each user-movie pair
train_truths : torch.Tensor
The actual rating values of each user-movie pair
valid_enc_graph : dgl.DGLHeteroGraph
Encoder graph for validation.
valid_dec_graph : dgl.DGLHeteroGraph
Decoder graph for validation.
valid_labels : torch.Tensor
The categorical label of each user-movie pair
valid_truths : torch.Tensor
The actual rating values of each user-movie pair
test_enc_graph : dgl.DGLHeteroGraph
Encoder graph for test.
test_dec_graph : dgl.DGLHeteroGraph
Decoder graph for test.
test_labels : torch.Tensor
The categorical label of each user-movie pair
test_truths : torch.Tensor
The actual rating values of each user-movie pair
user_feature : torch.Tensor
User feature tensor. If None, an identity matrix is used instead.
movie_feature : torch.Tensor
Movie feature tensor. If None, an identity matrix is used instead.
possible_rating_values : np.ndarray
Available rating values in the dataset
Parameters
----------
name : str
Dataset name. Could be "ml-100k", "ml-1m", "ml-10m"
device : torch.device
Device context
use_one_hot_fea : bool, optional
If true, the ``user_feature`` attribute is None, representing a one-hot identity
matrix. (Default: False)
symm : bool, optional
If true, use the symmetric normalization constant. Otherwise, use the left
normalization constant. (Default: True)
test_ratio : float, optional
Ratio of test data
valid_ratio : float, optional
Ratio of validation data
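Examples
--------
A minimal sketch (the exact statistics depend on the random split):
>>> import torch as th
>>> dataset = MovieLens("ml-100k", device=th.device('cpu'), use_one_hot_fea=True)
>>> dataset.train_enc_graph.number_of_nodes('user')  # number of users
>>> dataset.train_labels.shape[0]  # number of training rating pairs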
"""
def __init__(self, name, device, use_one_hot_fea=False, symm=True,
test_ratio=0.1, valid_ratio=0.1):
self._name = name
self._device = device
self._symm = symm
self._test_ratio = test_ratio
self._valid_ratio = valid_ratio
# download and extract
download_dir = get_download_dir()
zip_file_path = '{}/{}.zip'.format(download_dir, name)
download(_urls[name], path=zip_file_path)
extract_archive(zip_file_path, '{}/{}'.format(download_dir, name))
if name == 'ml-10m':
root_folder = 'ml-10M100K'
else:
root_folder = name
self._dir = os.path.join(download_dir, name, root_folder)
print("Starting processing {} ...".format(self._name))
self._load_raw_user_info()
self._load_raw_movie_info()
print('......')
if self._name == 'ml-100k':
self.all_train_rating_info = self._load_raw_rates(os.path.join(self._dir, 'u1.base'), '\t')
self.test_rating_info = self._load_raw_rates(os.path.join(self._dir, 'u1.test'), '\t')
self.all_rating_info = pd.concat([self.all_train_rating_info, self.test_rating_info])
elif self._name == 'ml-1m' or self._name == 'ml-10m':
self.all_rating_info = self._load_raw_rates(os.path.join(self._dir, 'ratings.dat'), '::')
num_test = int(np.ceil(self.all_rating_info.shape[0] * self._test_ratio))
shuffled_idx = np.random.permutation(self.all_rating_info.shape[0])
self.test_rating_info = self.all_rating_info.iloc[shuffled_idx[: num_test]]
self.all_train_rating_info = self.all_rating_info.iloc[shuffled_idx[num_test: ]]
else:
raise NotImplementedError
print('......')
num_valid = int(np.ceil(self.all_train_rating_info.shape[0] * self._valid_ratio))
shuffled_idx = np.random.permutation(self.all_train_rating_info.shape[0])
self.valid_rating_info = self.all_train_rating_info.iloc[shuffled_idx[: num_valid]]
self.train_rating_info = self.all_train_rating_info.iloc[shuffled_idx[num_valid: ]]
self.possible_rating_values = np.unique(self.train_rating_info["rating"].values)
print("All rating pairs : {}".format(self.all_rating_info.shape[0]))
print("\tAll train rating pairs : {}".format(self.all_train_rating_info.shape[0]))
print("\t\tTrain rating pairs : {}".format(self.train_rating_info.shape[0]))
print("\t\tValid rating pairs : {}".format(self.valid_rating_info.shape[0]))
print("\tTest rating pairs : {}".format(self.test_rating_info.shape[0]))
self.user_info = self._drop_unseen_nodes(orign_info=self.user_info,
cmp_col_name="id",
reserved_ids_set=set(self.all_rating_info["user_id"].values),
label="user")
self.movie_info = self._drop_unseen_nodes(orign_info=self.movie_info,
cmp_col_name="id",
reserved_ids_set=set(self.all_rating_info["movie_id"].values),
label="movie")
# Map user/movie to the global id
self.global_user_id_map = {ele: i for i, ele in enumerate(self.user_info['id'])}
self.global_movie_id_map = {ele: i for i, ele in enumerate(self.movie_info['id'])}
print('Total user number = {}, movie number = {}'.format(len(self.global_user_id_map),
len(self.global_movie_id_map)))
self._num_user = len(self.global_user_id_map)
self._num_movie = len(self.global_movie_id_map)
### Generate features
if use_one_hot_fea:
self.user_feature = None
self.movie_feature = None
else:
self.user_feature = th.FloatTensor(self._process_user_fea()).to(device)
self.movie_feature = th.FloatTensor(self._process_movie_fea()).to(device)
if self.user_feature is None:
self.user_feature_shape = (self.num_user, self.num_user)
self.movie_feature_shape = (self.num_movie, self.num_movie)
else:
self.user_feature_shape = self.user_feature.shape
self.movie_feature_shape = self.movie_feature.shape
info_line = "Feature dim: "
info_line += "\nuser: {}".format(self.user_feature_shape)
info_line += "\nmovie: {}".format(self.movie_feature_shape)
print(info_line)
all_train_rating_pairs, all_train_rating_values = self._generate_pair_value(self.all_train_rating_info)
train_rating_pairs, train_rating_values = self._generate_pair_value(self.train_rating_info)
valid_rating_pairs, valid_rating_values = self._generate_pair_value(self.valid_rating_info)
test_rating_pairs, test_rating_values = self._generate_pair_value(self.test_rating_info)
def _make_labels(ratings):
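# np.searchsorted maps each raw rating value to its index in the sorted
# array of possible rating values, producing the categorical class labels
# used by the cross-entropy loss.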
labels = th.LongTensor(np.searchsorted(self.possible_rating_values, ratings)).to(device)
return labels
self.train_enc_graph = self._generate_enc_graph(train_rating_pairs, train_rating_values, add_support=True)
self.train_dec_graph = self._generate_dec_graph(train_rating_pairs)
self.train_labels = _make_labels(train_rating_values)
self.train_truths = th.FloatTensor(train_rating_values).to(device)
self.valid_enc_graph = self.train_enc_graph
self.valid_dec_graph = self._generate_dec_graph(valid_rating_pairs)
self.valid_labels = _make_labels(valid_rating_values)
self.valid_truths = th.FloatTensor(valid_rating_values).to(device)
self.test_enc_graph = self._generate_enc_graph(all_train_rating_pairs, all_train_rating_values, add_support=True)
self.test_dec_graph = self._generate_dec_graph(test_rating_pairs)
self.test_labels = _make_labels(test_rating_values)
self.test_truths = th.FloatTensor(test_rating_values).to(device)
def _npairs(graph):
rst = 0
for r in self.possible_rating_values:
rst += graph.number_of_edges(str(r))
return rst
print("Train enc graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
self.train_enc_graph.number_of_nodes('user'), self.train_enc_graph.number_of_nodes('movie'),
_npairs(self.train_enc_graph)))
print("Train dec graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
self.train_dec_graph.number_of_nodes('user'), self.train_dec_graph.number_of_nodes('movie'),
self.train_dec_graph.number_of_edges()))
print("Valid enc graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
self.valid_enc_graph.number_of_nodes('user'), self.valid_enc_graph.number_of_nodes('movie'),
_npairs(self.valid_enc_graph)))
print("Valid dec graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
self.valid_dec_graph.number_of_nodes('user'), self.valid_dec_graph.number_of_nodes('movie'),
self.valid_dec_graph.number_of_edges()))
print("Test enc graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
self.test_enc_graph.number_of_nodes('user'), self.test_enc_graph.number_of_nodes('movie'),
_npairs(self.test_enc_graph)))
print("Test dec graph: \t#user:{}\t#movie:{}\t#pairs:{}".format(
self.test_dec_graph.number_of_nodes('user'), self.test_dec_graph.number_of_nodes('movie'),
self.test_dec_graph.number_of_edges()))
def _generate_pair_value(self, rating_info):
rating_pairs = (np.array([self.global_user_id_map[ele] for ele in rating_info["user_id"]],
dtype=np.int64),
np.array([self.global_movie_id_map[ele] for ele in rating_info["movie_id"]],
dtype=np.int64))
rating_values = rating_info["rating"].values.astype(np.float32)
return rating_pairs, rating_values
def _generate_enc_graph(self, rating_pairs, rating_values, add_support=False):
user_movie_R = np.zeros((self._num_user, self._num_movie), dtype=np.float32)
user_movie_R[rating_pairs] = rating_values
movie_user_R = user_movie_R.transpose()
rating_graphs = []
rating_row, rating_col = rating_pairs
for rating in self.possible_rating_values:
ridx = np.where(rating_values == rating)
rrow = rating_row[ridx]
rcol = rating_col[ridx]
bg = dgl.bipartite((rrow, rcol), 'user', str(rating), 'movie',
card=(self._num_user, self._num_movie))
rev_bg = dgl.bipartite((rcol, rrow), 'movie', 'rev-%s' % str(rating), 'user',
card=(self._num_movie, self._num_user))
rating_graphs.append(bg)
rating_graphs.append(rev_bg)
graph = dgl.hetero_from_relations(rating_graphs)
# sanity check
assert len(rating_pairs[0]) == sum([graph.number_of_edges(et) for et in graph.etypes]) // 2
if add_support:
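# Compute the normalization constants c_ij: node degrees are summed over
# all rating types, and 1/sqrt(degree) is stored per node ("ci" for the
# receiving side and, if symmetric, "cj" for the sending side).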
def _calc_norm(x):
x = x.numpy().astype('float32')
x[x == 0.] = np.inf
x = th.FloatTensor(1. / np.sqrt(x))
return x.to(self._device).unsqueeze(1)
user_ci = []
user_cj = []
movie_ci = []
movie_cj = []
for r in self.possible_rating_values:
r = str(r)
user_ci.append(graph['rev-%s' % r].in_degrees())
movie_ci.append(graph[r].in_degrees())
if self._symm:
user_cj.append(graph[r].out_degrees())
movie_cj.append(graph['rev-%s' % r].out_degrees())
else:
user_cj.append(th.zeros((self.num_user,)))
movie_cj.append(th.zeros((self.num_movie,)))
user_ci = _calc_norm(sum(user_ci))
movie_ci = _calc_norm(sum(movie_ci))
if self._symm:
user_cj = _calc_norm(sum(user_cj))
movie_cj = _calc_norm(sum(movie_cj))
else:
user_cj = th.ones(self.num_user,).to(self._device)
movie_cj = th.ones(self.num_movie,).to(self._device)
graph.nodes['user'].data.update({'ci' : user_ci, 'cj' : user_cj})
graph.nodes['movie'].data.update({'ci' : movie_ci, 'cj' : movie_cj})
return graph
def _generate_dec_graph(self, rating_pairs):
ones = np.ones_like(rating_pairs[0])
user_movie_ratings_coo = sp.coo_matrix(
(ones, rating_pairs),
shape=(self.num_user, self.num_movie), dtype=np.float32)
return dgl.bipartite(user_movie_ratings_coo, 'user', 'rate', 'movie')
@property
def num_links(self):
return self.possible_rating_values.size
@property
def num_user(self):
return self._num_user
@property
def num_movie(self):
return self._num_movie
def _drop_unseen_nodes(self, orign_info, cmp_col_name, reserved_ids_set, label):
# print(" -----------------")
# print("{}: {}(reserved) v.s. {}(from info)".format(label, len(reserved_ids_set),
# len(set(orign_info[cmp_col_name].values))))
if reserved_ids_set != set(orign_info[cmp_col_name].values):
pd_rating_ids = pd.DataFrame(list(reserved_ids_set), columns=["id_graph"])
# print("\torign_info: ({}, {})".format(orign_info.shape[0], orign_info.shape[1]))
data_info = orign_info.merge(pd_rating_ids, left_on=cmp_col_name, right_on='id_graph', how='outer')
data_info = data_info.dropna(subset=[cmp_col_name, 'id_graph'])
data_info = data_info.drop(columns=["id_graph"])
data_info = data_info.reset_index(drop=True)
# print("\tAfter dropping, data shape: ({}, {})".format(data_info.shape[0], data_info.shape[1]))
return data_info
else:
orign_info = orign_info.reset_index(drop=True)
return orign_info
def _load_raw_rates(self, file_path, sep):
"""In MovieLens, the rates have the following format
ml-100k
user id \t movie id \t rating \t timestamp
ml-1m/10m
UserID::MovieID::Rating::Timestamp
timestamp is unix timestamp and can be converted by pd.to_datetime(X, unit='s')
Parameters
----------
file_path : str
Returns
-------
rating_info : pd.DataFrame
"""
rating_info = pd.read_csv(
file_path, sep=sep, header=None,
names=['user_id', 'movie_id', 'rating', 'timestamp'],
dtype={'user_id': np.int32, 'movie_id' : np.int32,
'rating': np.float32, 'timestamp': np.int64}, engine='python')
return rating_info
def _load_raw_user_info(self):
"""In MovieLens, the user attributes file have the following formats:
ml-100k:
user id | age | gender | occupation | zip code
ml-1m:
UserID::Gender::Age::Occupation::Zip-code
For ml-10m, there is no user information. We read the user id from the rating file.
The result is stored in ``self.user_info`` (a pd.DataFrame).
"""
if self._name == 'ml-100k':
self.user_info = pd.read_csv(os.path.join(self._dir, 'u.user'), sep='|', header=None,
names=['id', 'age', 'gender', 'occupation', 'zip_code'], engine='python')
elif self._name == 'ml-1m':
self.user_info = pd.read_csv(os.path.join(self._dir, 'users.dat'), sep='::', header=None,
names=['id', 'gender', 'age', 'occupation', 'zip_code'], engine='python')
elif self._name == 'ml-10m':
rating_info = pd.read_csv(
os.path.join(self._dir, 'ratings.dat'), sep='::', header=None,
names=['user_id', 'movie_id', 'rating', 'timestamp'],
dtype={'user_id': np.int32, 'movie_id': np.int32, 'rating': np.float32,
'timestamp': np.int64}, engine='python')
self.user_info = pd.DataFrame(np.unique(rating_info['user_id'].values.astype(np.int32)),
columns=['id'])
else:
raise NotImplementedError
def _process_user_fea(self):
"""
Parameters
----------
user_info : pd.DataFrame
name : str
For ml-100k and ml-1m, the column name is ['id', 'gender', 'age', 'occupation', 'zip_code'].
We take the age, gender, and the one-hot encoding of the occupation as the user features.
For ml-10m, there is no user feature and we set the feature to be a single zero.
Returns
-------
user_features : np.ndarray
"""
if self._name == 'ml-100k' or self._name == 'ml-1m':
ages = self.user_info['age'].values.astype(np.float32)
gender = (self.user_info['gender'] == 'F').values.astype(np.float32)
all_occupations = set(self.user_info['occupation'])
occupation_map = {ele: i for i, ele in enumerate(all_occupations)}
occupation_one_hot = np.zeros(shape=(self.user_info.shape[0], len(all_occupations)),
dtype=np.float32)
occupation_one_hot[np.arange(self.user_info.shape[0]),
np.array([occupation_map[ele] for ele in self.user_info['occupation']])] = 1
user_features = np.concatenate([ages.reshape((self.user_info.shape[0], 1)) / 50.0,
gender.reshape((self.user_info.shape[0], 1)),
occupation_one_hot], axis=1)
elif self._name == 'ml-10m':
user_features = np.zeros(shape=(self.user_info.shape[0], 1), dtype=np.float32)
else:
raise NotImplementedError
return user_features
def _load_raw_movie_info(self):
"""In MovieLens, the movie attributes may have the following formats:
In ml_100k:
movie id | movie title | release date | video release date | IMDb URL | [genres]
In ml_1m, ml_10m:
MovieID::Title (Release Year)::Genres
Also, Genres are separated by |, e.g., Adventure|Animation|Children|Comedy|Fantasy
The result is stored in ``self.movie_info`` (a pd.DataFrame).
For ml-100k, the columns are ['id', 'title', 'release_date', 'video_release_date', 'url'] + [GENRES (19)].
For ml-1m and ml-10m, the columns are ['id', 'title'] + [GENRES (18/20)].
"""
if self._name == 'ml-100k':
GENRES = GENRES_ML_100K
elif self._name == 'ml-1m':
GENRES = GENRES_ML_1M
elif self._name == 'ml-10m':
GENRES = GENRES_ML_10M
else:
raise NotImplementedError
if self._name == 'ml-100k':
file_path = os.path.join(self._dir, 'u.item')
self.movie_info = pd.read_csv(file_path, sep='|', header=None,
names=['id', 'title', 'release_date', 'video_release_date', 'url'] + GENRES,
engine='python')
elif self._name == 'ml-1m' or self._name == 'ml-10m':
file_path = os.path.join(self._dir, 'movies.dat')
movie_info = pd.read_csv(file_path, sep='::', header=None,
names=['id', 'title', 'genres'], engine='python')
genre_map = {ele: i for i, ele in enumerate(GENRES)}
genre_map['Children\'s'] = genre_map['Children']
genre_map['Childrens'] = genre_map['Children']
movie_genres = np.zeros(shape=(movie_info.shape[0], len(GENRES)), dtype=np.float32)
for i, genres in enumerate(movie_info['genres']):
for ele in genres.split('|'):
if ele in genre_map:
movie_genres[i, genre_map[ele]] = 1.0
else:
print('genres not found, filled with unknown: {}'.format(genres))
movie_genres[i, genre_map['unknown']] = 1.0
for idx, genre_name in enumerate(GENRES):
assert idx == genre_map[genre_name]
movie_info[genre_name] = movie_genres[:, idx]
self.movie_info = movie_info.drop(columns=["genres"])
else:
raise NotImplementedError
def _process_movie_fea(self):
"""
Parameters
----------
movie_info : pd.DataFrame
name : str
Returns
-------
movie_features : np.ndarray
Generate movie features by concatenating embedding and the year
"""
if self._name == 'ml-100k':
GENRES = GENRES_ML_100K
elif self._name == 'ml-1m':
GENRES = GENRES_ML_1M
elif self._name == 'ml-10m':
GENRES = GENRES_ML_10M
else:
raise NotImplementedError
TEXT = data.Field(tokenize='spacy')
embedding = GloVe(name='840B', dim=300)
title_embedding = np.zeros(shape=(self.movie_info.shape[0], 300), dtype=np.float32)
release_years = np.zeros(shape=(self.movie_info.shape[0], 1), dtype=np.float32)
p = re.compile(r'(.+)\s*\((\d+)\)')
for i, title in enumerate(self.movie_info['title']):
match_res = p.match(title)
if match_res is None:
print('{} cannot be matched, index={}, name={}'.format(title, i, self._name))
title_context, year = title, 1950
else:
title_context, year = match_res.groups()
# We use average of glove
title_embedding[i, :] = embedding.get_vecs_by_tokens(TEXT.tokenize(title_context)).numpy().mean(axis=0)
release_years[i] = float(year)
movie_features = np.concatenate((title_embedding,
(release_years - 1950.0) / 100.0,
self.movie_info[GENRES]),
axis=1)
return movie_features
if __name__ == '__main__':
MovieLens("ml-100k", device=th.device('cpu'), symm=True)
"""NN modules"""
import torch as th
import torch.nn as nn
import dgl.function as fn
from utils import get_activation
class GCMCLayer(nn.Module):
r"""GCMC layer
.. math::
z_i^{(l+1)} = \sigma_{agg}\left[\mathrm{agg}\left(
\sum_{j\in\mathcal{N}_1(i)}\frac{1}{c_{ij}}W_1h_j, \ldots,
\sum_{j\in\mathcal{N}_R(i)}\frac{1}{c_{ij}}W_Rh_j
\right)\right]
After that, an extra output projection is applied:
.. math::
h_i^{(l+1)} = \sigma_{out}W_oz_i^{(l+1)}
The equation is applied to both user nodes and movie nodes and the parameters
are not shared unless ``share_user_item_param`` is true.
Parameters
----------
rating_vals : list of int or float
Possible rating values.
user_in_units : int
Size of user input feature
movie_in_units : int
Size of movie input feature
msg_units : int
Size of message :math:`W_rh_j`
out_units : int
Size of the final output user and movie features
dropout_rate : float, optional
Dropout rate (Default: 0.0)
agg : str, optional
Function to aggregate messages of different ratings.
Could be any of the supported cross type reducers:
"sum", "max", "min", "mean", "stack".
(Default: "stack")
agg_act : callable, str, optional
Activation function :math:`\sigma_{agg}`. (Default: None)
out_act : callable, str, optional
Activation function :math:`\sigma_{out}`. (Default: None)
share_user_item_param : bool, optional
If true, user node and movie node share the same set of parameters.
Require ``user_in_units`` and ``movie_in_units`` to be the same.
(Default: False)
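Examples
--------
A minimal construction sketch (the sizes are illustrative, not the training
defaults; with ``agg='stack'``, ``msg_units`` must be divisible by the number
of rating values):
>>> layer = GCMCLayer(rating_vals=[1, 2, 3, 4, 5],
...                   user_in_units=943, movie_in_units=1682,
...                   msg_units=500, out_units=75, agg='stack')
>>> # new_ufeat, new_ifeat = layer(enc_graph, ufeat, ifeat)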
"""
def __init__(self,
rating_vals,
user_in_units,
movie_in_units,
msg_units,
out_units,
dropout_rate=0.0,
agg='stack', # or 'sum'
agg_act=None,
out_act=None,
share_user_item_param=False):
super(GCMCLayer, self).__init__()
self.rating_vals = rating_vals
self.agg = agg
self.share_user_item_param = share_user_item_param
self.ufc = nn.Linear(msg_units, out_units)
if share_user_item_param:
self.ifc = self.ufc
else:
self.ifc = nn.Linear(msg_units, out_units)
if agg == 'stack':
# divide the original msg unit size by number of ratings to keep
# the dimensionality
assert msg_units % len(rating_vals) == 0
msg_units = msg_units // len(rating_vals)
self.dropout = nn.Dropout(dropout_rate)
self.W_r = nn.ParameterDict()
for rating in rating_vals:
# PyTorch parameter name can't contain "."
rating = str(rating).replace('.', '_')
if share_user_item_param and user_in_units == movie_in_units:
self.W_r[rating] = nn.Parameter(th.randn(user_in_units, msg_units))
self.W_r['rev-%s' % rating] = self.W_r[rating]
else:
self.W_r[rating] = nn.Parameter(th.randn(user_in_units, msg_units))
self.W_r['rev-%s' % rating] = nn.Parameter(th.randn(movie_in_units, msg_units))
self.agg_act = get_activation(agg_act)
self.out_act = get_activation(out_act)
self.reset_parameters()
def reset_parameters(self):
for p in self.parameters():
if p.dim() > 1:
nn.init.xavier_uniform_(p)
def forward(self, graph, ufeat=None, ifeat=None):
"""Forward function
The normalization constant :math:`c_{ij}` is stored as two node data
fields, "ci" and "cj".
Parameters
----------
graph : DGLHeteroGraph
User-movie rating graph. It should contain two node types, "user"
and "movie", and one edge type per rating value.
ufeat : torch.Tensor, optional
User features. If None, an identity matrix is used.
ifeat : torch.Tensor, optional
Movie features. If None, an identity matrix is used.
Returns
-------
new_ufeat : torch.Tensor
New user features
new_ifeat : torch.Tensor
New movie features
"""
num_u = graph.number_of_nodes('user')
num_i = graph.number_of_nodes('movie')
funcs = {}
for i, rating in enumerate(self.rating_vals):
rating = str(rating)
# W_r * x
x_u = dot_or_identity(ufeat, self.W_r[rating.replace('.', '_')])
x_i = dot_or_identity(ifeat, self.W_r['rev-%s' % rating.replace('.', '_')])
# left norm and dropout
x_u = x_u * self.dropout(graph.nodes['user'].data['cj'])
x_i = x_i * self.dropout(graph.nodes['movie'].data['cj'])
graph.nodes['user'].data['h%d' % i] = x_u
graph.nodes['movie'].data['h%d' % i] = x_i
funcs[rating] = (fn.copy_u('h%d' % i, 'm'), fn.sum('m', 'h'))
funcs['rev-%s' % rating] = (fn.copy_u('h%d' % i, 'm'), fn.sum('m', 'h'))
# message passing
graph.multi_update_all(funcs, self.agg)
ufeat = graph.nodes['user'].data.pop('h').view(num_u, -1)
ifeat = graph.nodes['movie'].data.pop('h').view(num_i, -1)
# right norm
ufeat = ufeat * graph.nodes['user'].data['ci']
ifeat = ifeat * graph.nodes['movie'].data['ci']
# fc and non-linear
ufeat = self.agg_act(ufeat)
ifeat = self.agg_act(ifeat)
ufeat = self.dropout(ufeat)
ifeat = self.dropout(ifeat)
ufeat = self.ufc(ufeat)
ifeat = self.ifc(ifeat)
return self.out_act(ufeat), self.out_act(ifeat)
class BiDecoder(nn.Module):
r"""Bilinear decoder.
.. math::
p(M_{ij}=r) = \text{softmax}(u_i^TQ_rv_j)
The trainable parameter :math:`Q_r` is further decomposed to a linear
combination of basis weight matrices :math:`P_s`:
.. math::
Q_r = \sum_{s=1}^{b} a_{rs}P_s
Parameters
----------
rating_vals : list of int or float
Possible rating values.
in_units : int
Size of input user and movie features
num_basis_functions : int, optional
Number of basis weight matrices. (Default: 2)
dropout_rate : float, optional
Dropout rate (Default: 0.0)
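Examples
--------
A minimal construction sketch (``in_units`` must match the encoder output
size, e.g. ``gcn_out_units`` in the training script):
>>> decoder = BiDecoder(rating_vals=[1, 2, 3, 4, 5], in_units=75)
>>> # logits = decoder(dec_graph, ufeat, ifeat)  # shape: (#edges, 5)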
"""
def __init__(self,
rating_vals,
in_units,
num_basis_functions=2,
dropout_rate=0.0):
super(BiDecoder, self).__init__()
self.rating_vals = rating_vals
self._num_basis_functions = num_basis_functions
self.dropout = nn.Dropout(dropout_rate)
self.Ps = nn.ParameterList()
for i in range(num_basis_functions):
self.Ps.append(nn.Parameter(th.randn(in_units, in_units)))
self.rate_out = nn.Linear(self._num_basis_functions, len(rating_vals), bias=False)
self.reset_parameters()
def reset_parameters(self):
for p in self.parameters():
if p.dim() > 1:
nn.init.xavier_uniform_(p)
def forward(self, graph, ufeat, ifeat):
"""Forward function.
Parameters
----------
graph : DGLHeteroGraph
"Flattened" user-movie graph with only one edge type.
ufeat : th.Tensor
User embeddings. Shape: (|V_u|, D)
ifeat : th.Tensor
Movie embeddings. Shape: (|V_m|, D)
Returns
-------
th.Tensor
Predicted scores for each user-movie edge.
"""
graph = graph.local_var()
ufeat = self.dropout(ufeat)
ifeat = self.dropout(ifeat)
graph.nodes['movie'].data['h'] = ifeat
basis_out = []
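# For each basis matrix P_s, compute the per-edge bilinear score
# u_i^T P_s v_j with a u_dot_v edge function; rate_out then linearly
# combines the basis scores into per-rating logits (the a_{rs} weights).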
for i in range(self._num_basis_functions):
graph.nodes['user'].data['h'] = ufeat @ self.Ps[i]
graph.apply_edges(fn.u_dot_v('h', 'h', 'sr'))
basis_out.append(graph.edata['sr'].unsqueeze(1))
out = th.cat(basis_out, dim=1)
out = self.rate_out(out)
return out
def dot_or_identity(A, B):
# if A is None, treat as identity matrix
if A is None:
return B
else:
return A @ B
"""Training script"""
import os, time
import argparse
import logging
import random
import string
import numpy as np
import torch as th
import torch.nn as nn
from data import MovieLens
from model import GCMCLayer, BiDecoder
from utils import get_activation, get_optimizer, torch_total_param_num, torch_net_info, MetricLogger
class Net(nn.Module):
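"""GCMC network: a GCMCLayer encoder followed by a BiDecoder that
predicts a distribution over rating values for each candidate edge."""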
def __init__(self, args):
super(Net, self).__init__()
self._act = get_activation(args.model_activation)
self.encoder = GCMCLayer(args.rating_vals,
args.src_in_units,
args.dst_in_units,
args.gcn_agg_units,
args.gcn_out_units,
args.gcn_dropout,
args.gcn_agg_accum,
agg_act=self._act,
share_user_item_param=args.share_param)
self.decoder = BiDecoder(args.rating_vals,
in_units=args.gcn_out_units,
num_basis_functions=args.gen_r_num_basis_func)
def forward(self, enc_graph, dec_graph, ufeat, ifeat):
user_out, movie_out = self.encoder(
enc_graph,
ufeat,
ifeat)
pred_ratings = self.decoder(dec_graph, user_out, movie_out)
return pred_ratings
def evaluate(args, net, dataset, segment='valid'):
possible_rating_values = dataset.possible_rating_values
nd_possible_rating_values = th.FloatTensor(possible_rating_values).to(args.device)
if segment == "valid":
rating_values = dataset.valid_truths
enc_graph = dataset.valid_enc_graph
dec_graph = dataset.valid_dec_graph
elif segment == "test":
rating_values = dataset.test_truths
enc_graph = dataset.test_enc_graph
dec_graph = dataset.test_dec_graph
else:
raise NotImplementedError
# Evaluate RMSE
net.eval()
with th.no_grad():
pred_ratings = net(enc_graph, dec_graph,
dataset.user_feature, dataset.movie_feature)
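# Convert the predicted class distribution into a scalar rating by taking
# the expectation: sum_r P(rating=r) * r over all possible rating values.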
real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
nd_possible_rating_values.view(1, -1)).sum(dim=1)
rmse = ((real_pred_ratings - rating_values) ** 2.).mean().item()
rmse = np.sqrt(rmse)
return rmse
def train(args):
print(args)
dataset = MovieLens(args.data_name, args.device, use_one_hot_fea=args.use_one_hot_fea, symm=args.gcn_agg_norm_symm,
test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio)
print("Loading data finished ...\n")
args.src_in_units = dataset.user_feature_shape[1]
args.dst_in_units = dataset.movie_feature_shape[1]
args.rating_vals = dataset.possible_rating_values
### build the net
net = Net(args=args)
net = net.to(args.device)
nd_possible_rating_values = th.FloatTensor(dataset.possible_rating_values).to(args.device)
rating_loss_net = nn.CrossEntropyLoss()
learning_rate = args.train_lr
optimizer = get_optimizer(args.train_optimizer)(net.parameters(), lr=learning_rate)
print("Loading network finished ...\n")
### prepare training data
train_gt_labels = dataset.train_labels
train_gt_ratings = dataset.train_truths
### prepare the logger
train_loss_logger = MetricLogger(['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))
### declare the loss information
best_valid_rmse = np.inf
best_test_rmse = np.inf
best_iter = -1
no_better_valid = 0
count_rmse = 0
count_num = 0
count_loss = 0
print("Start training ...")
dur = []
for iter_idx in range(1, args.train_max_iter):
if iter_idx > 3:
t0 = time.time()
net.train()
pred_ratings = net(dataset.train_enc_graph, dataset.train_dec_graph,
dataset.user_feature, dataset.movie_feature)
loss = rating_loss_net(pred_ratings, train_gt_labels).mean()
count_loss += loss.item()
optimizer.zero_grad()
loss.backward()
nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
optimizer.step()
if iter_idx > 3:
dur.append(time.time() - t0)
if iter_idx == 1:
print("Total #Param of net: %d" % (torch_total_param_num(net)))
print(torch_net_info(net, save_path=os.path.join(args.save_dir, 'net%d.txt' % args.save_id)))
real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
nd_possible_rating_values.view(1, -1)).sum(dim=1)
rmse = ((real_pred_ratings - train_gt_ratings) ** 2).sum()
count_rmse += rmse.item()
count_num += pred_ratings.shape[0]
if iter_idx % args.train_log_interval == 0:
train_loss_logger.log(iter=iter_idx,
loss=count_loss/iter_idx, rmse=count_rmse/count_num)
logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
iter_idx, count_loss/iter_idx, count_rmse/count_num,
np.average(dur))
count_rmse = 0
count_num = 0
if iter_idx % args.train_valid_interval == 0:
valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid')
valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)
if valid_rmse < best_valid_rmse:
best_valid_rmse = valid_rmse
no_better_valid = 0
best_iter = iter_idx
test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
best_test_rmse = test_rmse
test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
else:
no_better_valid += 1
if no_better_valid > args.train_early_stopping_patience\
and learning_rate <= args.train_min_lr:
logging.info("Early stopping threshold reached. Stop training.")
break
if no_better_valid > args.train_decay_patience:
new_lr = max(learning_rate * args.train_lr_decay_factor, args.train_min_lr)
if new_lr < learning_rate:
learning_rate = new_lr
logging.info("\tChange the LR to %g" % new_lr)
for p in optimizer.param_groups:
p['lr'] = learning_rate
no_better_valid = 0
if iter_idx % args.train_log_interval == 0:
print(logging_str)
print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
best_iter, best_valid_rmse, best_test_rmse))
train_loss_logger.close()
valid_loss_logger.close()
test_loss_logger.close()
def config():
parser = argparse.ArgumentParser(description='GCMC')
parser.add_argument('--seed', default=123, type=int)
parser.add_argument('--device', default=0, type=int,
help='Running device. E.g., `--device 0`; set `--device -1` to use the CPU.')
parser.add_argument('--save_dir', type=str, help='The saving directory')
parser.add_argument('--save_id', type=int, help='The saving log id')
parser.add_argument('--silent', action='store_true')
parser.add_argument('--data_name', default='ml-1m', type=str,
help='The dataset name: ml-100k, ml-1m, ml-10m')
parser.add_argument('--data_test_ratio', type=float, default=0.1) ## for ml-100k the test ratio is 0.2
parser.add_argument('--data_valid_ratio', type=float, default=0.1)
parser.add_argument('--use_one_hot_fea', action='store_true', default=False)
parser.add_argument('--model_activation', type=str, default="leaky")
parser.add_argument('--gcn_dropout', type=float, default=0.7)
# note: plain `type=bool` would treat any non-empty string (even "False") as True
parser.add_argument('--gcn_agg_norm_symm', type=lambda x: x.lower() == 'true', default=True)
parser.add_argument('--gcn_agg_units', type=int, default=500)
parser.add_argument('--gcn_agg_accum', type=str, default="sum")
parser.add_argument('--gcn_out_units', type=int, default=75)
parser.add_argument('--gen_r_num_basis_func', type=int, default=2)
parser.add_argument('--train_max_iter', type=int, default=2000)
parser.add_argument('--train_log_interval', type=int, default=1)
parser.add_argument('--train_valid_interval', type=int, default=1)
parser.add_argument('--train_optimizer', type=str, default="adam")
parser.add_argument('--train_grad_clip', type=float, default=1.0)
parser.add_argument('--train_lr', type=float, default=0.01)
parser.add_argument('--train_min_lr', type=float, default=0.001)
parser.add_argument('--train_lr_decay_factor', type=float, default=0.5)
parser.add_argument('--train_decay_patience', type=int, default=50)
parser.add_argument('--train_early_stopping_patience', type=int, default=100)
parser.add_argument('--share_param', default=False, action='store_true')
args = parser.parse_args()
args.device = th.device(args.device) if args.device >= 0 else th.device('cpu')
### configure save_dir to save all the info
if args.save_dir is None:
args.save_dir = args.data_name+"_" + ''.join(random.choices(string.ascii_uppercase + string.digits, k=2))
if args.save_id is None:
args.save_id = np.random.randint(20)
args.save_dir = os.path.join("log", args.save_dir)
if not os.path.isdir(args.save_dir):
os.makedirs(args.save_dir)
return args
if __name__ == '__main__':
args = config()
np.random.seed(args.seed)
th.manual_seed(args.seed)
if th.cuda.is_available():
th.cuda.manual_seed_all(args.seed)
train(args)
import csv
import re
import torch as th
import numpy as np
import torch.nn as nn
import torch.optim as optim
from collections import OrderedDict
class MetricLogger(object):
def __init__(self, attr_names, parse_formats, save_path):
self._attr_format_dict = OrderedDict(zip(attr_names, parse_formats))
self._file = open(save_path, 'w')
self._csv = csv.writer(self._file)
self._csv.writerow(attr_names)
self._file.flush()
def log(self, **kwargs):
self._csv.writerow([parse_format % kwargs[attr_name]
for attr_name, parse_format in self._attr_format_dict.items()])
self._file.flush()
def close(self):
self._file.close()
def torch_total_param_num(net):
return sum([np.prod(p.shape) for p in net.parameters()])
def torch_net_info(net, save_path=None):
info_str = 'Total Param Number: {}\n'.format(torch_total_param_num(net)) +\
'Params:\n'
for k, v in net.named_parameters():
info_str += '\t{}: {}, {}\n'.format(k, v.shape, np.prod(v.shape))
info_str += str(net)
if save_path is not None:
with open(save_path, 'w') as f:
f.write(info_str)
return info_str
def get_activation(act):
"""Get the activation based on the act string
Parameters
----------
act: str or callable function
Returns
-------
ret: callable function
"""
if act is None:
return lambda x: x
if isinstance(act, str):
if act == 'leaky':
return nn.LeakyReLU(0.1)
elif act == 'relu':
return nn.ReLU()
elif act == 'tanh':
return nn.Tanh()
elif act == 'sigmoid':
return nn.Sigmoid()
elif act == 'softsign':
return nn.Softsign()
else:
raise NotImplementedError
else:
return act
def get_optimizer(opt):
if opt == 'sgd':
return optim.SGD
elif opt == 'adam':
return optim.Adam
else:
raise NotImplementedError