Unverified Commit 9737b270 authored by Minjie Wang, committed by GitHub

[Model][Hetero] GCMC using new hetero APIs (#860)

* init

* rm data

* README

* fix rating values that are not decimal

* rm stale codes

* small fix

* upd

* rewrite decoder

* fix many

* many fix; performance matched

* upd; handle sparse input

* upd

* address comments

* more docstring; download data automatically

* shared param mode
parent e3921d5d
# Graph Convolutional Matrix Completion
Paper link: [https://arxiv.org/abs/1706.02263](https://arxiv.org/abs/1706.02263)
Author's code: [https://github.com/riannevdberg/gc-mc](https://github.com/riannevdberg/gc-mc)
The implementation does not handle side-channel features or mini-epoching, and thus achieves
slightly worse performance when using node features.
Credit: Jiani Zhang ([@jennyzhang0215](https://github.com/jennyzhang0215))
## Dependencies
* MXNet 1.5.0+
* pandas
* gluonnlp
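These can typically be installed with pip; the exact version pins below are an assumption, and DGL itself (with the MXNet backend) is assumed to be installed from this repository:
```bash
pip install "mxnet>=1.5.0" pandas gluonnlp
```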
## Data
Supported datasets: ml-100k, ml-1m, ml-10m
## How to run
ml-100k, no feature
```bash
DGLBACKEND=mxnet python train.py --data_name=ml-100k --use_one_hot_fea --gcn_agg_accum=stack
```
Results: RMSE=0.9077 (0.910 reported)
Speed: 0.0246s/epoch (vanilla implementation: 0.1008s/epoch)
ml-100k, with feature
```bash
DGLBACKEND=mxnet python train.py --data_name=ml-100k --gcn_agg_accum=stack
```
Results: RMSE=0.9495 (0.905 reported)
ml-1m, no feature
```bash
DGLBACKEND=mxnet python train.py --data_name=ml-1m --gcn_agg_accum=sum --use_one_hot_fea
```
Results: RMSE=0.8377 (0.832 reported)
Speed: 0.0695s/epoch (vanilla implementation: 1.538s/epoch)
ml-10m, no feature
```bash
DGLBACKEND=mxnet python train.py --data_name=ml-10m --gcn_agg_accum=stack --gcn_dropout=0.3 \
--train_lr=0.001 --train_min_lr=0.0001 --train_max_iter=15000 \
--use_one_hot_fea --gen_r_num_basis_func=4
```
Results: RMSE=0.7875 (0.777 reported)
Speed: 0.6480s/epoch (vanilla implementation: OOM)
Testbed: EC2 p3.2xlarge
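Both `evaluate` and the training loop convert the decoder's softmax over discrete rating levels into an expected rating before computing RMSE. A minimal NumPy sketch of that conversion (variable names here are illustrative, not part of the scripts):
```python
import numpy as np

def expected_rating(logits, rating_vals):
    # softmax over rating levels, numerically stabilized
    probs = np.exp(logits - logits.max(axis=1, keepdims=True))
    probs /= probs.sum(axis=1, keepdims=True)
    # E[r] = sum_r r * p(r), one expected rating per edge
    return probs @ np.asarray(rating_vals, dtype=np.float64)

logits = np.array([[0.1, 0.3, 2.0, 0.5, 0.2]])   # one edge, 5 rating levels
print(expected_rating(logits, [1, 2, 3, 4, 5]))  # ~3.04, dominated by level 3
```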
"""NN modules"""
import math
import numpy as np
import mxnet as mx
import mxnet.ndarray as F
from mxnet.gluon import nn, Block
import dgl.function as fn
from utils import get_activation
class GCMCLayer(Block):
r"""GCMC layer
.. math::
z_i^{(l+1)} = \sigma_{agg}\left[\mathrm{agg}\left(
\sum_{j\in\mathcal{N}_1(i)}\frac{1}{c_{ij}}W_1h_j, \ldots,
\sum_{j\in\mathcal{N}_R(i)}\frac{1}{c_{ij}}W_Rh_j
\right)\right]
After that, an extra output projection is applied:
.. math::
h_i^{(l+1)} = \sigma_{out}W_oz_i^{(l+1)}
The equation is applied to both user nodes and movie nodes and the parameters
are not shared unless ``share_user_item_param`` is true.
Parameters
----------
rating_vals : list of int or float
Possible rating values.
user_in_units : int
Size of user input feature
movie_in_units : int
Size of movie input feature
msg_units : int
Size of message :math:`W_rh_j`
out_units : int
Size of the final output user and movie features
dropout_rate : float, optional
Dropout rate (Default: 0.0)
agg : str, optional
Function to aggregate messages of different ratings.
Could be any of the supported cross type reducers:
"sum", "max", "min", "mean", "stack".
(Default: "stack")
agg_act : callable, str, optional
Activation function :math:`\sigma_{agg}`. (Default: None)
out_act : callable, str, optional
Activation function :math:`\sigma_{out}`. (Default: None)
share_user_item_param : bool, optional
If true, user nodes and movie nodes share the same set of parameters.
Requires ``user_in_units`` and ``movie_in_units`` to be the same.
(Default: False)
"""
def __init__(self,
rating_vals,
user_in_units,
movie_in_units,
msg_units,
out_units,
dropout_rate=0.0,
agg='stack', # or 'sum'
agg_act=None,
out_act=None,
share_user_item_param=False):
super(GCMCLayer, self).__init__()
self.rating_vals = rating_vals
self.agg = agg
self.share_user_item_param = share_user_item_param
if agg == 'stack':
# divide the original msg unit size by number of ratings to keep
# the dimensionality
assert msg_units % len(rating_vals) == 0
msg_units = msg_units // len(rating_vals)
with self.name_scope():
self.dropout = nn.Dropout(dropout_rate)
self.W_r = {}
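# One message weight per rating value, plus a 'rev-<rating>' weight for the
# movie->user direction. In shared-parameter mode the forward and reverse
# directions reuse the same matrix, which is why the user and movie input
# sizes must match.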
for rating in rating_vals:
rating = str(rating)
if share_user_item_param and user_in_units == movie_in_units:
self.W_r[rating] = self.params.get(
'W_r_%s' % rating, shape=(user_in_units, msg_units),
dtype=np.float32, allow_deferred_init=True)
self.W_r['rev-%s' % rating] = self.W_r[rating]
else:
self.W_r[rating] = self.params.get(
'W_r_%s' % rating, shape=(user_in_units, msg_units),
dtype=np.float32, allow_deferred_init=True)
self.W_r['rev-%s' % rating] = self.params.get(
'revW_r_%s' % rating, shape=(movie_in_units, msg_units),
dtype=np.float32, allow_deferred_init=True)
self.ufc = nn.Dense(out_units)
if share_user_item_param:
self.ifc = self.ufc
else:
self.ifc = nn.Dense(out_units)
self.agg_act = get_activation(agg_act)
self.out_act = get_activation(out_act)
def forward(self, graph, ufeat=None, ifeat=None):
"""Forward function
The normalizer constant :math:`c_{ij}` is stored as two node data fields,
"ci" and "cj".
Parameters
----------
graph : DGLHeteroGraph
User-movie rating graph. It should contain two node types, "user"
and "movie", and one edge type per rating value (plus its reverse).
ufeat : mx.nd.NDArray, optional
User features. If None, an identity matrix is used.
ifeat : mx.nd.NDArray, optional
Movie features. If None, an identity matrix is used.
Returns
-------
new_ufeat : mx.nd.NDArray
New user features
new_ifeat : mx.nd.NDArray
New movie features
"""
num_u = graph.number_of_nodes('user')
num_i = graph.number_of_nodes('movie')
funcs = {}
for i, rating in enumerate(self.rating_vals):
rating = str(rating)
# W_r * x
x_u = dot_or_identity(ufeat, self.W_r[rating].data())
x_i = dot_or_identity(ifeat, self.W_r['rev-%s' % rating].data())
# left norm and dropout
x_u = x_u * self.dropout(graph.nodes['user'].data['cj'])
x_i = x_i * self.dropout(graph.nodes['movie'].data['cj'])
graph.nodes['user'].data['h%d' % i] = x_u
graph.nodes['movie'].data['h%d' % i] = x_i
funcs[rating] = (fn.copy_u('h%d' % i, 'm'), fn.sum('m', 'h'))
funcs['rev-%s' % rating] = (fn.copy_u('h%d' % i, 'm'), fn.sum('m', 'h'))
# message passing
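# multi_update_all runs one message pass per rating edge type and then merges
# the per-rating results with the cross-type reducer `self.agg`; with 'stack'
# the results are stacked along a new axis, hence the reshape to (N, -1) below.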
graph.multi_update_all(funcs, self.agg)
ufeat = graph.nodes['user'].data.pop('h').reshape((num_u, -1))
ifeat = graph.nodes['movie'].data.pop('h').reshape((num_i, -1))
# right norm
ufeat = ufeat * graph.nodes['user'].data['ci']
ifeat = ifeat * graph.nodes['movie'].data['ci']
# fc and non-linear
ufeat = self.agg_act(ufeat)
ifeat = self.agg_act(ifeat)
ufeat = self.dropout(ufeat)
ifeat = self.dropout(ifeat)
ufeat = self.ufc(ufeat)
ifeat = self.ifc(ifeat)
return self.out_act(ufeat), self.out_act(ifeat)
class BiDecoder(Block):
r"""Bilinear decoder.
.. math::
p(M_{ij}=r) = \text{softmax}(u_i^TQ_rv_j)
The trainable parameter :math:`Q_r` is further decomposed into a linear
combination of basis weight matrices :math:`P_s`:
.. math::
Q_r = \sum_{s=1}^{b} a_{rs}P_s
Parameters
----------
rating_vals : list of int or float
Possible rating values.
in_units : int
Size of input user and movie features
num_basis_functions : int, optional
Number of basis weight matrices. (Default: 2)
dropout_rate : float, optional
Dropout rate (Default: 0.0)
"""
def __init__(self,
rating_vals,
in_units,
num_basis_functions=2,
dropout_rate=0.0):
super(BiDecoder, self).__init__()
self.rating_vals = rating_vals
self._num_basis_functions = num_basis_functions
self.dropout = nn.Dropout(dropout_rate)
self.Ps = []
with self.name_scope():
for i in range(num_basis_functions):
self.Ps.append(self.params.get(
'Ps_%d' % i, shape=(in_units, in_units),
#init=mx.initializer.Orthogonal(scale=1.1, rand_type='normal'),
init=mx.initializer.Xavier(magnitude=math.sqrt(2.0)),
allow_deferred_init=True))
self.rate_out = nn.Dense(units=len(rating_vals), flatten=False, use_bias=False)
def forward(self, graph, ufeat, ifeat):
"""Forward function.
Parameters
----------
graph : DGLHeteroGraph
"Flattened" user-movie graph with only one edge type.
ufeat : mx.nd.NDArray
User embeddings. Shape: (|V_u|, D)
ifeat : mx.nd.NDArray
Movie embeddings. Shape: (|V_m|, D)
Returns
-------
mx.nd.NDArray
Predicted scores for each user-movie edge.
"""
graph = graph.local_var()
ufeat = self.dropout(ufeat)
ifeat = self.dropout(ifeat)
graph.nodes['movie'].data['h'] = ifeat
basis_out = []
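# Basis decomposition: each P_s yields a per-edge score u_i^T P_s v_j, and the
# bias-free Dense layer `rate_out` mixes them, so the logit for rating r is
# sum_s a_rs (u_i^T P_s v_j) = u_i^T Q_r v_j with Q_r = sum_s a_rs P_s.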
for i in range(self._num_basis_functions):
graph.nodes['user'].data['h'] = F.dot(ufeat, self.Ps[i].data())
graph.apply_edges(fn.u_dot_v('h', 'h', 'sr'))
basis_out.append(graph.edata['sr'].expand_dims(1))
out = F.concat(*basis_out, dim=1)
out = self.rate_out(out)
return out
def dot_or_identity(A, B):
# if A is None, treat as identity matrix
if A is None:
return B
else:
return mx.nd.dot(A, B)
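# With one-hot input features the caller passes A=None, so the rows of B (a
# W_r weight) act directly as per-node embeddings and the dense (N x N)
# identity multiplication is skipped entirely.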
"""Training script"""
import os, time
import argparse
import logging
import random
import string
import numpy as np
import mxnet as mx
from mxnet import gluon
from data import MovieLens
from model import GCMCLayer, BiDecoder
from utils import get_activation, parse_ctx, gluon_net_info, gluon_total_param_num, \
params_clip_global_norm, MetricLogger
from mxnet.gluon import Block
class Net(Block):
def __init__(self, args, **kwargs):
super(Net, self).__init__(**kwargs)
self._act = get_activation(args.model_activation)
with self.name_scope():
self.encoder = GCMCLayer(args.rating_vals,
args.src_in_units,
args.dst_in_units,
args.gcn_agg_units,
args.gcn_out_units,
args.gcn_dropout,
args.gcn_agg_accum,
agg_act=self._act,
share_user_item_param=args.share_param)
self.decoder = BiDecoder(args.rating_vals,
in_units=args.gcn_out_units,
num_basis_functions=args.gen_r_num_basis_func)
def forward(self, enc_graph, dec_graph, ufeat, ifeat):
user_out, movie_out = self.encoder(
enc_graph,
ufeat,
ifeat)
pred_ratings = self.decoder(dec_graph, user_out, movie_out)
return pred_ratings
def evaluate(args, net, dataset, segment='valid'):
possible_rating_values = dataset.possible_rating_values
nd_possible_rating_values = mx.nd.array(possible_rating_values, ctx=args.ctx, dtype=np.float32)
if segment == "valid":
rating_values = dataset.valid_truths
enc_graph = dataset.valid_enc_graph
dec_graph = dataset.valid_dec_graph
elif segment == "test":
rating_values = dataset.test_truths
enc_graph = dataset.test_enc_graph
dec_graph = dataset.test_dec_graph
else:
raise NotImplementedError
# Evaluate RMSE
with mx.autograd.predict_mode():
pred_ratings = net(enc_graph, dec_graph,
dataset.user_feature, dataset.movie_feature)
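# Expected rating per edge: E[r] = sum_r r * p(r), where p is the softmax over
# the decoder's rating logits; RMSE is then computed on this expectation.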
real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) *
nd_possible_rating_values.reshape((1, -1))).sum(axis=1)
rmse = mx.nd.square(real_pred_ratings - rating_values).mean().asscalar()
rmse = np.sqrt(rmse)
return rmse
def train(args):
print(args)
dataset = MovieLens(args.data_name, args.ctx, use_one_hot_fea=args.use_one_hot_fea, symm=args.gcn_agg_norm_symm,
test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio)
print("Loading data finished ...\n")
args.src_in_units = dataset.user_feature_shape[1]
args.dst_in_units = dataset.movie_feature_shape[1]
args.rating_vals = dataset.possible_rating_values
### build the net
net = Net(args=args)
net.initialize(init=mx.init.Xavier(factor_type='in'), ctx=args.ctx)
net.hybridize()
nd_possible_rating_values = mx.nd.array(dataset.possible_rating_values, ctx=args.ctx, dtype=np.float32)
rating_loss_net = gluon.loss.SoftmaxCELoss()
rating_loss_net.hybridize()
trainer = gluon.Trainer(net.collect_params(), args.train_optimizer, {'learning_rate': args.train_lr})
print("Loading network finished ...\n")
### prepare training data
train_gt_labels = dataset.train_labels
train_gt_ratings = dataset.train_truths
### prepare the logger
train_loss_logger = MetricLogger(['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
os.path.join(args.save_dir, 'valid_loss%d.csv' % args.save_id))
test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
os.path.join(args.save_dir, 'test_loss%d.csv' % args.save_id))
### declare the loss information
best_valid_rmse = np.inf
no_better_valid = 0
best_iter = -1
avg_gnorm = 0
count_rmse = 0
count_num = 0
count_loss = 0
print("Start training ...")
dur = []
for iter_idx in range(1, args.train_max_iter):
if iter_idx > 3:
t0 = time.time()
with mx.autograd.record():
pred_ratings = net(dataset.train_enc_graph, dataset.train_dec_graph,
dataset.user_feature, dataset.movie_feature)
loss = rating_loss_net(pred_ratings, train_gt_labels).mean()
loss.backward()
count_loss += loss.asscalar()
gnorm = params_clip_global_norm(net.collect_params(), args.train_grad_clip, args.ctx)
avg_gnorm += gnorm
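# The loss is already averaged over edges and gradients were clipped above,
# so the trainer steps with an effective batch size of 1.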
trainer.step(1.0)
if iter_idx > 3:
dur.append(time.time() - t0)
if iter_idx == 1:
print("Total #Param of net: %d" % (gluon_total_param_num(net)))
print(gluon_net_info(net, save_path=os.path.join(args.save_dir, 'net%d.txt' % args.save_id)))
real_pred_ratings = (mx.nd.softmax(pred_ratings, axis=1) *
nd_possible_rating_values.reshape((1, -1))).sum(axis=1)
rmse = mx.nd.square(real_pred_ratings - train_gt_ratings).sum()
count_rmse += rmse.asscalar()
count_num += pred_ratings.shape[0]
if iter_idx % args.train_log_interval == 0:
train_loss_logger.log(iter=iter_idx,
loss=count_loss/iter_idx, rmse=count_rmse/count_num)
logging_str = "Iter={}, gnorm={:.3f}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
iter_idx, avg_gnorm/args.train_log_interval,
count_loss/iter_idx, count_rmse/count_num,
np.average(dur))
avg_gnorm = 0
count_rmse = 0
count_num = 0
if iter_idx % args.train_valid_interval == 0:
valid_rmse = evaluate(args=args, net=net, dataset=dataset, segment='valid')
valid_loss_logger.log(iter = iter_idx, rmse = valid_rmse)
logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)
if valid_rmse < best_valid_rmse:
best_valid_rmse = valid_rmse
no_better_valid = 0
best_iter = iter_idx
#net.save_parameters(filename=os.path.join(args.save_dir, 'best_valid_net{}.params'.format(args.save_id)))
test_rmse = evaluate(args=args, net=net, dataset=dataset, segment='test')
best_test_rmse = test_rmse
test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
else:
no_better_valid += 1
if no_better_valid > args.train_early_stopping_patience\
and trainer.learning_rate <= args.train_min_lr:
logging.info("Early stopping threshold reached. Stop training.")
break
if no_better_valid > args.train_decay_patience:
new_lr = max(trainer.learning_rate * args.train_lr_decay_factor, args.train_min_lr)
if new_lr < trainer.learning_rate:
logging.info("\tChange the LR to %g" % new_lr)
trainer.set_learning_rate(new_lr)
no_better_valid = 0
if iter_idx % args.train_log_interval == 0:
print(logging_str)
print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.format(
best_iter, best_valid_rmse, best_test_rmse))
train_loss_logger.close()
valid_loss_logger.close()
test_loss_logger.close()
def config():
parser = argparse.ArgumentParser(description='Run the baseline method.')
parser.add_argument('--seed', default=123, type=int)
parser.add_argument('--ctx', dest='ctx', default='gpu0', type=str,
help='Running Context. E.g `--ctx gpu` or `--ctx gpu0,gpu1` or `--ctx cpu`')
parser.add_argument('--save_dir', type=str, help='The saving directory')
parser.add_argument('--save_id', type=int, help='The saving log id')
parser.add_argument('--silent', action='store_true')
parser.add_argument('--data_name', default='ml-1m', type=str,
help='The dataset name: ml-100k, ml-1m, ml-10m')
parser.add_argument('--data_test_ratio', type=float, default=0.1) ## for ml-100k the test ratio is 0.2
parser.add_argument('--data_valid_ratio', type=float, default=0.1)
parser.add_argument('--use_one_hot_fea', action='store_true', default=False)
#parser.add_argument('--model_remove_rating', type=bool, default=False)
parser.add_argument('--model_activation', type=str, default="leaky")
parser.add_argument('--gcn_dropout', type=float, default=0.7)
parser.add_argument('--gcn_agg_norm_symm', type=bool, default=True)
parser.add_argument('--gcn_agg_units', type=int, default=500)
parser.add_argument('--gcn_agg_accum', type=str, default="sum")
parser.add_argument('--gcn_out_units', type=int, default=75)
parser.add_argument('--gen_r_num_basis_func', type=int, default=2)
# parser.add_argument('--train_rating_batch_size', type=int, default=10000)
parser.add_argument('--train_max_iter', type=int, default=2000)
parser.add_argument('--train_log_interval', type=int, default=1)
parser.add_argument('--train_valid_interval', type=int, default=1)
parser.add_argument('--train_optimizer', type=str, default="adam")
parser.add_argument('--train_grad_clip', type=float, default=1.0)
parser.add_argument('--train_lr', type=float, default=0.01)
parser.add_argument('--train_min_lr', type=float, default=0.001)
parser.add_argument('--train_lr_decay_factor', type=float, default=0.5)
parser.add_argument('--train_decay_patience', type=int, default=50)
parser.add_argument('--train_early_stopping_patience', type=int, default=100)
parser.add_argument('--share_param', default=False, action='store_true')
args = parser.parse_args()
args.ctx = parse_ctx(args.ctx)[0]
### configure save_dir to save all the info
if args.save_dir is None:
args.save_dir = args.data_name+"_" + ''.join(random.choices(string.ascii_uppercase + string.digits, k=2))
if args.save_id is None:
args.save_id = np.random.randint(20)
args.save_dir = os.path.join("log", args.save_dir)
if not os.path.isdir(args.save_dir):
os.makedirs(args.save_dir)
return args
if __name__ == '__main__':
args = config()
np.random.seed(args.seed)
mx.random.seed(args.seed, args.ctx)
train(args)
import ast
import os
import csv
import inspect
import logging
import re
import mxnet.ndarray as nd
from mxnet import gluon
from mxnet.gluon import nn
import mxnet as mx
import numpy as np
from collections import OrderedDict
class MetricLogger(object):
def __init__(self, attr_names, parse_formats, save_path):
self._attr_format_dict = OrderedDict(zip(attr_names, parse_formats))
self._file = open(save_path, 'w')
self._csv = csv.writer(self._file)
self._csv.writerow(attr_names)
self._file.flush()
def log(self, **kwargs):
self._csv.writerow([parse_format % kwargs[attr_name]
for attr_name, parse_format in self._attr_format_dict.items()])
self._file.flush()
def close(self):
self._file.close()
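# Usage sketch (illustrative values):
#   logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'], 'valid.csv')
#   logger.log(iter=100, rmse=0.95)   # appends the row "100,0.9500" and flushes
#   logger.close()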
def parse_ctx(ctx_args):
ctx = re.findall(r'([a-z]+)(\d*)', ctx_args)
ctx = [(device, int(num)) if len(num) > 0 else (device, 0) for device, num in ctx]
ctx = [mx.Context(*ele) for ele in ctx]
return ctx
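# Example: parse_ctx('gpu0,gpu1') returns [mx.Context('gpu', 0),
# mx.Context('gpu', 1)], and parse_ctx('cpu') returns [mx.Context('cpu', 0)].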
def gluon_total_param_num(net):
return sum([np.prod(v.shape) for v in net.collect_params().values()])
def gluon_net_info(net, save_path=None):
info_str = 'Total Param Number: {}\n'.format(gluon_total_param_num(net)) +\
'Params:\n'
for k, v in net.collect_params().items():
info_str += '\t{}: {}, {}\n'.format(k, v.shape, np.prod(v.shape))
info_str += str(net)
if save_path is not None:
with open(save_path, 'w') as f:
f.write(info_str)
return info_str
def params_clip_global_norm(param_dict, clip, ctx):
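# clip_global_norm rescales all gradients in place so that their global L2 norm
# is at most `clip`, and returns the global norm before clipping (for logging).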
grads = [p.grad(ctx) for p in param_dict.values()]
gnorm = gluon.utils.clip_global_norm(grads, clip)
return gnorm
def get_activation(act):
"""Get the activation based on the act string
Parameters
----------
act: str or HybridBlock
Returns
-------
ret: HybridBlock
"""
if act is None:
return lambda x: x
if isinstance(act, str):
if act == 'leaky':
return nn.LeakyReLU(0.1)
elif act in ['relu', 'sigmoid', 'tanh', 'softrelu', 'softsign']:
return nn.Activation(act)
else:
raise NotImplementedError
else:
return act
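# Example: get_activation('leaky') returns nn.LeakyReLU(0.1), while
# get_activation(None) returns an identity function.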