"""MovieLens dataset""" import numpy as np import os import re import pandas as pd import scipy.sparse as sp import gluonnlp as nlp import mxnet as mx import dgl from dgl.data.utils import download, extract_archive, get_download_dir _urls = { 'ml-100k' : 'http://files.grouplens.org/datasets/movielens/ml-100k.zip', 'ml-1m' : 'http://files.grouplens.org/datasets/movielens/ml-1m.zip', 'ml-10m' : 'http://files.grouplens.org/datasets/movielens/ml-10m.zip', } READ_DATASET_PATH = get_download_dir() GENRES_ML_100K =\ ['unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'] GENRES_ML_1M = GENRES_ML_100K[1:] GENRES_ML_10M = GENRES_ML_100K + ['IMAX'] class MovieLens(object): """MovieLens dataset used by GCMC model TODO(minjie): make this dataset more general The dataset stores MovieLens ratings in two types of graphs. The encoder graph contains rating value information in the form of edge types. The decoder graph stores plain user-movie pairs in the form of a bipartite graph with no rating information. All graphs have two types of nodes: "user" and "movie". The training, validation and test set can be summarized as follows: training_enc_graph : training user-movie pairs + rating info training_dec_graph : training user-movie pairs valid_enc_graph : training user-movie pairs + rating info valid_dec_graph : validation user-movie pairs test_enc_graph : training user-movie pairs + validation user-movie pairs + rating info test_dec_graph : test user-movie pairs Attributes ---------- train_enc_graph : dgl.DGLHeteroGraph Encoder graph for training. train_dec_graph : dgl.DGLHeteroGraph Decoder graph for training. train_labels : mx.nd.NDArray The categorical label of each user-movie pair train_truths : mx.nd.NDArray The actual rating values of each user-movie pair valid_enc_graph : dgl.DGLHeteroGraph Encoder graph for validation. valid_dec_graph : dgl.DGLHeteroGraph Decoder graph for validation. valid_labels : mx.nd.NDArray The categorical label of each user-movie pair valid_truths : mx.nd.NDArray The actual rating values of each user-movie pair test_enc_graph : dgl.DGLHeteroGraph Encoder graph for test. test_dec_graph : dgl.DGLHeteroGraph Decoder graph for test. test_labels : mx.nd.NDArray The categorical label of each user-movie pair test_truths : mx.nd.NDArray The actual rating values of each user-movie pair user_feature : mx.nd.NDArray User feature tensor. If None, representing an identity matrix. movie_feature : mx.nd.NDArray Movie feature tensor. If None, representing an identity matrix. possible_rating_values : np.ndarray Available rating values in the dataset Parameters ---------- name : str Dataset name. Could be "ml-100k", "ml-1m", "ml-10m" ctx : mx.context.Context Device context use_one_hot_fea : bool, optional If true, the ``user_feature`` attribute is None, representing an one-hot identity matrix. (Default: False) symm : bool, optional If true, the use symmetric normalize constant. Otherwise, use left normalize constant. (Default: True) test_ratio : float, optional Ratio of test data valid_ratio : float, optional Ratio of validation data """ def __init__(self, name, ctx, use_one_hot_fea=False, symm=True, test_ratio=0.1, valid_ratio=0.1): self._name = name self._ctx = ctx self._symm = symm self._test_ratio = test_ratio self._valid_ratio = valid_ratio # download and extract download_dir = get_download_dir() zip_file_path = '{}/{}.zip'.format(download_dir, name) download(_urls[name], path=zip_file_path) extract_archive(zip_file_path, '{}/{}'.format(download_dir, name)) if name == 'ml-10m': root_folder = 'ml-10M100K' else: root_folder = name self._dir = os.path.join(download_dir, name, root_folder) print("Starting processing {} ...".format(self._name)) self._load_raw_user_info() self._load_raw_movie_info() print('......') if self._name == 'ml-100k': self.all_train_rating_info = self._load_raw_rates(os.path.join(self._dir, 'u1.base'), '\t') self.test_rating_info = self._load_raw_rates(os.path.join(self._dir, 'u1.test'), '\t') self.all_rating_info = pd.concat([self.all_train_rating_info, self.test_rating_info]) elif self._name == 'ml-1m' or self._name == 'ml-10m': self.all_rating_info = self._load_raw_rates(os.path.join(self._dir, 'ratings.dat'), '::') num_test = int(np.ceil(self.all_rating_info.shape[0] * self._test_ratio)) shuffled_idx = np.random.permutation(self.all_rating_info.shape[0]) self.test_rating_info = self.all_rating_info.iloc[shuffled_idx[: num_test]] self.all_train_rating_info = self.all_rating_info.iloc[shuffled_idx[num_test: ]] else: raise NotImplementedError print('......') num_valid = int(np.ceil(self.all_train_rating_info.shape[0] * self._valid_ratio)) shuffled_idx = np.random.permutation(self.all_train_rating_info.shape[0]) self.valid_rating_info = self.all_train_rating_info.iloc[shuffled_idx[: num_valid]] self.train_rating_info = self.all_train_rating_info.iloc[shuffled_idx[num_valid: ]] self.possible_rating_values = np.unique(self.train_rating_info["rating"].values) print("All rating pairs : {}".format(self.all_rating_info.shape[0])) print("\tAll train rating pairs : {}".format(self.all_train_rating_info.shape[0])) print("\t\tTrain rating pairs : {}".format(self.train_rating_info.shape[0])) print("\t\tValid rating pairs : {}".format(self.valid_rating_info.shape[0])) print("\tTest rating pairs : {}".format(self.test_rating_info.shape[0])) self.user_info = self._drop_unseen_nodes(orign_info=self.user_info, cmp_col_name="id", reserved_ids_set=set(self.all_rating_info["user_id"].values), label="user") self.movie_info = self._drop_unseen_nodes(orign_info=self.movie_info, cmp_col_name="id", reserved_ids_set=set(self.all_rating_info["movie_id"].values), label="movie") # Map user/movie to the global id self.global_user_id_map = {ele: i for i, ele in enumerate(self.user_info['id'])} self.global_movie_id_map = {ele: i for i, ele in enumerate(self.movie_info['id'])} print('Total user number = {}, movie number = {}'.format(len(self.global_user_id_map), len(self.global_movie_id_map))) self._num_user = len(self.global_user_id_map) self._num_movie = len(self.global_movie_id_map) ### Generate features if use_one_hot_fea: self.user_feature = None self.movie_feature = None else: self.user_feature = mx.nd.array(self._process_user_fea(), ctx=ctx, dtype=np.float32) self.movie_feature = mx.nd.array(self._process_movie_fea(), ctx=ctx, dtype=np.float32) if self.user_feature is None: self.user_feature_shape = (self.num_user, self.num_user) self.movie_feature_shape = (self.num_movie, self.num_movie) else: self.user_feature_shape = self.user_feature.shape self.movie_feature_shape = self.movie_feature.shape info_line = "Feature dim: " info_line += "\nuser: {}".format(self.user_feature_shape) info_line += "\nmovie: {}".format(self.movie_feature_shape) print(info_line) all_train_rating_pairs, all_train_rating_values = self._generate_pair_value(self.all_train_rating_info) train_rating_pairs, train_rating_values = self._generate_pair_value(self.train_rating_info) valid_rating_pairs, valid_rating_values = self._generate_pair_value(self.valid_rating_info) test_rating_pairs, test_rating_values = self._generate_pair_value(self.test_rating_info) def _make_labels(ratings): labels = mx.nd.array(np.searchsorted(self.possible_rating_values, ratings), ctx=ctx, dtype=np.int32) return labels self.train_enc_graph = self._generate_enc_graph(train_rating_pairs, train_rating_values, add_support=True) self.train_dec_graph = self._generate_dec_graph(train_rating_pairs) self.train_labels = _make_labels(train_rating_values) self.train_truths = mx.nd.array(train_rating_values, ctx=ctx, dtype=np.float32) self.valid_enc_graph = self.train_enc_graph self.valid_dec_graph = self._generate_dec_graph(valid_rating_pairs) self.valid_labels = _make_labels(valid_rating_values) self.valid_truths = mx.nd.array(valid_rating_values, ctx=ctx, dtype=np.float32) self.test_enc_graph = self._generate_enc_graph(all_train_rating_pairs, all_train_rating_values, add_support=True) self.test_dec_graph = self._generate_dec_graph(test_rating_pairs) self.test_labels = _make_labels(test_rating_values) self.test_truths = mx.nd.array(test_rating_values, ctx=ctx, dtype=np.float32) def _npairs(graph): rst = 0 for r in self.possible_rating_values: rst += graph.number_of_edges(str(r)) return rst print("Train enc graph: \t#user:{}\t#movie:{}\t#pairs:{}".format( self.train_enc_graph.number_of_nodes('user'), self.train_enc_graph.number_of_nodes('movie'), _npairs(self.train_enc_graph))) print("Train dec graph: \t#user:{}\t#movie:{}\t#pairs:{}".format( self.train_dec_graph.number_of_nodes('user'), self.train_dec_graph.number_of_nodes('movie'), self.train_dec_graph.number_of_edges())) print("Valid enc graph: \t#user:{}\t#movie:{}\t#pairs:{}".format( self.valid_enc_graph.number_of_nodes('user'), self.valid_enc_graph.number_of_nodes('movie'), _npairs(self.valid_enc_graph))) print("Valid dec graph: \t#user:{}\t#movie:{}\t#pairs:{}".format( self.valid_dec_graph.number_of_nodes('user'), self.valid_dec_graph.number_of_nodes('movie'), self.valid_dec_graph.number_of_edges())) print("Test enc graph: \t#user:{}\t#movie:{}\t#pairs:{}".format( self.test_enc_graph.number_of_nodes('user'), self.test_enc_graph.number_of_nodes('movie'), _npairs(self.test_enc_graph))) print("Test dec graph: \t#user:{}\t#movie:{}\t#pairs:{}".format( self.test_dec_graph.number_of_nodes('user'), self.test_dec_graph.number_of_nodes('movie'), self.test_dec_graph.number_of_edges())) def _generate_pair_value(self, rating_info): rating_pairs = (np.array([self.global_user_id_map[ele] for ele in rating_info["user_id"]], dtype=np.int64), np.array([self.global_movie_id_map[ele] for ele in rating_info["movie_id"]], dtype=np.int64)) rating_values = rating_info["rating"].values.astype(np.float32) return rating_pairs, rating_values def _generate_enc_graph(self, rating_pairs, rating_values, add_support=False): user_movie_R = np.zeros((self._num_user, self._num_movie), dtype=np.float32) user_movie_R[rating_pairs] = rating_values movie_user_R = user_movie_R.transpose() rating_graphs = [] rating_row, rating_col = rating_pairs for rating in self.possible_rating_values: ridx = np.where(rating_values == rating) rrow = rating_row[ridx] rcol = rating_col[ridx] bg = dgl.bipartite((rrow, rcol), 'user', str(rating), 'movie', num_nodes=(self._num_user, self._num_movie)) rev_bg = dgl.bipartite((rcol, rrow), 'movie', 'rev-%s' % str(rating), 'user', num_nodes=(self._num_movie, self._num_user)) rating_graphs.append(bg) rating_graphs.append(rev_bg) graph = dgl.hetero_from_relations(rating_graphs) # sanity check assert len(rating_pairs[0]) == sum([graph.number_of_edges(et) for et in graph.etypes]) // 2 if add_support: def _calc_norm(x): x = x.asnumpy().astype('float32') x[x == 0.] = np.inf x = mx.nd.array(1. / np.sqrt(x)) return x.expand_dims(1) user_ci = [] user_cj = [] movie_ci = [] movie_cj = [] for r in self.possible_rating_values: r = str(r) user_ci.append(graph['rev-%s' % r].in_degrees()) movie_ci.append(graph[r].in_degrees()) if self._symm: user_cj.append(graph[r].out_degrees()) movie_cj.append(graph['rev-%s' % r].out_degrees()) else: user_cj.append(mx.nd.zeros((self.num_user,))) movie_cj.append(mx.nd.zeros((self.num_movie,))) user_ci = _calc_norm(mx.nd.add_n(*user_ci)) movie_ci = _calc_norm(mx.nd.add_n(*movie_ci)) if self._symm: user_cj = _calc_norm(mx.nd.add_n(*user_cj)) movie_cj = _calc_norm(mx.nd.add_n(*movie_cj)) else: user_cj = mx.nd.ones((self.num_user,)) movie_cj = mx.nd.ones((self.num_movie,)) graph.nodes['user'].data.update({'ci' : user_ci, 'cj' : user_cj}) graph.nodes['movie'].data.update({'ci' : movie_ci, 'cj' : movie_cj}) return graph def _generate_dec_graph(self, rating_pairs): ones = np.ones_like(rating_pairs[0]) user_movie_ratings_coo = sp.coo_matrix( (ones, rating_pairs), shape=(self.num_user, self.num_movie), dtype=np.float32) return dgl.bipartite(user_movie_ratings_coo, 'user', 'rate', 'movie') @property def num_links(self): return self.possible_rating_values.size @property def num_user(self): return self._num_user @property def num_movie(self): return self._num_movie def _drop_unseen_nodes(self, orign_info, cmp_col_name, reserved_ids_set, label): # print(" -----------------") # print("{}: {}(reserved) v.s. {}(from info)".format(label, len(reserved_ids_set), # len(set(orign_info[cmp_col_name].values)))) if reserved_ids_set != set(orign_info[cmp_col_name].values): pd_rating_ids = pd.DataFrame(list(reserved_ids_set), columns=["id_graph"]) # print("\torign_info: ({}, {})".format(orign_info.shape[0], orign_info.shape[1])) data_info = orign_info.merge(pd_rating_ids, left_on=cmp_col_name, right_on='id_graph', how='outer') data_info = data_info.dropna(subset=[cmp_col_name, 'id_graph']) data_info = data_info.drop(columns=["id_graph"]) data_info = data_info.reset_index(drop=True) # print("\tAfter dropping, data shape: ({}, {})".format(data_info.shape[0], data_info.shape[1])) return data_info else: orign_info = orign_info.reset_index(drop=True) return orign_info def _load_raw_rates(self, file_path, sep): """In MovieLens, the rates have the following format ml-100k user id \t movie id \t rating \t timestamp ml-1m/10m UserID::MovieID::Rating::Timestamp timestamp is unix timestamp and can be converted by pd.to_datetime(X, unit='s') Parameters ---------- file_path : str Returns ------- rating_info : pd.DataFrame """ rating_info = pd.read_csv( file_path, sep=sep, header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'], dtype={'user_id': np.int32, 'movie_id' : np.int32, 'ratings': np.float32, 'timestamp': np.int64}, engine='python') return rating_info def _load_raw_user_info(self): """In MovieLens, the user attributes file have the following formats: ml-100k: user id | age | gender | occupation | zip code ml-1m: UserID::Gender::Age::Occupation::Zip-code For ml-10m, there is no user information. We read the user id from the rating file. Parameters ---------- name : str Returns ------- user_info : pd.DataFrame """ if self._name == 'ml-100k': self.user_info = pd.read_csv(os.path.join(self._dir, 'u.user'), sep='|', header=None, names=['id', 'age', 'gender', 'occupation', 'zip_code'], engine='python') elif self._name == 'ml-1m': self.user_info = pd.read_csv(os.path.join(self._dir, 'users.dat'), sep='::', header=None, names=['id', 'gender', 'age', 'occupation', 'zip_code'], engine='python') elif self._name == 'ml-10m': rating_info = pd.read_csv( os.path.join(self._dir, 'ratings.dat'), sep='::', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'], dtype={'user_id': np.int32, 'movie_id': np.int32, 'ratings': np.float32, 'timestamp': np.int64}, engine='python') self.user_info = pd.DataFrame(np.unique(rating_info['user_id'].values.astype(np.int32)), columns=['id']) else: raise NotImplementedError def _process_user_fea(self): """ Parameters ---------- user_info : pd.DataFrame name : str For ml-100k and ml-1m, the column name is ['id', 'gender', 'age', 'occupation', 'zip_code']. We take the age, gender, and the one-hot encoding of the occupation as the user features. For ml-10m, there is no user feature and we set the feature to be a single zero. Returns ------- user_features : np.ndarray """ if self._name == 'ml-100k' or self._name == 'ml-1m': ages = self.user_info['age'].values.astype(np.float32) gender = (self.user_info['gender'] == 'F').values.astype(np.float32) all_occupations = set(self.user_info['occupation']) occupation_map = {ele: i for i, ele in enumerate(all_occupations)} occupation_one_hot = np.zeros(shape=(self.user_info.shape[0], len(all_occupations)), dtype=np.float32) occupation_one_hot[np.arange(self.user_info.shape[0]), np.array([occupation_map[ele] for ele in self.user_info['occupation']])] = 1 user_features = np.concatenate([ages.reshape((self.user_info.shape[0], 1)) / 50.0, gender.reshape((self.user_info.shape[0], 1)), occupation_one_hot], axis=1) elif self._name == 'ml-10m': user_features = np.zeros(shape=(self.user_info.shape[0], 1), dtype=np.float32) else: raise NotImplementedError return user_features def _load_raw_movie_info(self): """In MovieLens, the movie attributes may have the following formats: In ml_100k: movie id | movie title | release date | video release date | IMDb URL | [genres] In ml_1m, ml_10m: MovieID::Title (Release Year)::Genres Also, Genres are separated by |, e.g., Adventure|Animation|Children|Comedy|Fantasy Parameters ---------- name : str Returns ------- movie_info : pd.DataFrame For ml-100k, the column name is ['id', 'title', 'release_date', 'video_release_date', 'url'] + [GENRES (19)]] For ml-1m and ml-10m, the column name is ['id', 'title'] + [GENRES (18/20)]] """ if self._name == 'ml-100k': GENRES = GENRES_ML_100K elif self._name == 'ml-1m': GENRES = GENRES_ML_1M elif self._name == 'ml-10m': GENRES = GENRES_ML_10M else: raise NotImplementedError if self._name == 'ml-100k': file_path = os.path.join(self._dir, 'u.item') self.movie_info = pd.read_csv(file_path, sep='|', header=None, names=['id', 'title', 'release_date', 'video_release_date', 'url'] + GENRES, engine='python') elif self._name == 'ml-1m' or self._name == 'ml-10m': file_path = os.path.join(self._dir, 'movies.dat') movie_info = pd.read_csv(file_path, sep='::', header=None, names=['id', 'title', 'genres'], engine='python') genre_map = {ele: i for i, ele in enumerate(GENRES)} genre_map['Children\'s'] = genre_map['Children'] genre_map['Childrens'] = genre_map['Children'] movie_genres = np.zeros(shape=(movie_info.shape[0], len(GENRES)), dtype=np.float32) for i, genres in enumerate(movie_info['genres']): for ele in genres.split('|'): if ele in genre_map: movie_genres[i, genre_map[ele]] = 1.0 else: print('genres not found, filled with unknown: {}'.format(genres)) movie_genres[i, genre_map['unknown']] = 1.0 for idx, genre_name in enumerate(GENRES): assert idx == genre_map[genre_name] movie_info[genre_name] = movie_genres[:, idx] self.movie_info = movie_info.drop(columns=["genres"]) else: raise NotImplementedError def _process_movie_fea(self): """ Parameters ---------- movie_info : pd.DataFrame name : str Returns ------- movie_features : np.ndarray Generate movie features by concatenating embedding and the year """ if self._name == 'ml-100k': GENRES = GENRES_ML_100K elif self._name == 'ml-1m': GENRES = GENRES_ML_1M elif self._name == 'ml-10m': GENRES = GENRES_ML_10M else: raise NotImplementedError word_embedding = nlp.embedding.GloVe('glove.840B.300d') tokenizer = nlp.data.transforms.SpacyTokenizer() title_embedding = np.zeros(shape=(self.movie_info.shape[0], 300), dtype=np.float32) release_years = np.zeros(shape=(self.movie_info.shape[0], 1), dtype=np.float32) p = re.compile(r'(.+)\s*\((\d+)\)') for i, title in enumerate(self.movie_info['title']): match_res = p.match(title) if match_res is None: print('{} cannot be matched, index={}, name={}'.format(title, i, self._name)) title_context, year = title, 1950 else: title_context, year = match_res.groups() # We use average of glove title_embedding[i, :] = word_embedding[tokenizer(title_context)].asnumpy().mean(axis=0) release_years[i] = float(year) movie_features = np.concatenate((title_embedding, (release_years - 1950.0) / 100.0, self.movie_info[GENRES]), axis=1) return movie_features if __name__ == '__main__': MovieLens("ml-100k", ctx=mx.cpu(), symm=True)