# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import os
from collections import defaultdict
from glob import glob

import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy import sparse
from scipy.sparse import csr_matrix, load_npz

LOG = logging.getLogger("VAE")


def save_as_npz(m_sp, path):
    """Save a sparse matrix to `path`, creating the parent directory if needed."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    sp.save_npz(path, m_sp)


def get_count(tp, id_col):
    """Return the number of rows in `tp` for each distinct value of `id_col`."""
    playcount_groupbyid = tp[[id_col]].groupby(id_col, as_index=True)
    return playcount_groupbyid.size()


def filter_triplets(tp, min_uc=5, min_sc=0):
    # Only keep the triplets for items which were clicked on by at least min_sc users.
    if min_sc > 0:
        itemcount = get_count(tp, 'movieId')
        tp = tp[tp['movieId'].isin(itemcount.index[itemcount >= min_sc])]

    # Only keep the triplets for users who clicked on at least min_uc items.
    # After doing this, some items will have fewer than min_uc users,
    # but that should only be a small proportion.
    if min_uc > 0:
        usercount = get_count(tp, 'userId')
        tp = tp[tp['userId'].isin(usercount.index[usercount >= min_uc])]

    # Update both usercount and itemcount after filtering.
    usercount, itemcount = get_count(tp, 'userId'), get_count(tp, 'movieId')
    return tp, usercount, itemcount


def save_id_mappings(cache_dir, show2id, profile2id):
    os.makedirs(cache_dir, exist_ok=True)
    for d, filename in [(show2id, 'show2id.json'), (profile2id, 'profile2id.json')]:
        with open(os.path.join(cache_dir, filename), 'w') as f:
            d = {str(k): v for k, v in d.items()}
            json.dump(d, f, indent=4)


def load_and_parse_ML_20M(data_dir, threshold=4, parse=True):
    """
    Original way of processing the ml-20m dataset from the VAE for CF paper.

    Copyright [2018] [Dawen Liang, Rahul G. Krishnan, Matthew D. Hoffman, and Tony Jebara]
    SPDX-License-Identifier: Apache-2.0
    Modifications copyright (C) 2019 Michał Filipiuk, Albert Cieślak, Frederic Grabowski, Radosław Rowicki
    """
    cache_dir = os.path.join(data_dir, "ml-20m/preprocessed")
    train_data_file = os.path.join(cache_dir, "train_data.npz")
    vad_data_true_file = os.path.join(cache_dir, "vad_data_true.npz")
    vad_data_test_file = os.path.join(cache_dir, "vad_data_test.npz")
    test_data_true_file = os.path.join(cache_dir, "test_data_true.npz")
    test_data_test_file = os.path.join(cache_dir, "test_data_test.npz")

    if (os.path.isfile(train_data_file)
            and os.path.isfile(vad_data_true_file)
            and os.path.isfile(vad_data_test_file)
            and os.path.isfile(test_data_true_file)
            and os.path.isfile(test_data_test_file)):
        LOG.info("Already processed, skipping.")
        return (load_npz(train_data_file),
                load_npz(vad_data_true_file),
                load_npz(vad_data_test_file),
                load_npz(test_data_true_file),
                load_npz(test_data_test_file))

    if not parse:
        raise ValueError('Dataset not preprocessed. Please run python3 prepare_dataset.py first.')
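
    # Everything below is the one-time parsing path: read ratings.csv, binarize
    # by the rating threshold, filter out sparse users/items, split users into
    # train/validation/test, and cache the resulting sparse matrices so the
    # early return above is taken on subsequent runs.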
    LOG.info("Parsing movielens.")
    source_file = os.path.join(data_dir, "ml-20m/extracted/ml-20m", "ratings.csv")
    if not glob(source_file):
        raise ValueError(f'Dataset not downloaded. Please download the ML-20m dataset '
                         f'from https://grouplens.org/datasets/movielens/20m/, '
                         f'unzip it and put it in {source_file}')

    raw_data = pd.read_csv(source_file)
    raw_data.drop('timestamp', axis=1, inplace=True)

    # Binarize the data: keep only ratings of at least `threshold`.
    raw_data = raw_data[raw_data['rating'] >= threshold]
    raw_data, user_activity, item_popularity = filter_triplets(raw_data)

    # Shuffle the users and hold out the last 2 * 10000 of them
    # for validation and testing.
    unique_uid = user_activity.index
    idx_perm = np.random.permutation(unique_uid.size)
    unique_uid = unique_uid[idx_perm]

    n_users = unique_uid.size
    n_heldout_users = 10000

    true_users = unique_uid[:(n_users - n_heldout_users * 2)]
    vd_users = unique_uid[(n_users - n_heldout_users * 2):(n_users - n_heldout_users)]
    test_users = unique_uid[(n_users - n_heldout_users):]

    train_plays = raw_data.loc[raw_data['userId'].isin(true_users)]
    unique_sid = pd.unique(train_plays['movieId'])

    # Remap movie and user ids to contiguous indices and persist the mappings.
    show2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
    profile2id = dict((pid, i) for (i, pid) in enumerate(unique_uid))
    save_id_mappings(cache_dir, show2id, profile2id)

    def split_train_test_proportion(data, test_prop=0.2):
        # For each held-out user, move test_prop of their items into the test
        # set; users with fewer than 5 items keep everything in the true set.
        data_grouped_by_user = data.groupby('userId')
        true_list, test_list = list(), list()

        for _, group in data_grouped_by_user:
            n_items_u = len(group)

            if n_items_u >= 5:
                idx = np.zeros(n_items_u, dtype='bool')
                idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u),
                                     replace=False).astype('int64')] = True
                true_list.append(group[np.logical_not(idx)])
                test_list.append(group[idx])
            else:
                true_list.append(group)

        data_true = pd.concat(true_list)
        data_test = pd.concat(test_list)
        return data_true, data_test

    vad_plays = raw_data.loc[raw_data['userId'].isin(vd_users)]
    vad_plays = vad_plays.loc[vad_plays['movieId'].isin(unique_sid)]
    vad_plays_true, vad_plays_test = split_train_test_proportion(vad_plays)

    test_plays = raw_data.loc[raw_data['userId'].isin(test_users)]
    test_plays = test_plays.loc[test_plays['movieId'].isin(unique_sid)]
    test_plays_true, test_plays_test = split_train_test_proportion(test_plays)

    def numerize(tp):
        uid = tp['userId'].map(lambda x: profile2id[x])
        sid = tp['movieId'].map(lambda x: show2id[x])
        return pd.DataFrame(data={'uid': uid, 'sid': sid}, columns=['uid', 'sid'])

    train_data = numerize(train_plays)
    vad_data_true = numerize(vad_plays_true)
    vad_data_test = numerize(vad_plays_test)
    test_data_true = numerize(test_plays_true)
    test_data_test = numerize(test_plays_test)

    n_items = len(unique_sid)

    def load_train_data(tp):
        n_users = tp['uid'].max() + 1
        rows, cols = tp['uid'], tp['sid']
        return sparse.csr_matrix((np.ones_like(rows), (rows, cols)),
                                 dtype='float64',
                                 shape=(n_users, n_items))

    train_data = load_train_data(train_data)

    def load_true_test_data(tp_true, tp_test):
        start_idx = min(tp_true['uid'].min(), tp_test['uid'].min())
        end_idx = max(tp_true['uid'].max(), tp_test['uid'].max())

        rows_true, cols_true = tp_true['uid'] - start_idx, tp_true['sid']
        rows_test, cols_test = tp_test['uid'] - start_idx, tp_test['sid']

        data_true = sparse.csr_matrix((np.ones_like(rows_true), (rows_true, cols_true)),
                                      dtype='float64',
                                      shape=(end_idx - start_idx + 1, n_items))
        data_test = sparse.csr_matrix((np.ones_like(rows_test), (rows_test, cols_test)),
                                      dtype='float64',
                                      shape=(end_idx - start_idx + 1, n_items))
        return data_true, data_test

    vad_data_true, vad_data_test = load_true_test_data(vad_data_true, vad_data_test)
    test_data_true, test_data_test = load_true_test_data(test_data_true, test_data_test)
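
    # Row i of each validation/test matrix corresponds to remapped user id
    # (start_idx + i) inside load_true_test_data; the train matrix rows align
    # directly with the profile2id values of the training users.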
    save_as_npz(train_data, train_data_file)
    save_as_npz(vad_data_true, vad_data_true_file)
    save_as_npz(vad_data_test, vad_data_test_file)
    save_as_npz(test_data_true, test_data_true_file)
    save_as_npz(test_data_test, test_data_test_file)

    return train_data, vad_data_true, vad_data_test, test_data_true, test_data_test


def filter_data(data, min_users=1, min_items=5):
    """
    :param data: input matrix
    :param min_users: only keep items that were clicked by at least min_users users
    :param min_items: only keep users that clicked at least min_items items
    :return: filtered matrix
    """
    col_count = defaultdict(lambda: 0)
    for col in data.nonzero()[1]:
        col_count[col] += 1
    filtered_col = [k for k, v in col_count.items() if v >= min_users]
    filtered_data_c = data[:, filtered_col]
    del data

    row_count = defaultdict(lambda: 0)
    for row in filtered_data_c.nonzero()[0]:
        row_count[row] += 1
    filtered_row = [k for k, v in row_count.items() if v >= min_items]
    filtered_data_r = filtered_data_c[filtered_row, :]
    del filtered_data_c
    return filtered_data_r


def split_into_train_val_test(data, val_ratio, test_ratio):
    """
    :param data: input matrix
    :param val_ratio: ratio of validation users to all users
    :param test_ratio: ratio of test users to all users
    :return: tuple of 3 matrices: (train_matrix, val_matrix, test_matrix)
    """
    assert val_ratio + test_ratio < 1

    train_ratio = 1 - val_ratio - test_ratio
    rows_count = data.shape[0]
    idx = np.random.permutation(range(rows_count))
    train_users_count = int(np.rint(rows_count * train_ratio))
    val_users_count = int(np.rint(rows_count * val_ratio))

    separator = train_users_count + val_users_count
    train_matrix = data[idx[:train_users_count]]
    val_matrix = data[idx[train_users_count:separator]]
    test_matrix = data[idx[separator:]]
    return train_matrix, val_matrix, test_matrix


def split_movies_into_train_test(data, train_ratio):
    """
    Splits data into 2 matrices. The users stay the same, but each user's items
    are split according to train_ratio.
    :param data: input matrix
    :param train_ratio: ratio of train items to all items
    :return: tuple of 2 matrices: (train_matrix, test_matrix)
    """
    rows_count, columns_count = data.shape
    train_rows = list()
    train_columns = list()
    test_rows = list()
    test_columns = list()
    for i in range(rows_count):
        user_movies = data.getrow(i).nonzero()[1]
        np.random.shuffle(user_movies)
        movies_count = len(user_movies)
        train_count = int(np.floor(movies_count * train_ratio))
        test_count = movies_count - train_count
        train_movies = user_movies[:train_count]
        test_movies = user_movies[train_count:]

        train_rows += [i] * train_count
        train_columns += list(train_movies)
        test_rows += [i] * test_count
        test_columns += list(test_movies)

    train_matrix = csr_matrix(([1] * len(train_rows), (train_rows, train_columns)),
                              shape=(rows_count, columns_count))
    test_matrix = csr_matrix(([1] * len(test_rows), (test_rows, test_columns)),
                             shape=(rows_count, columns_count))
    return train_matrix, test_matrix


def remove_items_that_doesnt_occure_in_train(train_matrix, val_matrix, test_matrix):
    """
    Remove items that don't occur in the train matrix.
    :param train_matrix: training data
    :param val_matrix: validation data
    :param test_matrix: test data
    :return: input matrices restricted to items seen in training
    """
    item_occure = defaultdict(lambda: False)
    for col in train_matrix.nonzero()[1]:
        item_occure[col] = True
    non_empty_items = [k for k, v in item_occure.items() if v]
    return (train_matrix[:, non_empty_items],
            val_matrix[:, non_empty_items],
            test_matrix[:, non_empty_items])
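

# A minimal usage sketch, not part of the original module: it feeds a small
# synthetic interaction matrix through the generic filtering/splitting helpers
# above. The matrix size, density, and ratios are arbitrary illustration
# values, not settings taken from the original pipeline.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Build a random 50-user x 30-item binary click matrix (~30% density).
    rng = np.random.default_rng(0)
    demo = csr_matrix((rng.random((50, 30)) < 0.3).astype('float64'))

    # Drop empty items and users with fewer than 5 clicks, then split users
    # 80/10/10 into train/validation/test.
    demo = filter_data(demo, min_users=1, min_items=5)
    train, val, test = split_into_train_val_test(demo, val_ratio=0.1, test_ratio=0.1)

    # Restrict all splits to items that actually occur in training data.
    train, val, test = remove_items_that_doesnt_occure_in_train(train, val, test)

    # For held-out users, split each user's items into inputs and targets.
    val_true, val_test = split_movies_into_train_test(val, train_ratio=0.8)

    LOG.info("train %s, val %s/%s, test %s",
             train.shape, val_true.shape, val_test.shape, test.shape)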