data.py 27.1 KB
Newer Older
Zihao Ye's avatar
Zihao Ye committed
1
2
3
"""MovieLens dataset"""
import os
import re
Hongzhi (Steve), Chen's avatar
Hongzhi (Steve), Chen committed
4
5
6

import dgl
import numpy as np
Zihao Ye's avatar
Zihao Ye committed
7
8
9
10
import pandas as pd
import scipy.sparse as sp
import torch as th
from dgl.data.utils import download, extract_archive, get_download_dir
11
from utils import to_etype_name
Zihao Ye's avatar
Zihao Ye committed
12
13

# Download locations of the supported MovieLens archives, keyed by dataset
# name. HTTPS is used so that the archives, which are extracted to disk after
# download, cannot be tampered with in transit.
_urls = {
    "ml-100k": "https://files.grouplens.org/datasets/movielens/ml-100k.zip",
    "ml-1m": "https://files.grouplens.org/datasets/movielens/ml-1m.zip",
    "ml-10m": "https://files.grouplens.org/datasets/movielens/ml-10m.zip",
}

# DGL's dataset download directory, looked up once when this module is
# imported.
READ_DATASET_PATH = get_download_dir()
Hongzhi (Steve), Chen's avatar
Hongzhi (Steve), Chen committed
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# Genre column names for ml-100k; they follow the five metadata columns when
# the u.item file is parsed.
GENRES_ML_100K = [
    "unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]
# ml-1m uses the same genres without the "unknown" entry.
GENRES_ML_1M = [genre for genre in GENRES_ML_100K if genre != "unknown"]
# ml-10m uses the ml-100k genres plus "IMAX".
GENRES_ML_10M = [*GENRES_ML_100K, "IMAX"]

Zihao Ye's avatar
Zihao Ye committed
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65

class MovieLens(object):
    """MovieLens dataset used by GCMC model

    TODO(minjie): make this dataset more general

    The dataset stores MovieLens ratings in two types of graphs. The encoder graph
    contains rating value information in the form of edge types. The decoder graph
    stores plain user-movie pairs in the form of a bipartite graph with no rating
    information. All graphs have two types of nodes: "user" and "movie".

    The training, validation and test set can be summarized as follows:

    train_enc_graph : training user-movie pairs + rating info
    train_dec_graph : training user-movie pairs
    valid_enc_graph : training user-movie pairs + rating info
    valid_dec_graph : validation user-movie pairs
    test_enc_graph : training user-movie pairs + validation user-movie pairs + rating info
    test_dec_graph : test user-movie pairs

    Attributes
    ----------
peizhou001's avatar
peizhou001 committed
66
    train_enc_graph : dgl.DGLGraph
Zihao Ye's avatar
Zihao Ye committed
67
        Encoder graph for training.
peizhou001's avatar
peizhou001 committed
68
    train_dec_graph : dgl.DGLGraph
Zihao Ye's avatar
Zihao Ye committed
69
70
71
72
73
        Decoder graph for training.
    train_labels : torch.Tensor
        The categorical label of each user-movie pair
    train_truths : torch.Tensor
        The actual rating values of each user-movie pair
peizhou001's avatar
peizhou001 committed
74
    valid_enc_graph : dgl.DGLGraph
Zihao Ye's avatar
Zihao Ye committed
75
        Encoder graph for validation.
peizhou001's avatar
peizhou001 committed
76
    valid_dec_graph : dgl.DGLGraph
Zihao Ye's avatar
Zihao Ye committed
77
78
79
80
81
        Decoder graph for validation.
    valid_labels : torch.Tensor
        The categorical label of each user-movie pair
    valid_truths : torch.Tensor
        The actual rating values of each user-movie pair
peizhou001's avatar
peizhou001 committed
82
    test_enc_graph : dgl.DGLGraph
Zihao Ye's avatar
Zihao Ye committed
83
        Encoder graph for test.
peizhou001's avatar
peizhou001 committed
84
    test_dec_graph : dgl.DGLGraph
Zihao Ye's avatar
Zihao Ye committed
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
        Decoder graph for test.
    test_labels : torch.Tensor
        The categorical label of each user-movie pair
    test_truths : torch.Tensor
        The actual rating values of each user-movie pair
    user_feature : torch.Tensor
        User feature tensor. If None, representing an identity matrix.
    movie_feature : torch.Tensor
        Movie feature tensor. If None, representing an identity matrix.
    possible_rating_values : np.ndarray
        Available rating values in the dataset

    Parameters
    ----------
    name : str
        Dataset name. Could be "ml-100k", "ml-1m", "ml-10m"
    device : torch.device
        Device context
103
104
    mix_cpu_gpu : bool, optional
        If true, the ``user_feature`` and ``movie_feature`` attributes are kept on
        CPU instead of being moved to ``device``. (Default: False)
Zihao Ye's avatar
Zihao Ye committed
105
106
107
108
109
110
111
112
113
114
115
116
    use_one_hot_fea : bool, optional
        If true, the ``user_feature`` and ``movie_feature`` attributes are None,
        representing one-hot identity matrices. (Default: False)
    symm : bool, optional
        If true, use the symmetric normalization constant. Otherwise, use the left
        normalization constant. (Default: True)
    test_ratio : float, optional
        Ratio of test data. Not used for "ml-100k", whose test set is read from
        the u1.test file. (Default: 0.1)
    valid_ratio : float, optional
        Ratio of validation data, taken out of the training ratings.
        (Default: 0.1)

    """
Hongzhi (Steve), Chen's avatar
Hongzhi (Steve), Chen committed
117
118
119
120
121
122
123
124
125
126
127

    def __init__(
        self,
        name,
        device,
        mix_cpu_gpu=False,
        use_one_hot_fea=False,
        symm=True,
        test_ratio=0.1,
        valid_ratio=0.1,
    ):
        """Download, parse and split a MovieLens dataset and build its graphs.

        The steps are: fetch and unpack the archive, load the raw user, movie
        and rating tables, split the ratings into train / validation / test,
        drop users and movies that never appear in a rating, assign contiguous
        node ids, build the node features, and finally construct the encoder
        and decoder graphs together with their labels.

        See the class docstring for the meaning of each parameter.
        """
        self._name = name
        self._device = device
        self._symm = symm
        self._test_ratio = test_ratio
        self._valid_ratio = valid_ratio

        # Fetch the archive and unpack it into <download_dir>/<name>.
        cache_root = get_download_dir()
        archive_path = "{}/{}.zip".format(cache_root, name)
        download(_urls[name], path=archive_path)
        extract_archive(archive_path, "{}/{}".format(cache_root, name))
        # The ml-10m files are read from a differently named sub-folder.
        inner_folder = "ml-10M100K" if name == "ml-10m" else name
        self._dir = os.path.join(cache_root, name, inner_folder)

        print("Starting processing {} ...".format(self._name))
        self._load_raw_user_info()
        self._load_raw_movie_info()
        print("......")

        if self._name == "ml-100k":
            # ml-100k: u1.base / u1.test provide the train / test split.
            self.all_train_rating_info = self._load_raw_rates(
                os.path.join(self._dir, "u1.base"), "\t"
            )
            self.test_rating_info = self._load_raw_rates(
                os.path.join(self._dir, "u1.test"), "\t"
            )
            self.all_rating_info = pd.concat(
                [self.all_train_rating_info, self.test_rating_info]
            )
        elif self._name in ("ml-1m", "ml-10m"):
            # Single rating file: the test set is a random sample of it.
            self.all_rating_info = self._load_raw_rates(
                os.path.join(self._dir, "ratings.dat"), "::"
            )
            num_all = self.all_rating_info.shape[0]
            num_test = int(np.ceil(num_all * self._test_ratio))
            order = np.random.permutation(num_all)
            self.test_rating_info = self.all_rating_info.iloc[order[:num_test]]
            self.all_train_rating_info = self.all_rating_info.iloc[
                order[num_test:]
            ]
        else:
            raise NotImplementedError
        print("......")

        # The validation set is a random sample of the training portion.
        num_all_train = self.all_train_rating_info.shape[0]
        num_valid = int(np.ceil(num_all_train * self._valid_ratio))
        order = np.random.permutation(num_all_train)
        self.valid_rating_info = self.all_train_rating_info.iloc[
            order[:num_valid]
        ]
        self.train_rating_info = self.all_train_rating_info.iloc[
            order[num_valid:]
        ]
        self.possible_rating_values = np.unique(
            self.train_rating_info["rating"].values
        )

        for template, frame in (
            ("All rating pairs : {}", self.all_rating_info),
            ("\tAll train rating pairs : {}", self.all_train_rating_info),
            ("\t\tTrain rating pairs : {}", self.train_rating_info),
            ("\t\tValid rating pairs : {}", self.valid_rating_info),
            ("\tTest rating pairs  : {}", self.test_rating_info),
        ):
            print(template.format(frame.shape[0]))

        # Keep only the users / movies that occur in at least one rating.
        rated_user_ids = set(self.all_rating_info["user_id"].values)
        self.user_info = self._drop_unseen_nodes(
            orign_info=self.user_info,
            cmp_col_name="id",
            reserved_ids_set=rated_user_ids,
            label="user",
        )
        rated_movie_ids = set(self.all_rating_info["movie_id"].values)
        self.movie_info = self._drop_unseen_nodes(
            orign_info=self.movie_info,
            cmp_col_name="id",
            reserved_ids_set=rated_movie_ids,
            label="movie",
        )

        # Map raw user / movie ids to contiguous node ids (row positions).
        self.global_user_id_map = {
            raw_id: node_id
            for node_id, raw_id in enumerate(self.user_info["id"])
        }
        self.global_movie_id_map = {
            raw_id: node_id
            for node_id, raw_id in enumerate(self.movie_info["id"])
        }
        self._num_user = len(self.global_user_id_map)
        self._num_movie = len(self.global_movie_id_map)
        print(
            "Total user number = {}, movie number = {}".format(
                self._num_user, self._num_movie
            )
        )

        # Node features: None stands for an identity (one-hot) matrix.
        if use_one_hot_fea:
            self.user_feature = None
            self.movie_feature = None
        else:
            # With mix_cpu_gpu the features stay on CPU; otherwise they are
            # moved to the target device.
            user_fea = th.FloatTensor(self._process_user_fea())
            self.user_feature = (
                user_fea if mix_cpu_gpu else user_fea.to(self._device)
            )
            movie_fea = th.FloatTensor(self._process_movie_fea())
            self.movie_feature = (
                movie_fea if mix_cpu_gpu else movie_fea.to(self._device)
            )

        if self.user_feature is None:
            self.user_feature_shape = (self.num_user, self.num_user)
            self.movie_feature_shape = (self.num_movie, self.num_movie)
        else:
            self.user_feature_shape = self.user_feature.shape
            self.movie_feature_shape = self.movie_feature.shape
        print(
            "Feature dim: \nuser: {}\nmovie: {}".format(
                self.user_feature_shape, self.movie_feature_shape
            )
        )

        all_train_pairs, all_train_values = self._generate_pair_value(
            self.all_train_rating_info
        )
        train_pairs, train_values = self._generate_pair_value(
            self.train_rating_info
        )
        valid_pairs, valid_values = self._generate_pair_value(
            self.valid_rating_info
        )
        test_pairs, test_values = self._generate_pair_value(
            self.test_rating_info
        )

        def _to_labels(ratings):
            # A label is the position of the rating value within the sorted
            # array of possible rating values.
            positions = np.searchsorted(self.possible_rating_values, ratings)
            return th.LongTensor(positions).to(device)

        self.train_enc_graph = self._generate_enc_graph(
            train_pairs, train_values, add_support=True
        )
        self.train_dec_graph = self._generate_dec_graph(train_pairs)
        self.train_labels = _to_labels(train_values)
        self.train_truths = th.FloatTensor(train_values).to(device)

        # Validation reuses the training encoder graph.
        self.valid_enc_graph = self.train_enc_graph
        self.valid_dec_graph = self._generate_dec_graph(valid_pairs)
        self.valid_labels = _to_labels(valid_values)
        self.valid_truths = th.FloatTensor(valid_values).to(device)

        # The test encoder graph is built from train + validation ratings.
        self.test_enc_graph = self._generate_enc_graph(
            all_train_pairs, all_train_values, add_support=True
        )
        self.test_dec_graph = self._generate_dec_graph(test_pairs)
        self.test_labels = _to_labels(test_values)
        self.test_truths = th.FloatTensor(test_values).to(device)

        def _count_rated_edges(graph):
            # Sum the user->movie edges over every rating edge type.
            return sum(
                graph.num_edges(str(to_etype_name(value)))
                for value in self.possible_rating_values
            )

        def _count_all_edges(graph):
            return graph.num_edges()

        for template, graph, count_pairs in (
            (
                "Train enc graph: \t#user:{}\t#movie:{}\t#pairs:{}",
                self.train_enc_graph,
                _count_rated_edges,
            ),
            (
                "Train dec graph: \t#user:{}\t#movie:{}\t#pairs:{}",
                self.train_dec_graph,
                _count_all_edges,
            ),
            (
                "Valid enc graph: \t#user:{}\t#movie:{}\t#pairs:{}",
                self.valid_enc_graph,
                _count_rated_edges,
            ),
            (
                "Valid dec graph: \t#user:{}\t#movie:{}\t#pairs:{}",
                self.valid_dec_graph,
                _count_all_edges,
            ),
            (
                "Test enc graph: \t#user:{}\t#movie:{}\t#pairs:{}",
                self.test_enc_graph,
                _count_rated_edges,
            ),
            (
                "Test dec graph: \t#user:{}\t#movie:{}\t#pairs:{}",
                self.test_dec_graph,
                _count_all_edges,
            ),
        ):
            print(
                template.format(
                    graph.num_nodes("user"),
                    graph.num_nodes("movie"),
                    count_pairs(graph),
                )
            )
Zihao Ye's avatar
Zihao Ye committed
353
354

    def _generate_pair_value(self, rating_info):
Hongzhi (Steve), Chen's avatar
Hongzhi (Steve), Chen committed
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
        rating_pairs = (
            np.array(
                [
                    self.global_user_id_map[ele]
                    for ele in rating_info["user_id"]
                ],
                dtype=np.int64,
            ),
            np.array(
                [
                    self.global_movie_id_map[ele]
                    for ele in rating_info["movie_id"]
                ],
                dtype=np.int64,
            ),
        )
Zihao Ye's avatar
Zihao Ye committed
371
372
373
        rating_values = rating_info["rating"].values.astype(np.float32)
        return rating_pairs, rating_values

Hongzhi (Steve), Chen's avatar
Hongzhi (Steve), Chen committed
374
375
376
377
378
379
    def _generate_enc_graph(
        self, rating_pairs, rating_values, add_support=False
    ):
        """Build the encoder graph, with one edge type per rating value.

        For every value in ``possible_rating_values`` the graph gets a
        ("user", <etype>, "movie") relation holding the pairs rated with that
        value, plus the reversed ("movie", "rev-<etype>", "user") relation,
        where <etype> is ``to_etype_name(value)``.

        Parameters
        ----------
        rating_pairs : tuple of np.ndarray
            ``(user_indices, movie_indices)`` of the rated pairs.
        rating_values : np.ndarray
            Rating value of each pair.
        add_support : bool, optional
            If true, attach the normalization constants "ci" and "cj" to the
            "user" and "movie" nodes. (Default: False)

        Returns
        -------
        graph
            The heterograph returned by ``dgl.heterograph``.
        """
        rating_row, rating_col = rating_pairs
        num_nodes_dict = {"user": self._num_user, "movie": self._num_movie}
        data_dict = dict()
        for rating in self.possible_rating_values:
            ridx = np.where(rating_values == rating)
            rrow = rating_row[ridx]
            rcol = rating_col[ridx]
            etype = str(to_etype_name(rating))
            data_dict[("user", etype, "movie")] = (rrow, rcol)
            data_dict[("movie", "rev-%s" % etype, "user")] = (rcol, rrow)
        graph = dgl.heterograph(data_dict, num_nodes_dict=num_nodes_dict)

        # sanity check: every pair must appear exactly once per direction
        assert (
            len(rating_pairs[0])
            == sum(graph.num_edges(et) for et in graph.etypes) // 2
        )

        if add_support:

            def _calc_norm(x):
                # 1 / sqrt(degree) as a column vector; zero degrees are
                # replaced by inf first, so isolated nodes get 0.
                x = x.numpy().astype("float32")
                x[x == 0.0] = np.inf
                x = th.FloatTensor(1.0 / np.sqrt(x))
                return x.unsqueeze(1)

            user_ci = []
            user_cj = []
            movie_ci = []
            movie_cj = []
            for r in self.possible_rating_values:
                r = to_etype_name(r)
                user_ci.append(graph["rev-%s" % r].in_degrees())
                movie_ci.append(graph[r].in_degrees())
                # Out-degrees are only needed for symmetric normalization.
                if self._symm:
                    user_cj.append(graph[r].out_degrees())
                    movie_cj.append(graph["rev-%s" % r].out_degrees())
            # Degrees are summed over all rating types before normalizing.
            user_ci = _calc_norm(sum(user_ci))
            movie_ci = _calc_norm(sum(movie_ci))
            if self._symm:
                user_cj = _calc_norm(sum(user_cj))
                movie_cj = _calc_norm(sum(movie_cj))
            else:
                # NOTE(review): these are 1-D (N,) tensors while the symmetric
                # branch yields (N, 1); kept as in the original - confirm
                # against the code consuming "cj".
                user_cj = th.ones(
                    self.num_user,
                )
                movie_cj = th.ones(
                    self.num_movie,
                )
            graph.nodes["user"].data.update({"ci": user_ci, "cj": user_cj})
            graph.nodes["movie"].data.update({"ci": movie_ci, "cj": movie_cj})

        return graph

    def _generate_dec_graph(self, rating_pairs):
        """Build the decoder graph: a user-movie graph without rating values.

        Parameters
        ----------
        rating_pairs : tuple of np.ndarray
            ``(user_indices, movie_indices)`` of the pairs to include.

        Returns
        -------
        graph
            Heterograph with a single ("user", "rate", "movie") relation and
            ``num_user`` / ``num_movie`` nodes.
        """
        edge_marks = np.ones_like(rating_pairs[0])
        adjacency = sp.coo_matrix(
            (edge_marks, rating_pairs),
            shape=(self.num_user, self.num_movie),
            dtype=np.float32,
        )
        # Go through a scipy-backed bipartite graph and reuse its edge list.
        bipartite = dgl.bipartite_from_scipy(
            adjacency, utype="_U", etype="_E", vtype="_V"
        )
        node_counts = {"user": self.num_user, "movie": self.num_movie}
        return dgl.heterograph(
            {("user", "rate", "movie"): bipartite.edges()},
            num_nodes_dict=node_counts,
        )
Zihao Ye's avatar
Zihao Ye committed
457
458
459
460
461
462
463
464
465
466
467
468
469

    @property
    def num_links(self):
        return self.possible_rating_values.size

    @property
    def num_user(self):
        return self._num_user

    @property
    def num_movie(self):
        return self._num_movie

Hongzhi (Steve), Chen's avatar
Hongzhi (Steve), Chen committed
470
471
472
    def _drop_unseen_nodes(
        self, orign_info, cmp_col_name, reserved_ids_set, label
    ):
Zihao Ye's avatar
Zihao Ye committed
473
474
475
476
        # print("  -----------------")
        # print("{}: {}(reserved) v.s. {}(from info)".format(label, len(reserved_ids_set),
        #                                                      len(set(orign_info[cmp_col_name].values))))
        if reserved_ids_set != set(orign_info[cmp_col_name].values):
Hongzhi (Steve), Chen's avatar
Hongzhi (Steve), Chen committed
477
478
479
            pd_rating_ids = pd.DataFrame(
                list(reserved_ids_set), columns=["id_graph"]
            )
Zihao Ye's avatar
Zihao Ye committed
480
            # print("\torign_info: ({}, {})".format(orign_info.shape[0], orign_info.shape[1]))
Hongzhi (Steve), Chen's avatar
Hongzhi (Steve), Chen committed
481
482
483
484
485
486
487
            data_info = orign_info.merge(
                pd_rating_ids,
                left_on=cmp_col_name,
                right_on="id_graph",
                how="outer",
            )
            data_info = data_info.dropna(subset=[cmp_col_name, "id_graph"])
Zihao Ye's avatar
Zihao Ye committed
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
            data_info = data_info.drop(columns=["id_graph"])
            data_info = data_info.reset_index(drop=True)
            # print("\tAfter dropping, data shape: ({}, {})".format(data_info.shape[0], data_info.shape[1]))
            return data_info
        else:
            orign_info = orign_info.reset_index(drop=True)
            return orign_info

    def _load_raw_rates(self, file_path, sep):
        """In MovieLens, the rates have the following format

        ml-100k
        user id \t movie id \t rating \t timestamp

        ml-1m/10m
        UserID::MovieID::Rating::Timestamp

        timestamp is unix timestamp and can be converted by pd.to_datetime(X, unit='s')

        Parameters
        ----------
        file_path : str

        Returns
        -------
        rating_info : pd.DataFrame
        """
        rating_info = pd.read_csv(
Hongzhi (Steve), Chen's avatar
Hongzhi (Steve), Chen committed
516
517
518
519
520
521
522
523
524
525
526
527
            file_path,
            sep=sep,
            header=None,
            names=["user_id", "movie_id", "rating", "timestamp"],
            dtype={
                "user_id": np.int32,
                "movie_id": np.int32,
                "ratings": np.float32,
                "timestamp": np.int64,
            },
            engine="python",
        )
Zihao Ye's avatar
Zihao Ye committed
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
        return rating_info

    def _load_raw_user_info(self):
        """In MovieLens, the user attributes file have the following formats:

        ml-100k:
        user id | age | gender | occupation | zip code

        ml-1m:
        UserID::Gender::Age::Occupation::Zip-code

        For ml-10m, there is no user information. We read the user id from the rating file.

        Parameters
        ----------
        name : str

        Returns
        -------
        user_info : pd.DataFrame
        """
Hongzhi (Steve), Chen's avatar
Hongzhi (Steve), Chen committed
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
        if self._name == "ml-100k":
            self.user_info = pd.read_csv(
                os.path.join(self._dir, "u.user"),
                sep="|",
                header=None,
                names=["id", "age", "gender", "occupation", "zip_code"],
                engine="python",
            )
        elif self._name == "ml-1m":
            self.user_info = pd.read_csv(
                os.path.join(self._dir, "users.dat"),
                sep="::",
                header=None,
                names=["id", "gender", "age", "occupation", "zip_code"],
                engine="python",
            )
        elif self._name == "ml-10m":
Zihao Ye's avatar
Zihao Ye committed
566
            rating_info = pd.read_csv(
Hongzhi (Steve), Chen's avatar
Hongzhi (Steve), Chen committed
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
                os.path.join(self._dir, "ratings.dat"),
                sep="::",
                header=None,
                names=["user_id", "movie_id", "rating", "timestamp"],
                dtype={
                    "user_id": np.int32,
                    "movie_id": np.int32,
                    "ratings": np.float32,
                    "timestamp": np.int64,
                },
                engine="python",
            )
            self.user_info = pd.DataFrame(
                np.unique(rating_info["user_id"].values.astype(np.int32)),
                columns=["id"],
            )
Zihao Ye's avatar
Zihao Ye committed
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
        else:
            raise NotImplementedError

    def _process_user_fea(self):
        """

        Parameters
        ----------
        user_info : pd.DataFrame
        name : str
        For ml-100k and ml-1m, the column name is ['id', 'gender', 'age', 'occupation', 'zip_code'].
            We take the age, gender, and the one-hot encoding of the occupation as the user features.
        For ml-10m, there is no user feature and we set the feature to be a single zero.

        Returns
        -------
        user_features : np.ndarray

        """
Hongzhi (Steve), Chen's avatar
Hongzhi (Steve), Chen committed
602
603
604
605
        if self._name == "ml-100k" or self._name == "ml-1m":
            ages = self.user_info["age"].values.astype(np.float32)
            gender = (self.user_info["gender"] == "F").values.astype(np.float32)
            all_occupations = set(self.user_info["occupation"])
Zihao Ye's avatar
Zihao Ye committed
606
            occupation_map = {ele: i for i, ele in enumerate(all_occupations)}
Hongzhi (Steve), Chen's avatar
Hongzhi (Steve), Chen committed
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
            occupation_one_hot = np.zeros(
                shape=(self.user_info.shape[0], len(all_occupations)),
                dtype=np.float32,
            )
            occupation_one_hot[
                np.arange(self.user_info.shape[0]),
                np.array(
                    [
                        occupation_map[ele]
                        for ele in self.user_info["occupation"]
                    ]
                ),
            ] = 1
            user_features = np.concatenate(
                [
                    ages.reshape((self.user_info.shape[0], 1)) / 50.0,
                    gender.reshape((self.user_info.shape[0], 1)),
                    occupation_one_hot,
                ],
                axis=1,
            )
        elif self._name == "ml-10m":
            user_features = np.zeros(
                shape=(self.user_info.shape[0], 1), dtype=np.float32
            )
Zihao Ye's avatar
Zihao Ye committed
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
        else:
            raise NotImplementedError
        return user_features

    def _load_raw_movie_info(self):
        """Load the movie table of the current dataset into ``self.movie_info``.

        In MovieLens, the movie attributes may have the following formats:

        In ml_100k:

        movie id | movie title | release date | video release date | IMDb URL | [genres]

        In ml_1m, ml_10m:

        MovieID::Title (Release Year)::Genres

        Also, Genres are separated by |, e.g., Adventure|Animation|Children|Comedy|Fantasy

        The result is stored in ``self.movie_info`` (pd.DataFrame); nothing is
        returned.
            For ml-100k, the column name is ['id', 'title', 'release_date', 'video_release_date', 'url'] + [GENRES (19)]]
            For ml-1m and ml-10m, the column name is ['id', 'title'] + [GENRES (18/20)]]

        Raises
        ------
        NotImplementedError
            If the dataset name is not one of the three supported ones.
        """
        if self._name == "ml-100k":
            GENRES = GENRES_ML_100K
        elif self._name == "ml-1m":
            GENRES = GENRES_ML_1M
        elif self._name == "ml-10m":
            GENRES = GENRES_ML_10M
        else:
            raise NotImplementedError

        if self._name == "ml-100k":
            # The genre flags are already separate columns in u.item.
            file_path = os.path.join(self._dir, "u.item")
            self.movie_info = pd.read_csv(
                file_path,
                sep="|",
                header=None,
                names=[
                    "id",
                    "title",
                    "release_date",
                    "video_release_date",
                    "url",
                ]
                + GENRES,
                encoding="iso-8859-1",
            )
            return

        # ml-1m / ml-10m: one "genres" string per movie, expanded below into
        # one 0/1 column per genre.
        file_path = os.path.join(self._dir, "movies.dat")
        movie_info = pd.read_csv(
            file_path,
            sep="::",
            header=None,
            names=["id", "title", "genres"],
            encoding="iso-8859-1",
            # Explicit, as in the other loaders: "::" is a multi-character
            # separator.
            engine="python",
        )
        genre_map = {ele: i for i, ele in enumerate(GENRES)}
        # Alternative spellings that map to the "Children" column.
        genre_map["Children's"] = genre_map["Children"]
        genre_map["Childrens"] = genre_map["Children"]
        # The ml-1m genre list has no "unknown" entry, hence .get().
        unknown_col = genre_map.get("unknown")
        movie_genres = np.zeros(
            shape=(movie_info.shape[0], len(GENRES)), dtype=np.float32
        )
        for i, genres in enumerate(movie_info["genres"]):
            for ele in genres.split("|"):
                if ele in genre_map:
                    movie_genres[i, genre_map[ele]] = 1.0
                elif unknown_col is not None:
                    print(
                        "genres not found, filled with unknown: {}".format(
                            genres
                        )
                    )
                    movie_genres[i, unknown_col] = 1.0
                else:
                    # No "unknown" column to fall back on: report the entry
                    # and leave it unset instead of failing with a KeyError.
                    print("genres not found, ignored: {}".format(genres))
        for idx, genre_name in enumerate(GENRES):
            assert idx == genre_map[genre_name]
            movie_info[genre_name] = movie_genres[:, idx]
        self.movie_info = movie_info.drop(columns=["genres"])

    def _process_movie_fea(self):
        """Build the dense movie feature matrix from ``self.movie_info``.

        Each row is the concatenation of three parts:

        * the mean of the 300-d GloVe ("840B") vectors of the tokenized title,
        * the release year rescaled as ``(year - 1950) / 100``,
        * the genre indicator columns of ``self.movie_info`` for this dataset.

        Titles are matched against ``"<text> (<digits>)"``. When a title does
        not match, a message is printed, the whole title is embedded and the
        year falls back to 1950 (a rescaled value of 0).

        Returns
        -------
        movie_features : np.ndarray
            Array of shape ``(num_movies, 300 + 1 + num_genres)``.

        Raises
        ------
        NotImplementedError
            If ``self._name`` is not "ml-100k", "ml-1m" or "ml-10m".
        """
        # Function-scope import: torchtext is only loaded when this method runs.
        import torchtext
        from torchtext.data.utils import get_tokenizer

        genres_by_dataset = {
            "ml-100k": GENRES_ML_100K,
            "ml-1m": GENRES_ML_1M,
            "ml-10m": GENRES_ML_10M,
        }
        if self._name not in genres_by_dataset:
            raise NotImplementedError
        genres = genres_by_dataset[self._name]

        # torchtext 0.9+ tokenizer API (replaces the legacy Field-based one).
        tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
        embedding = torchtext.vocab.GloVe(name="840B", dim=300)

        num_movies = self.movie_info.shape[0]
        title_embedding = np.zeros(shape=(num_movies, 300), dtype=np.float32)
        release_years = np.zeros(shape=(num_movies, 1), dtype=np.float32)

        # Group 1 is the title text, group 2 the parenthesized year digits.
        title_pattern = re.compile(r"(.+)\s*\((\d+)\)")
        for i, title in enumerate(self.movie_info["title"]):
            match_res = title_pattern.match(title)
            if match_res is None:
                print(
                    "{} cannot be matched, index={}, name={}".format(
                        title, i, self._name
                    )
                )
                title_context, year = title, 1950
            else:
                title_context, year = match_res.groups()

            # Title feature is the average of the GloVe vectors of its tokens.
            # Averaging zero vectors is undefined, so a title that tokenizes
            # to nothing keeps its all-zero row instead.
            tokens = tokenizer(title_context)
            if tokens:
                title_embedding[i, :] = (
                    embedding.get_vecs_by_tokens(tokens).numpy().mean(axis=0)
                )
            release_years[i] = float(year)

        movie_features = np.concatenate(
            (
                title_embedding,
                (release_years - 1950.0) / 100.0,
                self.movie_info[genres],
            ),
            axis=1,
        )
        return movie_features

Hongzhi (Steve), Chen's avatar
Hongzhi (Steve), Chen committed
786
787
788

if __name__ == "__main__":
    # Script entry point: construct the ml-100k dataset on the CPU with
    # symm=True.
    cpu_device = th.device("cpu")
    MovieLens("ml-100k", device=cpu_device, symm=True)