# data_utils.py
import dask.dataframe as dd

import dgl
import numpy as np
import scipy.sparse as ssp
import torch
import tqdm


# This is the train-test split method that most recommender-system papers evaluated on
# MovieLens use.  It essentially follows the intuition of "training on the past and
# predicting the future".  One can also change the threshold so that the validation and
# test sets take larger proportions.
13
def train_test_split_by_time(df, timestamp, user):
    """Split interaction rows into train/validation/test sets per user by time.

    Within each ``user`` group the rows are ordered by ``timestamp``.  The
    last row becomes a test row when the group has more than one row, and the
    second-to-last row becomes a validation row when the group has more than
    two rows.  All remaining rows stay in the training set.

    Side effects:
        * ``train_mask``, ``val_mask`` and ``test_mask`` columns are written
          onto the ``df`` object passed in, holding their initial values (all
          rows marked as train).  The per-user assignment is done on a
          separate frame that is not written back to the caller's object.
        * The rows of the first user, ordered by ``timestamp``, are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per interaction.
    timestamp : str
        Name of the column used to order the rows of each user.
    user : str
        Name of the column the rows are grouped by.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_indices, val_indices, test_indices)``: row positions, in the
        index-sorted frame, of the rows flagged by each mask.
    """
    # Use the builtin ``bool``: the ``np.bool`` alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    df["train_mask"] = np.ones((len(df),), dtype=bool)
    df["val_mask"] = np.zeros((len(df),), dtype=bool)
    df["test_mask"] = np.zeros((len(df),), dtype=bool)
    df = dd.from_pandas(df, npartitions=10)

    def train_test_split(group):
        group = group.sort_values([timestamp])
        # Resolve the mask columns by name instead of assuming they are the
        # last three columns; that assumption breaks when the incoming frame
        # already had a column with one of these names.
        train_col = group.columns.get_loc("train_mask")
        val_col = group.columns.get_loc("val_mask")
        test_col = group.columns.get_loc("test_mask")
        n_rows = group.shape[0]
        if n_rows > 1:
            # Latest interaction -> test.
            group.iloc[-1, train_col] = False
            group.iloc[-1, test_col] = True
        if n_rows > 2:
            # Second-latest interaction -> validation.
            group.iloc[-2, train_col] = False
            group.iloc[-2, val_col] = True
        return group

    df = (
        df.groupby(user, group_keys=False)
        .apply(train_test_split)
        .compute(scheduler="processes")
        .sort_index()
    )
    # Sanity check: show how the first user's rows were assigned.
    print(df[df[user] == df[user].unique()[0]].sort_values(timestamp))
    return (
        df["train_mask"].to_numpy().nonzero()[0],
        df["val_mask"].to_numpy().nonzero()[0],
        df["test_mask"].to_numpy().nonzero()[0],
    )

42
43
44

def build_train_graph(g, train_indices, utype, itype, etype, etype_rev):
    """Return the subgraph of ``g`` induced by the training edges.

    The same ``train_indices`` are selected for both ``etype`` and
    ``etype_rev``, and the subgraph is created with ``relabel_nodes=False``.
    Every node and edge feature of ``g`` is then copied onto the result.

    ``utype`` and ``itype`` are accepted but not used by this function.
    """
    selected_edges = {etype: train_indices, etype_rev: train_indices}
    train_g = g.edge_subgraph(selected_edges, relabel_nodes=False)

    # Node features are copied unchanged.
    for node_type in g.ntypes:
        node_feats = g.nodes[node_type].data
        for name, tensor in node_feats.items():
            train_g.nodes[node_type].data[name] = tensor

    # Edge features are indexed by the ``dgl.EID`` field stored on the
    # subgraph (presumably the IDs of the kept edges in ``g`` -- confirm
    # against the DGL docs), so only rows of retained edges are copied.
    for edge_type in g.etypes:
        edge_feats = g.edges[edge_type].data
        for name, tensor in edge_feats.items():
            kept_ids = train_g.edges[edge_type].data[dgl.EID]
            train_g.edges[edge_type].data[name] = tensor[kept_ids]

    return train_g

60

61
def build_val_test_matrix(g, val_indices, test_indices, utype, itype, etype):
    """Build sparse user-by-item matrices for the validation and test edges.

    The endpoints of the given ``etype`` edges are looked up in ``g`` and used
    as (row, column) coordinates of a SciPy COO matrix whose stored values are
    all ones.  Both matrices have shape
    ``(g.num_nodes(utype), g.num_nodes(itype))``.

    Returns
    -------
    tuple
        ``(val_matrix, test_matrix)``.
    """
    matrix_shape = (g.num_nodes(utype), g.num_nodes(itype))

    def _edges_to_matrix(edge_ids):
        # Source endpoints index rows, destination endpoints index columns.
        src, dst = g.find_edges(edge_ids, etype=etype)
        rows = src.numpy()
        cols = dst.numpy()
        return ssp.coo_matrix((np.ones_like(rows), (rows, cols)), matrix_shape)

    val_matrix = _edges_to_matrix(val_indices)
    test_matrix = _edges_to_matrix(test_indices)
    return val_matrix, test_matrix

79

80
def linear_normalize(values):
    """Min-max scale ``values`` along axis 0 into the range [0, 1].

    Each entry becomes ``(value - min) / (max - min)``, where the minimum and
    maximum are taken over axis 0.  A slice that is constant along axis 0
    (``max == min``) is mapped to zeros instead of producing NaN from a
    ``0 / 0`` division.

    Parameters
    ----------
    values : numpy.ndarray
        Array to normalize; it is not modified.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``values`` holding the scaled entries.
    """
    lowest = values.min(0, keepdims=True)
    # ``value_range`` is a fresh array, so patching it in place does not
    # touch the caller's data.
    value_range = values.max(0, keepdims=True) - lowest
    # The numerator is already 0 wherever the range is 0; dividing by 1 there
    # yields 0 rather than NaN.
    value_range[value_range == 0] = 1
    return (values - lowest) / value_range