# data_utils.py
import torch
import dgl
import numpy as np
import scipy.sparse as ssp

# This is the train-test split method that most recommender system papers evaluating
# on MovieLens take.  It follows the intuition of "train on the past, predict the
# future": for each user, the latest interaction goes to the test set and the second
# latest to the validation set.  One can hold out more interactions per user to make
# the validation and test sets take larger proportions.
def train_test_split_by_time(g, column, etype, itype):
    n_edges = g.number_of_edges(etype)
    with g.local_scope():  # the temporary masks below are discarded on exit
        def splits(edges):
            # ``group_apply_edges`` buckets edges by source node, so the features
            # here are 2-D: (number of users in the bucket, interactions per user).
            num_users, count = edges.data['train_mask'].shape

            # Sort each user's interactions by timestamp.
            _, sorted_idx = edges.data[column].sort(1)

            train_mask = edges.data['train_mask']
            val_mask = edges.data['val_mask']
            test_mask = edges.data['test_mask']

            x = torch.arange(num_users)

            # If a user has more than one interaction, hold out the latest one for testing.
            if count > 1:
                train_mask[x, sorted_idx[:, -1]] = False
                test_mask[x, sorted_idx[:, -1]] = True
            # If a user has more than two interactions, hold out the second latest
            # one for validation.
            if count > 2:
                train_mask[x, sorted_idx[:, -2]] = False
                val_mask[x, sorted_idx[:, -2]] = True
            return {'train_mask': train_mask, 'val_mask': val_mask, 'test_mask': test_mask}

        g.edges[etype].data['train_mask'] = torch.ones(n_edges, dtype=torch.bool)
        g.edges[etype].data['val_mask'] = torch.zeros(n_edges, dtype=torch.bool)
        g.edges[etype].data['test_mask'] = torch.zeros(n_edges, dtype=torch.bool)
        g.nodes[itype].data['count'] = g.in_degrees(etype=etype)
        # NOTE: ``group_apply_edges`` is a DGL 0.4.x API that was removed in later
        # DGL releases, so this function requires an older version of DGL.
        g.group_apply_edges('src', splits, etype=etype)

        train_indices = g.filter_edges(lambda edges: edges.data['train_mask'], etype=etype)
        val_indices = g.filter_edges(lambda edges: edges.data['val_mask'], etype=etype)
        test_indices = g.filter_edges(lambda edges: edges.data['test_mask'], etype=etype)

    return train_indices, val_indices, test_indices
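
# A minimal usage sketch for the splitter above.  It assumes DGL 0.4.x (where
# ``group_apply_edges`` still exists); the type names 'user'/'movie'/'watched'
# and the 'timestamp' field are made up for illustration.
def _demo_train_test_split():
    # Two users: user 0 watched three movies, user 1 watched two.
    edges = [(0, 0), (0, 1), (0, 2), (1, 0), (1, 2)]
    g = dgl.heterograph({
        ('user', 'watched', 'movie'): edges,
        ('movie', 'watched-by', 'user'): [(v, u) for u, v in edges],
    })
    g.edges['watched'].data['timestamp'] = torch.tensor([3, 1, 2, 5, 4])
    # Each user's latest interaction goes to test; the second latest (if the
    # user has at least three) goes to validation.
    train_idx, val_idx, test_idx = train_test_split_by_time(
        g, 'timestamp', 'watched', 'movie')
    return g, train_idx, val_idx, test_idx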

def build_train_graph(g, train_indices, utype, itype, etype, etype_rev):
    # Passing the same indices for both directions assumes that the reverse edge
    # type stores its edges in the same order as the forward edge type.
    train_g = g.edge_subgraph(
        {etype: train_indices, etype_rev: train_indices},
        preserve_nodes=True)
    # Remove the induced node IDs - node representations should be assigned by
    # the model instead.
    del train_g.nodes[utype].data[dgl.NID]
    del train_g.nodes[itype].data[dgl.NID]

    # copy features
    for ntype in g.ntypes:
        for col, data in g.nodes[ntype].data.items():
            train_g.nodes[ntype].data[col] = data
    for et in g.etypes:
        for col, data in g.edges[et].data.items():
            # Index the parent graph's edge features with the induced edge IDs.
            train_g.edges[et].data[col] = data[train_g.edges[et].data[dgl.EID]]

    return train_g
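
# Continuing the sketch above: build the training graph from the demo split.
# The reverse type 'watched-by' was constructed in the same edge order as
# 'watched', which is what ``build_train_graph`` relies on.
def _demo_build_train_graph():
    g, train_idx, _, _ = _demo_train_test_split()
    train_g = build_train_graph(g, train_idx, 'user', 'movie',
                                'watched', 'watched-by')
    # Same node set as ``g``; only the training interactions remain as edges.
    return train_g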

def build_val_test_matrix(g, val_indices, test_indices, utype, itype, etype):
    # Build binary user-by-item interaction matrices for validation and testing.
    n_users = g.number_of_nodes(utype)
    n_items = g.number_of_nodes(itype)
    val_src, val_dst = g.find_edges(val_indices, etype=etype)
    test_src, test_dst = g.find_edges(test_indices, etype=etype)
    val_src = val_src.numpy()
    val_dst = val_dst.numpy()
    test_src = test_src.numpy()
    test_dst = test_dst.numpy()
    val_matrix = ssp.coo_matrix((np.ones_like(val_src), (val_src, val_dst)), (n_users, n_items))
    test_matrix = ssp.coo_matrix((np.ones_like(test_src), (test_src, test_dst)), (n_users, n_items))

    return val_matrix, test_matrix
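
# Sketch of consuming the evaluation matrices from the demo above.  Converting
# to CSR makes per-user row slicing cheap during ranking evaluation.
def _demo_eval_matrices():
    g, _, val_idx, test_idx = _demo_train_test_split()
    val_matrix, test_matrix = build_val_test_matrix(
        g, val_idx, test_idx, 'user', 'movie', 'watched')
    val_csr = val_matrix.tocsr()
    # Row u lists the held-out validation items of user u.
    return val_csr[0].indices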

def linear_normalize(values):
    # Min-max scale each column to [0, 1].  Note that a column whose values are
    # all identical makes the denominator zero.
    return (values - values.min(0, keepdims=True)) / \
        (values.max(0, keepdims=True) - values.min(0, keepdims=True))
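
# Quick illustration of the column-wise min-max scaling above (NumPy input;
# note that a constant column would cause a division by zero):
def _demo_linear_normalize():
    feats = np.array([[0.0, 10.0],
                      [5.0, 20.0],
                      [10.0, 30.0]])
    # Each column is scaled independently to [0, 1]:
    # [[0.  0. ], [0.5 0.5], [1.  1. ]]
    return linear_normalize(feats)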