[Model] fix PinSAGE example for 0.5 release (#2072)

28deee4d · Quan (Andy) Gan · GitHub · dba8c825 · 28deee4d · 28deee4d
Unverified Commit 28deee4d authored Aug 20, 2020 by Quan (Andy) Gan Committed by GitHub Aug 20, 2020
5 changed files
--- a/examples/pytorch/pinsage/builder.py
+++ b/examples/pytorch/pinsage/builder.py
@@ -106,7 +106,7 @@ class PandasGraphBuilder(object):
        dsttype = self.entity_pk_to_name[destination_key]
        etype = (srctype, name, dsttype)
        self.relation_name_to_etype[name] = etype
-        self.edges_per_relation[etype] = (src.cat.codes.values, dst.cat.codes.values)
+        self.edges_per_relation[etype] = (src.cat.codes.values.astype('int64'), dst.cat.codes.values.astype('int64'))
        self.relation_tables[name] = relation_table
        self.relation_src_key[name] = source_key
        self.relation_dst_key[name] = destination_key

--- a/examples/pytorch/pinsage/data_utils.py
+++ b/examples/pytorch/pinsage/data_utils.py
@@ -8,7 +8,7 @@ import dask.dataframe as dd
 # This is the train-test split method most of the recommender system papers running on MovieLens
 # takes.  It essentially follows the intuition of "training on the past and predict the future".
 # One can also change the threshold to make validation and test set take larger proportions.
-def train_test_split_by_time(df, timestamp, item):
+def train_test_split_by_time(df, timestamp, user):
    df['train_mask'] = np.ones((len(df),), dtype=np.bool)
    df['val_mask'] = np.zeros((len(df),), dtype=np.bool)
    df['test_mask'] = np.zeros((len(df),), dtype=np.bool)
@@ -22,8 +22,8 @@ def train_test_split_by_time(df, timestamp, item):
            df.iloc[-2, -3] = False
            df.iloc[-2, -2] = True
        return df
-    df = df.groupby(item).apply(train_test_split).compute(scheduler='processes').sort_index()
-    print(df[df[item] == df[item].unique()[0]].sort_values(timestamp))
+    df = df.groupby(user, group_keys=False).apply(train_test_split).compute(scheduler='processes').sort_index()
+    print(df[df[user] == df[user].unique()[0]].sort_values(timestamp))
    return df['train_mask'].to_numpy().nonzero()[0], \
           df['val_mask'].to_numpy().nonzero()[0], \
           df['test_mask'].to_numpy().nonzero()[0]

--- a/examples/pytorch/pinsage/process_movielens1m.py
+++ b/examples/pytorch/pinsage/process_movielens1m.py
@@ -116,10 +116,11 @@ if __name__ == '__main__':
    # Train-validation-test split
    # This is a little bit tricky as we want to select the last interaction for test, and the
    # second-to-last interaction for validation.
-    train_indices, val_indices, test_indices = train_test_split_by_time(ratings, 'timestamp', 'movie_id')
+    train_indices, val_indices, test_indices = train_test_split_by_time(ratings, 'timestamp', 'user_id')

    # Build the graph with training interactions only.
    train_g = build_train_graph(g, train_indices, 'user', 'movie', 'watched', 'watched-by')
+    assert train_g.out_degrees(etype='watched').min() > 0

    # Build the user-item sparse matrix for validation and test set.
    val_matrix, test_matrix = build_val_test_matrix(g, val_indices, test_indices, 'user', 'movie', 'watched')

--- a/examples/pytorch/pinsage/process_nowplaying_rs.py
+++ b/examples/pytorch/pinsage/process_nowplaying_rs.py
@@ -53,8 +53,9 @@ if __name__ == '__main__':
    g.edges['listened-by'].data['created_at'] = torch.LongTensor(events['created_at'].values)

    n_edges = g.number_of_edges('listened')
-    train_indices, val_indices, test_indices = train_test_split_by_time(events, 'created_at', 'track_id')
+    train_indices, val_indices, test_indices = train_test_split_by_time(events, 'created_at', 'user_id')
    train_g = build_train_graph(g, train_indices, 'user', 'track', 'listened', 'listened-by')
+    assert train_g.out_degrees(etype='listened').min() > 0
    val_matrix, test_matrix = build_val_test_matrix(
        g, val_indices, test_indices, 'user', 'track', 'listened')


--- a/examples/pytorch/pinsage/sampler.py
+++ b/examples/pytorch/pinsage/sampler.py
@@ -68,12 +68,10 @@ class NeighborSampler(object):
        # connections only.
        pos_graph = dgl.graph(
            (heads, tails),
-            num_nodes=self.g.number_of_nodes(self.item_type),
-            ntype=self.item_type)
+            num_nodes=self.g.number_of_nodes(self.item_type))
        neg_graph = dgl.graph(
            (heads, neg_tails),
-            num_nodes=self.g.number_of_nodes(self.item_type),
-            ntype=self.item_type)
+            num_nodes=self.g.number_of_nodes(self.item_type))
        pos_graph, neg_graph = dgl.compact_graphs([pos_graph, neg_graph])
        seeds = pos_graph.ndata[dgl.NID]