Improving the PinSAGE example. (#6067)

ec7137dd · Andrei Ivanov · GitHub · 562a1c87 · ec7137dd · ec7137dd
Unverified commit ec7137dd, authored Aug 02, 2023 by Andrei Ivanov; committed by GitHub on Aug 03, 2023.
Showing 2 changed files with 23 additions and 27 deletions.

examples/pytorch/pinsage/data_utils.py +12 -1

examples/pytorch/pinsage/process_movielens1m.py +11 -26

No files found.
--- a/examples/pytorch/pinsage/data_utils.py
+++ b/examples/pytorch/pinsage/data_utils.py
@@ -26,9 +26,20 @@ def train_test_split_by_time(df, timestamp, user):
            df.iloc[-2, -2] = True
        return df

+    meta_df = {
+        "user_id": np.int64,
+        "movie_id": np.int64,
+        "rating": np.int64,
+        "timestamp": np.int64,
+        "user_id": np.int64,
+        "train_mask": bool,
+        "val_mask": bool,
+        "test_mask": bool,
+    }
+
    df = (
        df.groupby(user, group_keys=False)
-        .apply(train_test_split)
+        .apply(train_test_split, meta=meta_df)
        .compute(scheduler="processes")
        .sort_index()
    )

--- a/examples/pytorch/pinsage/process_movielens1m.py
+++ b/examples/pytorch/pinsage/process_movielens1m.py
@@ -113,37 +113,22 @@ if __name__ == "__main__":

    # Assign features.
    # Note that variable-sized features such as texts or images are handled elsewhere.
-    g.nodes["user"].data["gender"] = torch.LongTensor(
-        users["gender"].cat.codes.values
-    )
-    g.nodes["user"].data["age"] = torch.LongTensor(
-        users["age"].cat.codes.values
-    )
-    g.nodes["user"].data["occupation"] = torch.LongTensor(
-        users["occupation"].cat.codes.values
-    )
-    g.nodes["user"].data["zip"] = torch.LongTensor(
-        users["zip"].cat.codes.values
+    for data_type in ["gender", "age", "occupation", "zip"]:
+        g.nodes["user"].data[data_type] = torch.LongTensor(
+            np.array(users[data_type].cat.codes.values)
        )

    g.nodes["movie"].data["year"] = torch.LongTensor(
-        movies["year"].cat.codes.values
+        np.array(movies["year"].cat.codes.values)
    )
    g.nodes["movie"].data["genre"] = torch.FloatTensor(
-        movies[genre_columns].values
+        np.array(movies[genre_columns].values)
    )

-    g.edges["watched"].data["rating"] = torch.LongTensor(
-        ratings["rating"].values
-    )
-    g.edges["watched"].data["timestamp"] = torch.LongTensor(
-        ratings["timestamp"].values
-    )
-    g.edges["watched-by"].data["rating"] = torch.LongTensor(
-        ratings["rating"].values
-    )
-    g.edges["watched-by"].data["timestamp"] = torch.LongTensor(
-        ratings["timestamp"].values
+    for edge_type in ["watched", "watched-by"]:
+        for data_type in ["rating", "timestamp"]:
+            g.edges[edge_type].data[data_type] = torch.LongTensor(
+                np.array(ratings[data_type].values)
            )

    # Train-validation-test split