Unverified Commit 7e687d5d authored by LastWhisper, committed by GitHub

[Graphbolt] fix evaluate bug and update readme in link prediction example. (#6367)


Co-authored-by: Hongzhi (Steve), Chen <chenhongzhi.nkcs@gmail.com>
parent 614401ff
## How to make your dataset?
Use the example provided at https://ogb.stanford.edu/docs/linkprop/ to download the ogbl-citation2 dataset:
```python
import torch
from ogb.linkproppred import LinkPropPredDataset
dataset_name = "ogbl-citation2"
# Set the download directory
data_root = "./dataset"
# Download.
dataset = LinkPropPredDataset(name=dataset_name, root=data_root)
```
After running the code above, navigate to the respective dataset folder and look for `ogbl_citation2`; all the data you need can be found there. Below is the `metadata.yaml` file we're currently using:
```yaml
dataset_name: ogbl_citation2
graph:
  nodes:
    - num: 2927963
  edges:
    - format: csv
      path: edges/cite.csv
feature_data:
  - domain: node
    type: null
    name: feat
    format: numpy
    in_memory: true
    path: data/node-feat.npy
  - domain: node
    type: null
    name: year
    format: numpy
    in_memory: true
    path: data/node-year.npy
tasks:
  - name: "link_prediction"
    num_classes: 2
    train_set:
      - type_name: null
        data:
          # (n, 2)
          - name: node_pairs
            format: numpy
            path: set/train_node_pairs.npy
            in_memory: true
    validation_set:
      - type_name: null
        data:
          - name: node_pairs
            format: numpy
            path: set/valid_node_pairs.npy
            in_memory: true
          - name: negative_dsts
            format: numpy
            path: set/valid_negative_dsts.npy
            in_memory: true
    test_set:
      - type_name: null
        data:
          - name: node_pairs
            format: numpy
            path: set/test_node_pairs.npy
            in_memory: true
          - name: negative_dsts
            format: numpy
            path: set/test_negative_dsts.npy
            in_memory: true
```
You'll need to convert the **raw dataset** into this structure and place it in a folder of your choice. The final file layout should look like this:
```
.
├── data
│   ├── node-feat.npy
│   └── node-year.npy
├── edges
│   └── cite.csv
├── metadata.yaml
└── set
    ├── test_negative_dsts.npy
    ├── test_node_pairs.npy
    ├── train_node_pairs.npy
    ├── valid_negative_dsts.npy
    └── valid_node_pairs.npy
```
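The conversion itself is mostly a matter of pulling arrays out of OGB's split dictionary and saving them as `.npy` files. The sketch below uses small synthetic arrays as stand-ins for the real splits; in practice you would take them from `dataset.get_edge_split()`, whose entries carry `source_node`, `target_node`, and (for valid/test) `target_node_neg` fields per the OGB documentation. The file names match the `metadata.yaml` layout above.

```python
import os
import numpy as np

# Synthetic stand-ins for the arrays OGB's get_edge_split() returns.
# Real usage: split = LinkPropPredDataset(name="ogbl-citation2").get_edge_split()
rng = np.random.default_rng(0)
num_nodes, num_train, num_eval, num_neg = 100, 50, 10, 20
split = {
    "train": {
        "source_node": rng.integers(0, num_nodes, num_train),
        "target_node": rng.integers(0, num_nodes, num_train),
    },
    "valid": {
        "source_node": rng.integers(0, num_nodes, num_eval),
        "target_node": rng.integers(0, num_nodes, num_eval),
        "target_node_neg": rng.integers(0, num_nodes, (num_eval, num_neg)),
    },
}
split["test"] = split["valid"]  # same layout as the validation split

os.makedirs("set", exist_ok=True)
for name in ["train", "valid", "test"]:
    s = split[name]
    # Stack (src, dst) into the (n, 2) node_pairs arrays metadata.yaml expects.
    pairs = np.stack([s["source_node"], s["target_node"]], axis=1)
    np.save(f"set/{name}_node_pairs.npy", pairs)
    # Only valid/test carry pre-sampled negative destinations.
    if "target_node_neg" in s:
        np.save(f"set/{name}_negative_dsts.npy", s["target_node_neg"])
```

The same pattern applies to `data/node-feat.npy` and `edges/cite.csv`: save the node feature matrix with `np.save` and the edge list as a headerless two-column CSV.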
## How to run the code?
```bash
python link_prediction.py
```
Results (10 epochs):
```
Valid MRR 0.7040
Test MRR 0.7043
```
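For reference, the MRR reported above is the mean reciprocal rank of each positive destination scored against its list of negative destinations. A minimal sketch of that computation (my own simplified version, not the OGB `Evaluator` code, which also handles score ties more carefully) looks like this:

```python
import numpy as np

def mrr(pos_score, neg_score):
    """pos_score: (n,) scores of positive pairs.
    neg_score: (n, k) scores of k negative destinations per pair."""
    # Rank of each positive among its own negatives (1 = best).
    rank = (neg_score >= pos_score[:, None]).sum(axis=1) + 1
    return float((1.0 / rank).mean())

# Row 0: the positive beats all negatives (rank 1).
# Row 1: one negative outscores the positive (rank 2).
pos = np.array([0.9, 0.5])
neg = np.array([[0.1, 0.2, 0.3], [0.7, 0.2, 0.1]])
print(mrr(pos, neg))  # (1/1 + 1/2) / 2 = 0.75
```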
@@ -139,6 +139,22 @@ def create_dataloader(args, graph, features, itemset, is_train=True):
############################################################################
datapipe = datapipe.sample_neighbor(graph, args.fanout)
############################################################################
# [Input]:
# 'gb.exclude_seed_edges': Function to exclude seed edges, optionally
# including their reverse edges, from the sampled subgraphs in the
# minibatch.
# [Output]:
# A MiniBatchTransformer object with excluded seed edges.
# [Role]:
# During the training phase of link prediction, negative edges are
# sampled. It's essential to exclude the seed edges from the process
# to ensure that positive samples are not inadvertently included within
# the negative samples.
############################################################################
if is_train:
datapipe = datapipe.transform(gb.exclude_seed_edges)
############################################################################
# [Input]:
# 'features': The node features.
@@ -200,27 +216,13 @@ def to_binary_link_dgl_computing_pack(data: gb.MiniBatch):
return (node_pairs, labels.float())
# TODO[Keli]: Remove this helper function later.
def to_dgl_blocks(sampled_subgraphs: gb.SampledSubgraphImpl):
"""Convert sampled subgraphs to DGL blocks."""
blocks = [
dgl.create_block(
sampled_subgraph.node_pairs,
num_src_nodes=sampled_subgraph.original_row_node_ids.shape[0],
num_dst_nodes=sampled_subgraph.original_column_node_ids.shape[0],
)
for sampled_subgraph in sampled_subgraphs
]
return blocks
@torch.no_grad()
def evaluate(args, graph, features, itemset, model):
evaluator = Evaluator(name="ogbl-citation2")
# Since we need to evaluate the model, we need to set the number
# of layers to 3 and the fanout to -1.
args.fanout = [-1] * 3
dataloader = create_dataloader(
args, graph, features, itemset, is_train=False
)
@@ -232,7 +234,7 @@ def evaluate(args, graph, features, itemset, model):
# Unpack MiniBatch.
compacted_pairs, _ = to_binary_link_dgl_computing_pack(data)
node_feature = data.node_features["feat"].float()
blocks = data.to_dgl_blocks()
# Get the embeddings of the input nodes.
y = model(blocks, node_feature)
@@ -270,7 +272,7 @@ def train(args, graph, features, train_set, valid_set, model):
compacted_pairs, labels = to_binary_link_dgl_computing_pack(data)
node_feature = data.node_features["feat"].float()
# Convert sampled subgraphs to DGL blocks.
blocks = data.to_dgl_blocks()
# Get the embeddings of the input nodes.
y = model(blocks, node_feature)
@@ -292,6 +294,8 @@ def train(args, graph, features, train_set, valid_set, model):
f"Loss {(total_loss) / (step + 1):.4f}",
end="\n",
)
if step + 1 == args.early_stop:
break
# Evaluate the model.
print("Validation")
@@ -306,6 +310,12 @@ def parse_args():
parser.add_argument("--neg-ratio", type=int, default=1)
parser.add_argument("--batch-size", type=int, default=512)
parser.add_argument("--num-workers", type=int, default=4)
parser.add_argument(
"--early-stop",
type=int,
default=0,
help="0 disables early stopping; otherwise training stops at the given step",
)
parser.add_argument(
"--fanout",
type=str,