Unverified Commit efe28493 authored by yxy235, committed by GitHub

[GraphBolt] Modify examples to use `seeds`. (#7231)


Co-authored-by: Ubuntu <ubuntu@ip-172-31-0-133.us-west-2.compute.internal>
parent b725ee53
......@@ -205,7 +205,7 @@ if __name__ == "__main__":
)
args = parser.parse_args()
dataset = gb.BuiltinDataset("ogbn-products").load()
dataset = gb.BuiltinDataset("ogbn-products-seeds").load()
datamodule = DataModule(
dataset,
[10, 10, 10],
......
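These examples now load the reorganized `*-seeds` builtin datasets. As a minimal, hedged sketch of what that looks like from user code (the item-set keys are assumptions based on this PR, not guaranteed by the diff itself):

import dgl.graphbolt as gb

# Load the reorganized builtin dataset (downloads on first use).
dataset = gb.BuiltinDataset("ogbn-products-seeds").load()
task = dataset.tasks[0]
# Item sets are expected to be keyed on "seeds" (plus "labels" where
# applicable) instead of the old "seed_nodes"/"node_pairs" keys.
print(task.train_set.names)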
......@@ -51,7 +51,7 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
import tqdm
from ogb.linkproppred import Evaluator
from torchmetrics.retrieval import RetrievalMRR
class SAGE(nn.Module):
......@@ -243,43 +243,38 @@ def create_dataloader(args, graph, features, itemset, is_train=True):
@torch.no_grad()
def compute_mrr(args, model, evaluator, node_emb, src, dst, neg_dst):
def compute_mrr(args, model, node_emb, seeds, labels, indexes):
"""Compute the Mean Reciprocal Rank (MRR) for given source and destination
nodes.
This function computes the MRR for a set of node pairs, dividing the task
into batches to handle potentially large graphs.
"""
rr = torch.zeros(src.shape[0])
# Loop over node pairs in batches.
for start in tqdm.trange(
0, src.shape[0], args.eval_batch_size, desc="Evaluate"
):
end = min(start + args.eval_batch_size, src.shape[0])
# Concatenate positive and negative destination nodes.
all_dst = torch.cat([dst[start:end, None], neg_dst[start:end]], 1)
preds = torch.empty(seeds.shape[0])
mrr = RetrievalMRR()
seeds_src, seeds_dst = seeds.T
# The constant is 1001 because the negative sampling ratio in
# `ogbl-citation2` is 1000, i.e. each source node is paired with 1 positive
# and 1000 negative destinations.
eval_size = args.eval_batch_size * 1001
# Loop over node pairs in batches.
for start in tqdm.trange(0, seeds_src.shape[0], eval_size, desc="Evaluate"):
end = min(start + eval_size, seeds_src.shape[0])
# Fetch embeddings for current batch of source and destination nodes.
h_src = node_emb[src[start:end]][:, None, :].to(args.device)
h_dst = (
node_emb[all_dst.view(-1)].view(*all_dst.shape, -1).to(args.device)
)
h_src = node_emb[seeds_src[start:end]].to(args.device)
h_dst = node_emb[seeds_dst[start:end]].to(args.device)
# Compute prediction scores using the model.
pred = model.predictor(h_src * h_dst).squeeze(-1)
# Evaluate the predictions to obtain MRR values.
input_dict = {"y_pred_pos": pred[:, 0], "y_pred_neg": pred[:, 1:]}
rr[start:end] = evaluator.eval(input_dict)["mrr_list"]
return rr.mean()
pred = model.predictor(h_src * h_dst).squeeze()
preds[start:end] = pred
return mrr(preds, labels, indexes=indexes)
@torch.no_grad()
def evaluate(args, model, graph, features, all_nodes_set, valid_set, test_set):
"""Evaluate the model on validation and test sets."""
model.eval()
evaluator = Evaluator(name="ogbl-citation2")
dataloader = create_dataloader(
args, graph, features, all_nodes_set, is_train=False
......@@ -292,13 +287,13 @@ def evaluate(args, model, graph, features, all_nodes_set, valid_set, test_set):
# Loop over both validation and test sets.
for split in [valid_set, test_set]:
# Unpack the item set.
src = split._items[0][:, 0].to(node_emb.device)
dst = split._items[0][:, 1].to(node_emb.device)
neg_dst = split._items[1].to(node_emb.device)
seeds = split._items[0].to(node_emb.device)
labels = split._items[1].to(node_emb.device)
indexes = split._items[2].to(node_emb.device)
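# Assumed layout of the unpacked item set: `seeds` is an (N, 2) tensor of
# (src, dst) pairs, `labels` marks positive edges with 1, and `indexes`
# groups each positive with its negative samples into one query.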
# Compute MRR values for the current split.
results.append(
compute_mrr(args, model, evaluator, node_emb, src, dst, neg_dst)
compute_mrr(args, model, node_emb, seeds, labels, indexes)
)
return results
......@@ -313,7 +308,8 @@ def train(args, model, graph, features, train_set):
start_epoch_time = time.time()
for step, data in tqdm.tqdm(enumerate(dataloader)):
# Get node pairs with labels for loss calculation.
compacted_pairs, labels = data.node_pairs_with_labels
compacted_seeds = data.compacted_seeds.T
labels = data.labels
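# `compacted_seeds` is assumed to arrive as (batch_size, 2) pairs of
# compacted node IDs; the transpose above makes row 0 the source IDs and
# row 1 the destination IDs used when computing the logits below.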
node_feature = data.node_features["feat"]
blocks = data.blocks
......@@ -321,7 +317,7 @@ def train(args, model, graph, features, train_set):
# Get the embeddings of the input nodes.
y = model(blocks, node_feature)
logits = model.predictor(
y[compacted_pairs[0]] * y[compacted_pairs[1]]
y[compacted_seeds[0]] * y[compacted_seeds[1]]
).squeeze()
# Compute loss.
......@@ -389,7 +385,7 @@ def main(args):
# Load and preprocess dataset.
print("Loading data")
dataset = gb.BuiltinDataset("ogbl-citation2").load()
dataset = gb.BuiltinDataset("ogbl-citation2-seeds").load()
# Move the dataset to the selected storage.
if args.storage_device == "pinned":
......
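The link prediction example now scores MRR with torchmetrics instead of the OGB Evaluator. A tiny self-contained sketch (toy numbers, not data from the example) of how `RetrievalMRR` consumes the `preds`, `labels`, and `indexes` that `compute_mrr` assembles above:

import torch
from torchmetrics.retrieval import RetrievalMRR

preds = torch.tensor([0.9, 0.2, 0.1, 0.3, 0.8, 0.4])  # scores for two queries
labels = torch.tensor([1, 0, 0, 0, 1, 0])  # 1 marks the positive edge
indexes = torch.tensor([0, 0, 0, 1, 1, 1])  # query id for each prediction
print(RetrievalMRR()(preds, labels, indexes=indexes))  # tensor(1.) for this toy input

`indexes` groups each positive edge with its negative samples into one query, and the metric averages the reciprocal rank of the positive within each group, mirroring the OGB MRR protocol.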
......@@ -101,7 +101,7 @@ def create_dataloader(
# ensures that the rest of the operations run on the GPU.
############################################################################
if args.storage_device != "cpu":
datapipe = datapipe.copy_to(device=device, extra_attrs=["seed_nodes"])
datapipe = datapipe.copy_to(device=device, extra_attrs=["seeds"])
############################################################################
# [Step-3]:
......@@ -364,8 +364,12 @@ def parse_args():
parser.add_argument(
"--dataset",
type=str,
default="ogbn-products",
choices=["ogbn-arxiv", "ogbn-products", "ogbn-papers100M"],
default="ogbn-products-seeds",
choices=[
"ogbn-arxiv-seeds",
"ogbn-products-seeds",
"ogbn-papers100M-seeds",
],
help="The dataset we can use for node classification example. Currently"
" ogbn-products, ogbn-arxiv, ogbn-papers100M datasets are supported.",
)
......
......@@ -208,8 +208,9 @@ def main():
parser.add_argument(
"--dataset",
type=str,
default="ogbn-products",
help='Name of the dataset to use (e.g., "ogbn-products", "ogbn-arxiv")',
default="ogbn-products-seeds",
help='Name of the dataset to use (e.g., "ogbn-products-seeds",'
+ ' "ogbn-arxiv-seeds")',
)
parser.add_argument(
"--epochs", type=int, default=10, help="Number of training epochs."
......
......@@ -175,7 +175,7 @@ def create_dataloader(
)
# Copy the data to the specified device.
if args.graph_device != "cpu":
datapipe = datapipe.copy_to(device=device, extra_attrs=["seed_nodes"])
datapipe = datapipe.copy_to(device=device, extra_attrs=["seeds"])
# Sample neighbors for each node in the mini-batch.
datapipe = getattr(datapipe, args.sample_mode)(
graph, fanout if job != "infer" else [-1]
......@@ -320,8 +320,12 @@ def parse_args():
parser.add_argument(
"--dataset",
type=str,
default="ogbn-products",
choices=["ogbn-arxiv", "ogbn-products", "ogbn-papers100M"],
default="ogbn-products-seeds",
choices=[
"ogbn-arxiv-seeds",
"ogbn-products-seeds",
"ogbn-papers100M-seeds",
],
help="The dataset we can use for node classification example. Currently"
" ogbn-products, ogbn-arxiv, ogbn-papers100M datasets are supported.",
)
......
......@@ -85,7 +85,8 @@ def evaluate(model, dataset, device):
labels = []
for step, data in enumerate(dataloader):
# Get node pairs with labels for loss calculation.
compacted_pairs, label = data.node_pairs_with_labels
compacted_seeds = data.compacted_seeds.T
label = data.labels
# The features of sampled nodes.
x = data.node_features["feat"]
......@@ -94,7 +95,7 @@ def evaluate(model, dataset, device):
y = model(data.blocks, x)
logit = (
model.predictor(
y[compacted_pairs[0].long()] * y[compacted_pairs[1].long()]
y[compacted_seeds[0].long()] * y[compacted_seeds[1].long()]
)
.squeeze()
.detach()
......@@ -126,7 +127,8 @@ def train(model, dataset, device):
########################################################################
for step, data in enumerate(dataloader):
# Get node pairs with labels for loss calculation.
compacted_pairs, labels = data.node_pairs_with_labels
compacted_seeds = data.compacted_seeds.T
labels = data.labels
# The features of sampled nodes.
x = data.node_features["feat"]
......@@ -134,7 +136,7 @@ def train(model, dataset, device):
# Forward.
y = model(data.blocks, x)
logits = model.predictor(
y[compacted_pairs[0].long()] * y[compacted_pairs[1].long()]
y[compacted_seeds[0].long()] * y[compacted_seeds[1].long()]
).squeeze()
# Compute loss.
......@@ -156,7 +158,7 @@ if __name__ == "__main__":
# Load and preprocess dataset.
print("Loading data...")
dataset = gb.BuiltinDataset("cora").load()
dataset = gb.BuiltinDataset("cora-seeds").load()
# If a CUDA device is selected, we pin the graph and the features so that
# the GPU can access them.
......
......@@ -18,7 +18,7 @@ def create_dataloader(dataset, itemset, device):
datapipe = gb.ItemSampler(itemset, batch_size=16)
# Copy the mini-batch to the designated device for sampling and training.
datapipe = datapipe.copy_to(device, extra_attrs=["seed_nodes"])
datapipe = datapipe.copy_to(device, extra_attrs=["seeds"])
# Sample neighbors for the seed nodes.
datapipe = datapipe.sample_neighbor(dataset.graph, fanouts=[4, 2])
......@@ -117,7 +117,7 @@ if __name__ == "__main__":
# Load and preprocess dataset.
print("Loading data...")
dataset = gb.BuiltinDataset("cora").load()
dataset = gb.BuiltinDataset("cora-seeds").load()
# If a CUDA device is selected, we pin the graph and the features so that
# the GPU can access them.
......
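For orientation, a hedged sketch of how such a datapipe is typically completed in these examples; the feature key "feat", the `fetch_feature` step, and the final `DataLoader` wrapper are assumptions, not part of this diff:

import dgl.graphbolt as gb

def create_dataloader(dataset, itemset, device):
    # Sample seed items in mini-batches of 16.
    datapipe = gb.ItemSampler(itemset, batch_size=16)
    # Copy the mini-batch, including its `seeds`, to the target device.
    datapipe = datapipe.copy_to(device, extra_attrs=["seeds"])
    # Sample a two-hop neighborhood around the seeds.
    datapipe = datapipe.sample_neighbor(dataset.graph, fanouts=[4, 2])
    # Fetch input features for the sampled nodes.
    datapipe = datapipe.fetch_feature(dataset.feature, node_feature_keys=["feat"])
    return gb.DataLoader(datapipe)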
......@@ -117,7 +117,7 @@ def create_dataloader(
# Move the mini-batch to the appropriate device.
# `device`:
# The device to move the mini-batch to.
datapipe = datapipe.copy_to(device, extra_attrs=["seed_nodes"])
datapipe = datapipe.copy_to(device, extra_attrs=["seeds"])
# Sample neighbors for each seed node in the mini-batch.
# `graph`:
......@@ -153,7 +153,7 @@ def extract_embed(node_embed, input_nodes):
def extract_node_features(name, block, data, node_embed, device):
"""Extract the node features from embedding layer or raw features."""
if name == "ogbn-mag":
if name == "ogbn-mag-seeds":
input_nodes = {
k: v.to(device) for k, v in block.srcdata[dgl.NID].items()
}
......@@ -419,8 +419,8 @@ def evaluate(
model.eval()
category = "paper"
# An evaluator for the dataset.
if name == "ogbn-mag":
evaluator = Evaluator(name=name)
if name == "ogbn-mag-seeds":
evaluator = Evaluator(name="ogbn-mag")
else:
evaluator = MAG240MEvaluator()
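# Note: the builtin dataset is now called "ogbn-mag-seeds", but the OGB
# `Evaluator` is keyed by the original OGB name, so "ogbn-mag" is passed above.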
......@@ -578,7 +578,7 @@ def main(args):
# `institution` are generated in advance and stored in the feature store.
# For `ogbn-mag`, we generate the features on the fly.
embed_layer = None
if args.dataset == "ogbn-mag":
if args.dataset == "ogbn-mag-seeds":
# Create the embedding layer and move it to the appropriate device.
embed_layer = rel_graph_embed(g, feat_size).to(device)
print(
......@@ -652,9 +652,9 @@ if __name__ == "__main__":
parser.add_argument(
"--dataset",
type=str,
default="ogbn-mag",
choices=["ogbn-mag", "ogb-lsc-mag240m"],
help="Dataset name. Possible values: ogbn-mag, ogb-lsc-mag240m",
default="ogbn-mag-seeds",
choices=["ogbn-mag-seeds", "ogb-lsc-mag240m"],
help="Dataset name. Possible values: ogbn-mag-seeds, ogb-lsc-mag240m",
)
parser.add_argument("--num_epochs", type=int, default=3)
parser.add_argument("--num_workers", type=int, default=0)
......
......@@ -116,7 +116,7 @@ class SparseNeighborSampler(SubgraphSampler):
def sample_subgraphs(self, seeds, seeds_timestamp=None):
sampled_matrices = []
src = seeds
src = seeds.long()
#####################################################################
# (HIGHLIGHT) Using the sparse sample operator to perform random
......@@ -242,7 +242,7 @@ if __name__ == "__main__":
# Load and preprocess dataset.
print("Loading data")
device = torch.device("cpu" if args.mode == "cpu" else "cuda")
dataset = gb.BuiltinDataset("ogbn-products").load()
dataset = gb.BuiltinDataset("ogbn-products-seeds").load()
g = dataset.graph
features = dataset.feature
......@@ -254,7 +254,7 @@ if __name__ == "__main__":
# Create sparse.
N = g.num_nodes
A = dglsp.from_csc(g.csc_indptr, g.indices, shape=(N, N))
A = dglsp.from_csc(g.csc_indptr.long(), g.indices.long(), shape=(N, N))
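# The `.long()` casts (here and for `seeds` above) are presumably needed
# because `dgl.sparse.from_csc` expects int64 index tensors, while GraphBolt
# may store the graph's indptr/indices in a narrower integer dtype.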
# Model training.
print("Training...")
......
......@@ -996,12 +996,22 @@ class BuiltinDataset(OnDiskDataset):
)
_datasets = [
"cora",
"cora-seeds",
"ogbn-mag",
"ogbn-mag-seeds",
"ogbl-citation2",
"ogbl-citation2-seeds",
"ogbn-products",
"ogbn-products-seeds",
"ogbn-arxiv",
"ogbn-arxiv-seeds",
]
_large_datasets = ["ogb-lsc-mag240m", "ogbn-papers100M"]
_large_datasets = [
"ogb-lsc-mag240m",
"ogb-lsc-mag240m-seeds",
"ogbn-papers100M",
"ogbn-papers100M-seeds",
]
_all_datasets = _datasets + _large_datasets
def __init__(self, name: str, root: str = "datasets") -> OnDiskDataset:
......