Unverified Commit d4b5ddcd authored by Mingbang Wang, committed by GitHub

[Example] modify dgl rgcn example to make it consistent with graphbolt's (#6882)

parent 397b7599
@@ -25,12 +25,10 @@ Below results are roughly collected from an AWS EC2 **g4dn.metal**, 384GB RAM, 9
### Accuracies
```
Final performance:
All runs:
Highest Train: 83.22 ± 0.00
Highest Valid: 48.25 ± 0.20
Final Train: 68.45 ± 9.81
Final Test: 47.51 ± 0.19
Epoch: 01, Loss: 2.3625, Valid: 48.25%, Test: 47.91%, Time 86.0210
Epoch: 02, Loss: 1.5852, Valid: 48.56%, Test: 46.98%, Time 84.2728
Epoch: 03, Loss: 1.1974, Valid: 45.99%, Test: 44.05%, Time 85.7916
Test accuracy 44.1165
```
## Run on `ogb-lsc-mag240m` dataset
......
@@ -46,9 +46,11 @@ main
└───> EntityClassify.evaluate
"""
import argparse
import itertools
import sys
import time
import dgl
import dgl.nn as dglnn
@@ -56,7 +58,7 @@ import numpy as np
import psutil
import torch as th
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl import AddReverse, Compose, ToSimple
@@ -83,21 +85,22 @@ def prepare_data(args, device):
# Apply transformation to the graph.
# - "ToSimple()" removes multi-edge between two nodes.
# - "AddReverse()" adds reverse edges to the graph.
print("Start to transform graph. This may take a while...")
transform = Compose([ToSimple(), AddReverse()])
g = transform(g)
else:
dataset = MAG240MDataset(root=args.rootdir)
(g,), _ = dgl.load_graphs(args.graph_path)
g = g.formats(["csc"])
labels = th.as_tensor(dataset.paper_label).long()
labels = torch.as_tensor(dataset.paper_label).long()
# As feature data is too large to fit in memory, we read it from disk.
feats["paper"] = th.as_tensor(
feats["paper"] = torch.as_tensor(
np.load(args.paper_feature_path, mmap_mode="r+")
)
feats["author"] = th.as_tensor(
feats["author"] = torch.as_tensor(
np.load(args.author_feature_path, mmap_mode="r+")
)
feats["institution"] = th.as_tensor(
feats["institution"] = torch.as_tensor(
np.load(args.inst_feature_path, mmap_mode="r+")
)
print(f"Loaded graph: {g}")
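For reference, a minimal sketch of what the `Compose([ToSimple(), AddReverse()])` transform above does, applied to a made-up toy heterograph (the node/edge types and tensors below are illustrative only, not taken from the example):

```
import torch

import dgl
from dgl import AddReverse, Compose, ToSimple

# Toy heterograph with a duplicated ("paper", "cites", "paper") edge.
g = dgl.heterograph(
    {
        ("paper", "cites", "paper"): (
            torch.tensor([0, 0, 1]),
            torch.tensor([1, 1, 2]),
        )
    }
)

transform = Compose([ToSimple(), AddReverse()])
g = transform(g)

# ToSimple() collapses the duplicate 0->1 edge; AddReverse() adds a
# ("paper", "rev_cites", "paper") edge type holding the reversed edges.
print(g.canonical_etypes)
print(g.num_edges("cites"))  # 2 after de-duplication
```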
@@ -356,57 +359,6 @@ class EntityClassify(nn.Module):
return h
class Logger(object):
r"""
This class was taken directly from the PyG implementation and can be found
here: https://github.com/snap-stanford/ogb/blob/master/examples/nodeproppred/mag/logger.py
This was done to ensure that performance was measured in precisely the same
way
"""
def __init__(self, runs):
self.results = [[] for _ in range(runs)]
def add_result(self, run, result):
assert len(result) == 3
assert run >= 0 and run < len(self.results)
self.results[run].append(result)
def print_statistics(self, run=None):
if run is not None:
result = 100 * th.tensor(self.results[run])
argmax = result[:, 1].argmax().item()
print(f"Run {run + 1:02d}:")
print(f"Highest Train: {result[:, 0].max():.2f}")
print(f"Highest Valid: {result[:, 1].max():.2f}")
print(f" Final Train: {result[argmax, 0]:.2f}")
print(f" Final Test: {result[argmax, 2]:.2f}")
else:
result = 100 * th.tensor(self.results)
best_results = []
for r in result:
train1 = r[:, 0].max().item()
valid = r[:, 1].max().item()
train2 = r[r[:, 1].argmax(), 0].item()
test = r[r[:, 1].argmax(), 2].item()
best_results.append((train1, valid, train2, test))
best_result = th.tensor(best_results)
print("All runs:")
r = best_result[:, 0]
print(f"Highest Train: {r.mean():.2f} ± {r.std():.2f}")
r = best_result[:, 1]
print(f"Highest Valid: {r.mean():.2f} ± {r.std():.2f}")
r = best_result[:, 2]
print(f" Final Train: {r.mean():.2f} ± {r.std():.2f}")
r = best_result[:, 3]
print(f" Final Test: {r.mean():.2f} ± {r.std():.2f}")
def extract_node_features(name, g, input_nodes, node_embed, feats, device):
"""Extract the node features from embedding layer or raw features."""
if name == "ogbn-mag":
@@ -440,17 +392,16 @@ def train(
train_loader,
split_idx,
labels,
logger,
device,
run,
):
print("start training...")
print("Start training...")
category = "paper"
# Typically, the best Validation performance is obtained after
# the 1st or 2nd epoch. This is why the max epoch is set to 3.
for epoch in range(3):
num_train = split_idx["train"][category].shape[0]
t0 = time.time()
model.train()
total_loss = 0
@@ -482,19 +433,10 @@ def train(
total_loss += loss.item() * batch_size
t1 = time.time()
loss = total_loss / num_train
# Evaluate the model on the train/val/test set.
train_acc = evaluate(
dataset,
g,
feats,
model,
node_embed,
labels,
device,
split_idx["train"],
)
# Evaluate the model on the val/test set.
valid_acc = evaluate(
dataset,
g,
@@ -517,20 +459,16 @@ def train(
split_idx[test_key],
save_test_submission=(dataset == "ogb-lsc-mag240m"),
)
logger.add_result(run, (train_acc, valid_acc, test_acc))
print(
f"Run: {run + 1:02d}, "
f"Epoch: {epoch +1 :02d}, "
f"Loss: {loss:.4f}, "
f"Train: {100 * train_acc:.2f}%, "
f"Valid: {100 * valid_acc:.2f}%, "
f"Test: {100 * test_acc:.2f}%"
f"Test: {100 * test_acc:.2f}%, "
f"Time {t1 - t0:.4f}"
)
return logger
@th.no_grad()
@torch.no_grad()
def evaluate(
dataset,
g,
@@ -580,9 +518,9 @@ def evaluate(
y_hats.append(y_hat.cpu())
y_true.append(labels[seeds["paper"].cpu()])
y_pred = th.cat(y_hats, dim=0)
y_true = th.cat(y_true, dim=0)
y_true = th.unsqueeze(y_true, 1)
y_pred = torch.cat(y_hats, dim=0)
y_true = torch.cat(y_true, dim=0)
y_true = torch.unsqueeze(y_true, 1)
if dataset == "ogb-lsc-mag240m":
y_pred = y_pred.view(-1)
@@ -596,10 +534,9 @@ def evaluate(
def main(args):
device = "cuda:0" if th.cuda.is_available() and args.num_gpus > 0 else "cpu"
# Initialize a logger.
logger = Logger(args.runs)
device = (
"cuda:0" if torch.cuda.is_available() and args.num_gpus > 0 else "cpu"
)
# Prepare the data.
g, labels, num_classes, split_idx, train_loader, feats = prepare_data(
@@ -625,7 +562,6 @@ def main(args):
f"{sum(p.numel() for p in model.parameters())}"
)
for run in range(args.runs):
try:
if embed_layer is not None:
embed_layer.reset_parameters()
@@ -652,7 +588,7 @@ def main(args):
model.parameters(),
[] if embed_layer is None else embed_layer.parameters(),
)
optimizer = th.optim.Adam(all_params, lr=0.01)
optimizer = torch.optim.Adam(all_params, lr=0.01)
# `expected_max` is the number of physical cores on your machine.
# The `logical` parameter, when set to False, ensures that the count
@@ -665,7 +601,7 @@ def main(args):
f"cores, please set any number less than {expected_max}",
file=sys.stderr,
)
logger = train(
train(
args.dataset,
g,
feats,
@@ -675,14 +611,23 @@ def main(args):
train_loader,
split_idx,
labels,
logger,
device,
run,
)
logger.print_statistics(run)
print("Final performance: ")
logger.print_statistics()
print("Testing...")
test_key = "test" if args.dataset == "ogbn-mag" else "test-dev"
test_acc = evaluate(
args.dataset,
g,
feats,
model,
embed_layer,
labels,
device,
split_idx[test_key],
save_test_submission=(args.dataset == "ogb-lsc-mag240m"),
)
print(f"Test accuracy {test_acc*100:.4f}")
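One detail worth calling out from `main()` above: a single Adam optimizer is created over the model's parameters chained together with those of the optional embedding layer via `itertools.chain` (which is why `itertools` is newly imported). A minimal sketch of that pattern with stand-in `nn.Linear` modules (placeholders, not the example's actual RGCN model):

```
import itertools

import torch
import torch.nn as nn

# Stand-in modules; in the example these would be the RGCN model and the
# optional node-embedding layer (which may be None when raw features exist).
model = nn.Linear(8, 4)
embed_layer = nn.Linear(8, 8)

all_params = itertools.chain(
    model.parameters(),
    [] if embed_layer is None else embed_layer.parameters(),
)
optimizer = torch.optim.Adam(all_params, lr=0.01)

# A single optimizer step updates the parameters of both modules.
loss = model(embed_layer(torch.randn(2, 8))).sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()
```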
if __name__ == "__main__":
@@ -699,12 +644,6 @@ if __name__ == "__main__":
default=0,
help="Number of GPUs. Use 0 for CPU training.",
)
parser.add_argument(
"--runs",
type=int,
default=5,
help="Number of runs. Each run will train the model from scratch.",
)
parser.add_argument(
"--num_workers",
type=int,
@@ -714,7 +653,7 @@
parser.add_argument(
"--rootdir",
type=str,
default="./",
default="./dataset/",
help="Directory to download the OGB dataset.",
)
parser.add_argument(
......