[GraphBolt] Update multi-gpu example for regression benchmark. (#6519)

f5c8c0b5 · Ramon Zhou · GitHub · 84a01a16 · f5c8c0b5
Unverified Commit f5c8c0b5 authored Nov 06, 2023 by Ramon Zhou Committed by GitHub Nov 06, 2023
Hide whitespace changes
Inline Side-by-side

Showing with 17 additions and 3 deletions

examples/multigpu/graphbolt/node_classification.py examples/multigpu/graphbolt/node_classification.py +17 -3

No files found.
--- a/examples/multigpu/graphbolt/node_classification.py
+++ b/examples/multigpu/graphbolt/node_classification.py
@@ -38,6 +38,7 @@ main
 """
 import argparse
 import os
+import time
 import dgl.graphbolt as gb
 import dgl.nn as dglnn
@@ -138,7 +139,9 @@ def create_dataloader(
    # A CopyTo object copying data in the datapipe to a specified device.\
    ############################################################################
    datapipe = datapipe.copy_to(device)
-    dataloader = gb.SingleProcessDataLoader(datapipe)
+    dataloader = gb.MultiProcessDataLoader(
+        datapipe, num_workers=args.num_workers
+    )
    # Return the fully-initialized DataLoader object.
    return dataloader
@@ -204,6 +207,8 @@ def train(
    )
    for epoch in range(args.epochs):
+        epoch_start = time.time()
        model.train()
        total_loss = torch.tensor(0, dtype=torch.float).to(device)
        ########################################################################
@@ -273,11 +278,14 @@ def train(
        dist.reduce(tensor=acc, dst=0)
        total_loss /= step + 1
        dist.reduce(tensor=total_loss, dst=0)
+        epoch_end = time.time()
        if rank == 0:
            print(
                f"Epoch {epoch:05d} | "
                f"Average Loss {total_loss.item() / world_size:.4f} | "
-                f"Accuracy {acc.item():.4f} "
+                f"Accuracy {acc.item():.4f} | "
+                f"Time {epoch_end - epoch_start:.4f}"
            )
@@ -342,7 +350,7 @@ def run(rank, world_size, args, devices, dataset):
    )
    dist.reduce(tensor=test_acc, dst=0)
    if rank == 0:
-        print(f"Test Accuracy is {test_acc.item():.4f}")
+        print(f"Test Accuracy {test_acc.item():.4f}")
 def parse_args():
@@ -376,6 +384,9 @@ def parse_args():
        help="Fan-out of neighbor sampling. It is IMPORTANT to keep len(fanout)"
        " identical with the number of layers in your model. Default: 15,10,5",
    )
+    parser.add_argument(
+        "--num-workers", type=int, default=0, help="The number of processes."
+    )
    return parser.parse_args()
@@ -393,6 +404,9 @@ if __name__ == "__main__":
    # Load and preprocess dataset.
    dataset = gb.BuiltinDataset("ogbn-products").load()
+    # Thread limiting to avoid resource competition.
+    os.environ["OMP_NUM_THREADS"] = str(mp.cpu_count() // 2 // world_size)
    mp.set_sharing_strategy("file_system")
    mp.spawn(
        run,