"src/vscode:/vscode.git/clone" did not exist on "4625f04bc04bed43c1ba9b821149af121b5965ea"
Unverified Commit 7d416086 authored by Quan (Andy) Gan, committed by GitHub

[Bug] Fix multiple issues in distributed multi-GPU GraphSAGE example (#3870)



* fix distributed multi-GPU example device

* try Join

* update version requirement in README

* use model.join

* fix docs
Co-authored-by: Jinjing Zhou <VoVAllen@users.noreply.github.com>
parent 16409ff8
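For context: the core of the fix is wrapping each training loop in PyTorch's `DistributedDataParallel.join()` context manager, so trainers that run out of mini-batches early keep shadowing the collective calls of slower trainers instead of making them hang. The sketch below only illustrates that pattern; the `train` helper, optimizer settings, and dataloader are placeholders, not code from this repository.

```python
# Minimal sketch of the join() pattern adopted by this commit. Assumes the
# default process group is already initialized (e.g. by torchrun) and that
# each trainer owns a single CUDA device.
import torch as th
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel

def train(model, dataloader, device, num_epochs=10):
    model = DistributedDataParallel(model.to(device),
                                    device_ids=[device], output_device=device)
    loss_fcn = nn.CrossEntropyLoss().to(device)
    optimizer = th.optim.Adam(model.parameters(), lr=1e-3)
    for epoch in range(num_epochs):
        # join() lets ranks that exhaust their data early keep participating
        # in gradient synchronization, so uneven batch counts cannot deadlock.
        with model.join():
            for batch_inputs, batch_labels in dataloader:
                batch_inputs = batch_inputs.to(device)
                batch_labels = batch_labels.to(device)
                loss = loss_fcn(model(batch_inputs), batch_labels)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
```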
......@@ -53,6 +53,7 @@ are the same as :ref:`mini-batch training <guide-minibatch>`.
# training loop
for epoch in range(args.num_epochs):
    with model.join():
        for step, blocks in enumerate(dataloader):
            batch_inputs, batch_labels = load_subtensor(g, blocks[0].srcdata[dgl.NID],
                                                        blocks[-1].dstdata[dgl.NID])
......
......@@ -6,6 +6,8 @@ This is an example of training GraphSage in a distributed fashion. Before training
sudo pip3 install ogb
```
**Requires PyTorch 1.10.0+ to work.**
To train GraphSage, follow these five steps:
### Step 0: Setup a Distributed File System
......
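The PyTorch version requirement above could be enforced at startup; the guard below is a hypothetical illustration and not part of the example scripts.

```python
# Hypothetical guard: fail fast if the installed PyTorch predates 1.10.0.
import torch

major, minor = (int(x) for x in torch.__version__.split(".")[:2])
if (major, minor) < (1, 10):
    raise RuntimeError(
        "This example requires PyTorch 1.10.0+, found " + torch.__version__)
```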
......@@ -20,6 +20,7 @@ import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
import socket
def load_subtensor(g, seeds, input_nodes, device, load_feat=True):
"""
......@@ -155,41 +156,11 @@ def evaluate(model, g, inputs, labels, val_nid, test_nid, batch_size, device):
    model.train()
    return compute_acc(pred[val_nid], labels[val_nid]), compute_acc(pred[test_nid], labels[test_nid])
def pad_data(nids, device):
    """
    In a distributed training scenario, we need to make sure that each worker has the same
    number of batches. Otherwise the synchronization (barrier) is called a different number
    of times, which makes the workers with more batches hang.
    This function pads the nids to the same size for all workers, by repeating the head ids
    till the maximum size among all workers.
    """
    import torch.distributed as dist
    # NCCL backend only supports GPU tensors, thus here we need to allocate it on GPU
    num_nodes = th.tensor(nids.numel()).to(device)
    dist.all_reduce(num_nodes, dist.ReduceOp.MAX)
    max_num_nodes = int(num_nodes)
    nids_length = nids.shape[0]
    if max_num_nodes > nids_length:
        pad_size = max_num_nodes % nids_length
        repeat_size = max_num_nodes // nids_length
        new_nids = th.cat([nids for _ in range(repeat_size)] + [nids[:pad_size]], axis=0)
        print("Pad nids from {} to {}".format(nids_length, max_num_nodes))
    else:
        new_nids = nids
    assert new_nids.shape[0] == max_num_nodes
    return new_nids
def run(args, device, data):
    # Unpack data
    train_nid, val_nid, test_nid, in_feats, n_classes, g = data
    shuffle = True
    if args.pad_data:
        train_nid = pad_data(train_nid, device)
        # Current pipeline doesn't support duplicate node id within the same batch
        # Therefore turn off shuffling to avoid potential duplicate node id within the same batch
        shuffle = False
    # Create sampler
    sampler = NeighborSampler(g, [int(fanout) for fanout in args.fan_out.split(',')],
                              dgl.distributed.sample_neighbors, device)
......@@ -209,8 +180,7 @@ def run(args, device, data):
    if args.num_gpus == -1:
        model = th.nn.parallel.DistributedDataParallel(model)
    else:
        dev_id = g.rank() % args.num_gpus
        model = th.nn.parallel.DistributedDataParallel(model, device_ids=[dev_id], output_device=dev_id)
        model = th.nn.parallel.DistributedDataParallel(model, device_ids=[device], output_device=device)
    loss_fcn = nn.CrossEntropyLoss()
    loss_fcn = loss_fcn.to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
......@@ -233,6 +203,8 @@ def run(args, device, data):
        # Loop over the dataloader to sample the computation dependency graph as a list of
        # blocks.
        step_time = []
        with model.join():
            for step, blocks in enumerate(dataloader):
                tic_step = time.time()
                sample_time += tic_step - start
......@@ -249,6 +221,7 @@ def run(args, device, data):
                batch_labels = batch_labels.to(device)
                # Compute loss and prediction
                start = time.time()
                #print(g.rank(), blocks[0].device, model.module.layers[0].fc_neigh.weight.device, dev_id)
                batch_pred = model(blocks, batch_inputs)
                loss = loss_fcn(batch_pred, batch_labels)
                forward_end = time.time()
......@@ -285,11 +258,14 @@ def run(args, device, data):
time.time() - start))
def main(args):
    print(socket.gethostname(), 'Initializing DGL dist')
    dgl.distributed.initialize(args.ip_config)
    if not args.standalone:
        print(socket.gethostname(), 'Initializing DGL process group')
        th.distributed.init_process_group(backend=args.backend)
    print(socket.gethostname(), 'Initializing DistGraph')
    g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config)
    print('rank:', g.rank())
    print(socket.gethostname(), 'rank:', g.rank())
    pb = g.get_partition_book()
    if 'trainer_id' in g.ndata:
......@@ -311,7 +287,8 @@ def main(args):
    if args.num_gpus == -1:
        device = th.device('cpu')
    else:
        device = th.device('cuda:'+str(args.local_rank))
        dev_id = g.rank() % args.num_gpus
        device = th.device('cuda:'+str(dev_id))
    labels = g.ndata['labels'][np.arange(g.number_of_nodes())]
    n_classes = len(th.unique(labels[th.logical_not(th.isnan(labels))]))
    print('#labels:', n_classes)
......
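To make the device handling above concrete: each trainer now derives its GPU from its global rank modulo the number of GPUs per machine, so trainers co-located on one machine end up on distinct devices, and the same device object is passed straight to `DistributedDataParallel`. `select_device` below is a hypothetical helper distilled from the hunk, not a function in the script.

```python
# Illustrative rank-to-device mapping mirroring the logic moved into main().
import torch as th

def select_device(global_rank, num_gpus):
    """Pick the device for a trainer; num_gpus == -1 means CPU-only training."""
    if num_gpus == -1:
        return th.device('cpu')
    return th.device('cuda:' + str(global_rank % num_gpus))

# With 4 GPUs per machine, ranks 0-3 map to cuda:0-cuda:3 and rank 4 wraps to cuda:0.
assert select_device(4, 4) == th.device('cuda:0')
assert select_device(5, 4) == th.device('cuda:1')
```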
......@@ -299,6 +299,7 @@ The training loop for distributed training is also exactly the same as the single
for epoch in range(10):
    # Loop over the dataloader to sample mini-batches.
    losses = []
    with model.join():
        for step, (input_nodes, seeds, blocks) in enumerate(train_dataloader):
            # Load the input features as well as output labels
            batch_inputs = g.ndata['feat'][input_nodes]
......@@ -315,7 +316,7 @@ The training loop for distributed training is also exactly the same as the single
    # validation
    predictions = []
    labels = []
    with th.no_grad(), model.join():
        for step, (input_nodes, seeds, blocks) in enumerate(valid_dataloader):
            inputs = g.ndata['feat'][input_nodes]
            labels.append(g.ndata['labels'][seeds].numpy())
......