Fix for distributed training (#3542)

* tmp fix * add description

Fix for distributed training (#3542)
* tmp fix * add description
987db374 · Jinjing Zhou · GitHub · eb08ef38 · 987db374
Unverified Commit 987db374 authored Dec 06, 2021 by Jinjing Zhou Committed by GitHub Dec 06, 2021
Show whitespace changes
Inline Side-by-side

Showing with 26 additions and 0 deletions

examples/pytorch/graphsage/experimental/train_dist.py examples/pytorch/graphsage/experimental/train_dist.py +26 -0

No files found.
--- a/examples/pytorch/graphsage/experimental/train_dist.py
+++ b/examples/pytorch/graphsage/experimental/train_dist.py
@@ -155,9 +155,35 @@ def evaluate(model, g, inputs, labels, val_nid, test_nid, batch_size, device):
    model.train()
    return compute_acc(pred[val_nid], labels[val_nid]), compute_acc(pred[test_nid], labels[test_nid])

+def pad_data(nids):
+    """Pad nids so that every distributed worker holds the same number of ids.
+
+    In distributed training each worker must run the same number of batches; otherwise the
+    synchronization (barrier) is called a different number of times and the worker with more
+    batches hangs. The local ids are repeated from the head until they reach the maximum
+    size among all workers (found with an all-reduce MAX). Returns nids itself when it is
+    already the longest; a worker that needs padding must hold at least one id.
+    """
+    import torch.distributed as dist
+    num_nodes = th.tensor(nids.numel())
+    dist.all_reduce(num_nodes, dist.ReduceOp.MAX)  # in-place: num_nodes becomes the global max
+    max_num_nodes = int(num_nodes)
+    nids_length = nids.shape[0]
+    if max_num_nodes > nids_length:
+        pad_size = max_num_nodes % nids_length  # leftover ids taken from the head of nids
+        repeat_size = max_num_nodes // nids_length  # number of whole copies of nids
+        new_nids = th.cat([nids for _ in range(repeat_size)] + [nids[:pad_size]], axis=0)
+        print("Pad nids from {} to {}".format(nids_length, max_num_nodes))
+    else:
+        new_nids = nids
+    assert new_nids.shape[0] == max_num_nodes
+    return new_nids
+
+
 def run(args, device, data):
    # Unpack data
    train_nid, val_nid, test_nid, in_feats, n_classes, g = data
+    train_nid = pad_data(train_nid)
    # Create sampler
    sampler = NeighborSampler(g, [int(fanout) for fanout in args.fan_out.split(',')],
                              dgl.distributed.sample_neighbors, device)