"src/vscode:/vscode.git/clone" did not exist on "4625f04bc04bed43c1ba9b821149af121b5965ea"
Unverified Commit 7d416086 authored by Quan (Andy) Gan, committed by GitHub

[Bug] Fix multiple issues in distributed multi-GPU GraphSAGE example (#3870)



* fix distributed multi-GPU example device

* try Join

* update version requirement in README

* use model.join

* fix docs
Co-authored-by: Jinjing Zhou <VoVAllen@users.noreply.github.com>
parent 16409ff8
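For context: the core of the fix is wrapping each training loop in PyTorch's `DistributedDataParallel.join()` context manager, so trainers that run out of mini-batches early keep shadowing the collective calls of slower trainers instead of making them hang. The sketch below only illustrates that pattern; the `train` helper, optimizer settings, and dataloader are placeholders, not code from this repository.

```python
# Minimal sketch of the join() pattern adopted by this commit. Assumes the
# default process group is already initialized (e.g. by torchrun) and that
# each trainer owns a single CUDA device.
import torch as th
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel

def train(model, dataloader, device, num_epochs=10):
    model = DistributedDataParallel(model.to(device),
                                    device_ids=[device], output_device=device)
    loss_fcn = nn.CrossEntropyLoss().to(device)
    optimizer = th.optim.Adam(model.parameters(), lr=1e-3)
    for epoch in range(num_epochs):
        # join() lets ranks that exhaust their data early keep participating
        # in gradient synchronization, so uneven batch counts cannot deadlock.
        with model.join():
            for batch_inputs, batch_labels in dataloader:
                batch_inputs = batch_inputs.to(device)
                batch_labels = batch_labels.to(device)
                loss = loss_fcn(model(batch_inputs), batch_labels)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
```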
......@@ -53,6 +53,7 @@ are the same as :ref:`mini-batch training <guide-minibatch>`.
# training loop
for epoch in range(args.num_epochs):
    with model.join():
        for step, blocks in enumerate(dataloader):
            batch_inputs, batch_labels = load_subtensor(g, blocks[0].srcdata[dgl.NID],
                                                        blocks[-1].dstdata[dgl.NID])
......
......@@ -6,6 +6,8 @@ This is an example of training GraphSage in a distributed fashion. Before training
sudo pip3 install ogb
```
**Requires PyTorch 1.10.0+ to work.**
To train GraphSage, follow these five steps:
### Step 0: Setup a Distributed File System
......
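The PyTorch version requirement above could be enforced at startup; the guard below is a hypothetical illustration and not part of the example scripts.

```python
# Hypothetical guard: fail fast if the installed PyTorch predates 1.10.0.
import torch

major, minor = (int(x) for x in torch.__version__.split(".")[:2])
if (major, minor) < (1, 10):
    raise RuntimeError(
        "This example requires PyTorch 1.10.0+, found " + torch.__version__)
```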
......@@ -20,6 +20,7 @@ import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
import socket
def load_subtensor(g, seeds, input_nodes, device, load_feat=True):
"""
......@@ -155,41 +156,11 @@ def evaluate(model, g, inputs, labels, val_nid, test_nid, batch_size, device):
    model.train()
    return compute_acc(pred[val_nid], labels[val_nid]), compute_acc(pred[test_nid], labels[test_nid])
def pad_data(nids, device):
    """
    In a distributed training scenario, we need to make sure that each worker has the same
    number of batches. Otherwise the synchronization (barrier) is called a different number
    of times, which makes the workers with more batches hang.
    This function pads the nids to the same size for all workers, by repeating the head ids
    till the maximum size among all workers.
    """
    import torch.distributed as dist
    # NCCL backend only supports GPU tensors, thus here we need to allocate it on GPU
    num_nodes = th.tensor(nids.numel()).to(device)
    dist.all_reduce(num_nodes, dist.ReduceOp.MAX)
    max_num_nodes = int(num_nodes)
    nids_length = nids.shape[0]
    if max_num_nodes > nids_length:
        pad_size = max_num_nodes % nids_length
        repeat_size = max_num_nodes // nids_length
        new_nids = th.cat([nids for _ in range(repeat_size)] + [nids[:pad_size]], axis=0)
        print("Pad nids from {} to {}".format(nids_length, max_num_nodes))
    else:
        new_nids = nids
    assert new_nids.shape[0] == max_num_nodes
    return new_nids
def run(args, device, data):
    # Unpack data
    train_nid, val_nid, test_nid, in_feats, n_classes, g = data
    shuffle = True
    if args.pad_data:
        train_nid = pad_data(train_nid, device)
        # Current pipeline doesn't support duplicate node id within the same batch
        # Therefore turn off shuffling to avoid potential duplicate node id within the same batch
        shuffle = False
    # Create sampler
    sampler = NeighborSampler(g, [int(fanout) for fanout in args.fan_out.split(',')],
                              dgl.distributed.sample_neighbors, device)
......@@ -209,8 +180,7 @@ def run(args, device, data):
    if args.num_gpus == -1:
        model = th.nn.parallel.DistributedDataParallel(model)
    else:
        dev_id = g.rank() % args.num_gpus
        model = th.nn.parallel.DistributedDataParallel(model, device_ids=[dev_id], output_device=dev_id)
        model = th.nn.parallel.DistributedDataParallel(model, device_ids=[device], output_device=device)
    loss_fcn = nn.CrossEntropyLoss()
    loss_fcn = loss_fcn.to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
......@@ -233,6 +203,8 @@ def run(args, device, data):
        # Loop over the dataloader to sample the computation dependency graph as a list of
        # blocks.
        step_time = []
        with model.join():
            for step, blocks in enumerate(dataloader):
                tic_step = time.time()
                sample_time += tic_step - start
......@@ -249,6 +221,7 @@ def run(args, device, data):
                batch_labels = batch_labels.to(device)
                # Compute loss and prediction
                start = time.time()
                #print(g.rank(), blocks[0].device, model.module.layers[0].fc_neigh.weight.device, dev_id)
                batch_pred = model(blocks, batch_inputs)
                loss = loss_fcn(batch_pred, batch_labels)
                forward_end = time.time()
......@@ -285,11 +258,14 @@ def run(args, device, data):
time.time() - start))
def main(args):
    print(socket.gethostname(), 'Initializing DGL dist')
    dgl.distributed.initialize(args.ip_config)
    if not args.standalone:
        print(socket.gethostname(), 'Initializing DGL process group')
        th.distributed.init_process_group(backend=args.backend)
    print(socket.gethostname(), 'Initializing DistGraph')
    g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config)
    print('rank:', g.rank())
    print(socket.gethostname(), 'rank:', g.rank())
    pb = g.get_partition_book()
    if 'trainer_id' in g.ndata:
......@@ -311,7 +287,8 @@ def main(args):
    if args.num_gpus == -1:
        device = th.device('cpu')
    else:
        device = th.device('cuda:'+str(args.local_rank))
        dev_id = g.rank() % args.num_gpus
        device = th.device('cuda:'+str(dev_id))
    labels = g.ndata['labels'][np.arange(g.number_of_nodes())]
    n_classes = len(th.unique(labels[th.logical_not(th.isnan(labels))]))
    print('#labels:', n_classes)
......
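To make the device handling above concrete: each trainer now derives its GPU from its global rank modulo the number of GPUs per machine, so trainers co-located on one machine end up on distinct devices, and the same device object is passed straight to `DistributedDataParallel`. `select_device` below is a hypothetical helper distilled from the hunk, not a function in the script.

```python
# Illustrative rank-to-device mapping mirroring the logic moved into main().
import torch as th

def select_device(global_rank, num_gpus):
    """Pick the device for a trainer; num_gpus == -1 means CPU-only training."""
    if num_gpus == -1:
        return th.device('cpu')
    return th.device('cuda:' + str(global_rank % num_gpus))

# With 4 GPUs per machine, ranks 0-3 map to cuda:0-cuda:3 and rank 4 wraps to cuda:0.
assert select_device(4, 4) == th.device('cuda:0')
assert select_device(5, 4) == th.device('cuda:1')
```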
......@@ -299,6 +299,7 @@ The training loop for distributed training is also exactly the same as the single
for epoch in range(10):
    # Loop over the dataloader to sample mini-batches.
    losses = []
    with model.join():
        for step, (input_nodes, seeds, blocks) in enumerate(train_dataloader):
            # Load the input features as well as output labels
            batch_inputs = g.ndata['feat'][input_nodes]
......@@ -315,7 +316,7 @@ The training loop for distributed training is also exactly the same as the single
    # validation
    predictions = []
    labels = []
    with th.no_grad(), model.join():
        for step, (input_nodes, seeds, blocks) in enumerate(valid_dataloader):
            inputs = g.ndata['feat'][input_nodes]
            labels.append(g.ndata['labels'][seeds].numpy())
......