"git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "8bc4ab06c4d249cc9ef978af9c7d0770a5d0b444"
Unverified commit 172949d4 authored by Xin Yao, committed by GitHub
Browse files

[Example] Fix multi-GPU RGCN example (#3871)



* fix multi-gpu rgcn example

* remove dgl.multiprocessing in tutorials

* add a comment
Co-authored-by: default avatarJinjing Zhou <VoVAllen@users.noreply.github.com>
parent 61edb798
...@@ -31,12 +31,13 @@ def collect_eval(n_gpus, queue, labels): ...@@ -31,12 +31,13 @@ def collect_eval(n_gpus, queue, labels):
def run(proc_id, n_gpus, n_cpus, args, devices, dataset, queue=None): def run(proc_id, n_gpus, n_cpus, args, devices, dataset, queue=None):
dev_id = devices[proc_id] dev_id = devices[proc_id]
th.cuda.set_device(dev_id)
g, num_rels, num_classes, labels, train_idx, test_idx,\ g, num_rels, num_classes, labels, train_idx, test_idx,\
target_idx, inv_target = dataset target_idx, inv_target = dataset
dist_init_method = 'tcp://{master_ip}:{master_port}'.format( dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
master_ip='127.0.0.1', master_port='12345') master_ip='127.0.0.1', master_port='12345')
backend = 'gloo' backend = 'nccl'
if proc_id == 0: if proc_id == 0:
print("backend using {}".format(backend)) print("backend using {}".format(backend))
th.distributed.init_process_group(backend=backend, th.distributed.init_process_group(backend=backend,
...@@ -101,6 +102,8 @@ def main(args, devices): ...@@ -101,6 +102,8 @@ def main(args, devices):
g.create_formats_() g.create_formats_()
n_gpus = len(devices) n_gpus = len(devices)
# required for mp.Queue() to work with mp.spawn()
mp.set_start_method('spawn')
n_cpus = mp.cpu_count() n_cpus = mp.cpu_count()
queue = mp.Queue(n_gpus) queue = mp.Queue(n_gpus)
mp.spawn(run, args=(n_gpus, n_cpus // n_gpus, args, devices, data, queue), mp.spawn(run, args=(n_gpus, n_cpus // n_gpus, args, devices, data, queue),
......
...@@ -206,19 +206,6 @@ def run(proc_id, devices): ...@@ -206,19 +206,6 @@ def run(proc_id, devices):
# #
# A typical scenario for multi-GPU training with DDP is to replicate the # A typical scenario for multi-GPU training with DDP is to replicate the
# model once per GPU, and spawn one trainer process per GPU. # model once per GPU, and spawn one trainer process per GPU.
#
# PyTorch tutorials recommend using ``multiprocessing.spawn`` to spawn
# multiple processes. This however is undesirable for training node
# classification or link prediction models on a single large graph,
# especially on Linux. The reason is that a single large graph itself may
# take a lot of memory, and ``mp.spawn`` will duplicate all objects in the
# program, including the large graph. Consequently, the large graph will
# be duplicated as many times as the number of GPUs.
#
# To alleviate the problem we recommend using ``multiprocessing.Process``,
# which *forks* from the main process and allows sharing the same graph
# object to trainer processes via *copy-on-write*. This can greatly reduce
# the memory consumption.
# #
# Normally, DGL maintains only one sparse matrix representation (usually COO) # Normally, DGL maintains only one sparse matrix representation (usually COO)
# for each graph, and will create new formats when some APIs are called for # for each graph, and will create new formats when some APIs are called for
...@@ -238,12 +225,6 @@ graph.create_formats_() ...@@ -238,12 +225,6 @@ graph.create_formats_()
###################################################################### ######################################################################
# Then you can spawn the subprocesses to train with multiple GPUs. # Then you can spawn the subprocesses to train with multiple GPUs.
# #
# .. note::
#
# You will need to use ``dgl.multiprocessing`` instead of the Python
# ``multiprocessing`` package. ``dgl.multiprocessing`` is identical to
# Python’s built-in ``multiprocessing`` except that it handles the
# subtleties between forking and multithreading in Python.
# #
# .. code:: python # .. code:: python
# #
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment