"git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "8bc4ab06c4d249cc9ef978af9c7d0770a5d0b444"
Unverified commit 172949d4 authored by Xin Yao, committed by GitHub
Browse files

[Example] Fix multi-GPU RGCN example (#3871)



* fix multi-gpu rgcn example

* remove dgl.multiprocessing in tutorials

* add a comment
Co-authored-by: default avatarJinjing Zhou <VoVAllen@users.noreply.github.com>
parent 61edb798
...@@ -31,12 +31,13 @@ def collect_eval(n_gpus, queue, labels): ...@@ -31,12 +31,13 @@ def collect_eval(n_gpus, queue, labels):
def run(proc_id, n_gpus, n_cpus, args, devices, dataset, queue=None): def run(proc_id, n_gpus, n_cpus, args, devices, dataset, queue=None):
dev_id = devices[proc_id] dev_id = devices[proc_id]
th.cuda.set_device(dev_id)
g, num_rels, num_classes, labels, train_idx, test_idx,\ g, num_rels, num_classes, labels, train_idx, test_idx,\
target_idx, inv_target = dataset target_idx, inv_target = dataset
dist_init_method = 'tcp://{master_ip}:{master_port}'.format( dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
master_ip='127.0.0.1', master_port='12345') master_ip='127.0.0.1', master_port='12345')
backend = 'gloo' backend = 'nccl'
if proc_id == 0: if proc_id == 0:
print("backend using {}".format(backend)) print("backend using {}".format(backend))
th.distributed.init_process_group(backend=backend, th.distributed.init_process_group(backend=backend,
...@@ -101,6 +102,8 @@ def main(args, devices): ...@@ -101,6 +102,8 @@ def main(args, devices):
g.create_formats_() g.create_formats_()
n_gpus = len(devices) n_gpus = len(devices)
# required for mp.Queue() to work with mp.spawn()
mp.set_start_method('spawn')
n_cpus = mp.cpu_count() n_cpus = mp.cpu_count()
queue = mp.Queue(n_gpus) queue = mp.Queue(n_gpus)
mp.spawn(run, args=(n_gpus, n_cpus // n_gpus, args, devices, data, queue), mp.spawn(run, args=(n_gpus, n_cpus // n_gpus, args, devices, data, queue),
......
...@@ -206,19 +206,6 @@ def run(proc_id, devices): ...@@ -206,19 +206,6 @@ def run(proc_id, devices):
# #
# A typical scenario for multi-GPU training with DDP is to replicate the # A typical scenario for multi-GPU training with DDP is to replicate the
# model once per GPU, and spawn one trainer process per GPU. # model once per GPU, and spawn one trainer process per GPU.
#
# PyTorch tutorials recommend using ``multiprocessing.spawn`` to spawn
# multiple processes. This however is undesirable for training node
# classification or link prediction models on a single large graph,
# especially on Linux. The reason is that a single large graph itself may
# take a lot of memory, and ``mp.spawn`` will duplicate all objects in the
# program, including the large graph. Consequently, the large graph will
# be duplicated as many times as the number of GPUs.
#
# To alleviate the problem we recommend using ``multiprocessing.Process``,
# which *forks* from the main process and allows sharing the same graph
# object to trainer processes via *copy-on-write*. This can greatly reduce
# the memory consumption.
# #
# Normally, DGL maintains only one sparse matrix representation (usually COO) # Normally, DGL maintains only one sparse matrix representation (usually COO)
# for each graph, and will create new formats when some APIs are called for # for each graph, and will create new formats when some APIs are called for
...@@ -238,12 +225,6 @@ graph.create_formats_() ...@@ -238,12 +225,6 @@ graph.create_formats_()
###################################################################### ######################################################################
# Then you can spawn the subprocesses to train with multiple GPUs. # Then you can spawn the subprocesses to train with multiple GPUs.
# #
# .. note::
#
# You will need to use ``dgl.multiprocessing`` instead of the Python
# ``multiprocessing`` package. ``dgl.multiprocessing`` is identical to
# Python’s built-in ``multiprocessing`` except that it handles the
# subtleties between forking and multithreading in Python.
# #
# .. code:: python # .. code:: python
# #
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment