"...git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "2120b4eee35bcc0db5f3acd3900fb31188ed0160"
Unverified commit 23afe911, authored by xiang song (charlie.song), committed by GitHub

[Bug Fix] Fix several sparse optimizer bugs (#2596)



* Fix pytorch TCP kvstore bug

* lint

* Fix

* upd

* Fix lint

* Fix

* trigger

* fix
Co-authored-by: Ubuntu <ubuntu@ip-172-31-56-220.ec2.internal>
parent 2f71bc50
@@ -68,7 +68,7 @@ python3 entity_classify_mp.py -d ogbn-mag --testing --fanout='30,30' --batch-siz
 OGBN-MAG without node-feats 42.79
 ```
-python3 entity_classify_mp.py -d ogbn-mag --testing --fanout='30,30' --batch-size 1024 --n-hidden 128 --lr 0.01 --num-worker 4 --eval-batch-size 8 --low-mem --gpu 0,1,2,3 --dropout 0.7 --use-self-loop --n-bases 2 --n-epochs 3 --dgl-sparse --sparse-lr 0.0
+python3 entity_classify_mp.py -d ogbn-mag --testing --fanout='30,30' --batch-size 1024 --n-hidden 128 --lr 0.01 --num-worker 4 --eval-batch-size 8 --low-mem --gpu 0,1,2,3 --dropout 0.7 --use-self-loop --n-bases 2 --n-epochs 3 --dgl-sparse --sparse-lr 0.08
 ```
 Test-bd: P2-8xlarge
......
@@ -324,6 +324,7 @@ def run(proc_id, n_gpus, n_cpus, args, devices, dataset, split, queue=None):
     validation_time = 0
     test_time = 0
     last_val_acc = 0.0
+    do_test = False
     if n_gpus > 1 and n_cpus - args.num_workers > 0:
         th.set_num_threads(n_cpus-args.num_workers)
     for epoch in range(args.n_epochs):
@@ -405,7 +406,7 @@ def run(proc_id, n_gpus, n_cpus, args, devices, dataset, split, queue=None):
             vend = time.time()
             validation_time += (vend - vstart)
-        if (epoch + 1) > (args.n_epochs / 2) and do_test:
+        if epoch > 0 and do_test:
            tstart = time.time()
            if (queue is not None) or (proc_id == 0):
                test_logits, test_seeds = evaluate(model, embed_layer, test_loader, node_feats)
......
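The hunks above give `do_test` an explicit `False` value before the epoch loop and relax the test guard from "past half of the epochs" to "any epoch after the first". Below is a minimal, hypothetical sketch of that gating pattern, not the example script itself; the improvement-based trigger is an assumption, and only the `do_test = False` initialization and the `epoch > 0 and do_test` guard come from the diff.

```python
# Hypothetical sketch of the test-gating pattern. Assumption: do_test is
# flipped when validation accuracy improves; the diff only shows the
# initialization and the guard.
do_test = False            # initialized before the loop, as the fix does
last_val_acc = 0.0
n_epochs = 3

for epoch in range(n_epochs):
    val_acc = 0.1 * (epoch + 1)      # stand-in for a real validation pass
    if val_acc > last_val_acc:       # assumed trigger for running the test set
        last_val_acc = val_acc
        do_test = True
    if epoch > 0 and do_test:        # the updated guard from the diff
        print("running test pass at epoch", epoch)
        do_test = False
```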
@@ -75,16 +75,15 @@ class NodeEmbedding: # NodeEmbedding
         emb = create_shared_mem_array(name, (num_embeddings, embedding_dim), th.float32)
         if init_func is not None:
             emb = init_func(emb)
-        if rank == 0:
-            if world_size > 1:
-                # for multi-gpu training, setup a TCPStore for
-                # embeding status synchronization across GPU processes
-                if _STORE is None:
-                    _STORE = th.distributed.TCPStore(
-                        host_name, port, world_size, True, timedelta(seconds=30))
-                for _ in range(1, world_size):
-                    # send embs
-                    _STORE.set(name, name)
+        if rank == 0:  # the master gpu process
+            # for multi-gpu training, setup a TCPStore for
+            # embedding status synchronization across GPU processes
+            if _STORE is None:
+                _STORE = th.distributed.TCPStore(
+                    host_name, port, world_size, True, timedelta(seconds=30))
+            for _ in range(1, world_size):
+                # send embs
+                _STORE.set(name, name)
         elif rank > 0:
             # receive
             if _STORE is None:
......
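The change above makes rank 0 set up the `TCPStore` unconditionally instead of only when `world_size > 1`. For readers unfamiliar with the handshake, here is a standalone sketch (not DGL code) of the store-based announce/wait pattern the embedding setup relies on; the host name and port are placeholders.

```python
# Standalone sketch of the TCPStore announce/wait handshake (placeholder
# host/port). Rank 0 hosts the store and publishes a key; other ranks attach
# to the same address and block until the key appears.
from datetime import timedelta
import torch as th

def make_store(rank, world_size, host_name="127.0.0.1", port=12345):
    is_master = rank == 0
    return th.distributed.TCPStore(
        host_name, port, world_size, is_master, timedelta(seconds=30))

# rank 0 (master) process:
#     store = make_store(0, world_size)
#     store.set("node_emb", "node_emb")   # announce that the embedding exists
# rank > 0 processes:
#     store = make_store(rank, world_size)
#     store.wait(["node_emb"])            # block until rank 0 announces it
#     name = store.get("node_emb")
```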
"""Node embedding optimizers""" """Node embedding optimizers"""
import abc import abc
from abc import abstractmethod from abc import abstractmethod
import gc
import torch as th import torch as th
from ...utils import get_shared_mem_array, create_shared_mem_array from ...utils import get_shared_mem_array, create_shared_mem_array
@@ -25,6 +26,34 @@ class SparseGradOptimizer(abc.ABC):
         self._world_size = None
         self._shared_cache = {}
         self._clean_grad = False
+        self._opt_meta = {}
+
+        for emb in params:
+            assert isinstance(emb, NodeEmbedding), \
+                'DGL SparseOptimizer only supports dgl.nn.NodeEmbedding'
+            if self._rank is None:
+                self._rank = emb.rank
+                self._world_size = emb.world_size
+            else:
+                assert self._rank == emb.rank, \
+                    'MultiGPU rank for each embedding should be same.'
+                assert self._world_size == emb.world_size, \
+                    'MultiGPU world_size for each embedding should be same.'
+            emb_name = emb.name
+            if self._rank == 0:  # the master gpu process
+                opt_meta = create_shared_mem_array(emb_name+'_opt_meta', \
+                    (self._world_size, self._world_size), th.int32).zero_()
+                if self._rank == 0:
+                    emb.store.set(emb_name+'_opt_meta', emb_name)
+                    self._opt_meta[emb_name] = opt_meta
+            elif self._rank > 0:
+                # receive
+                emb.store.wait([emb_name+'_opt_meta'])
+                opt_meta = get_shared_mem_array(emb_name+'_opt_meta', \
+                    (self._world_size, self._world_size), th.int32)
+                self._opt_meta[emb_name] = opt_meta
+
     def step(self):
         ''' The step function.
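The constructor now also allocates a per-embedding `opt_meta` matrix of shape `(world_size, world_size)` in shared memory. A toy illustration of the bookkeeping it enables (plain tensors standing in for the DGL shared-memory array):

```python
# Toy illustration of the opt_meta matrix (plain torch tensor standing in for
# the shared-memory array). opt_meta[src][dst] records how many gradient rows
# process `src` staged for process `dst` in the current step.
import torch as th

world_size = 4
opt_meta = th.zeros((world_size, world_size), dtype=th.int32)

# during step(), rank 1 stages 7 gradient rows destined for rank 3:
opt_meta[1][3] = 7

# after the barrier, rank 3 reads its own column to learn how much to copy
# out of rank 1's shared-memory buffer:
size = int(opt_meta[1][3])
print(size)  # prints 7
```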
@@ -36,92 +65,127 @@ class SparseGradOptimizer(abc.ABC):
         # We cache shared memory buffers in shared_emb.
         shared_emb = {emb.name: ([], []) for emb in self._params}
+        # hold released shared memory to let the other processes munmap it first,
+        # otherwise it will crash the training
+        shmem_ptr_holder = []
         # Go through all sparse embeddings
         for emb in self._params: # pylint: disable=too-many-nested-blocks
-            num_embeddings = emb.num_embeddings
             emb_name = emb.name
-            # Each gpu process takes the resposibility of update a range of sparse embedding,
-            # thus we can parallel the gradient update.
-            range_size = (num_embeddings + self._world_size - 1) // self._world_size \
-                if self._world_size > 0 else 0
-            for idx, data in emb._trace:
-                grad = data.grad.data
-                device = grad.device
-                idx_dtype = idx.dtype
-                grad_dtype = grad.dtype
-                grad_dim = grad.shape[1]
-                if self._world_size > 0:
-                    if emb_name not in self._shared_cache:
-                        self._shared_cache[emb_name] = {}
-                    for i in range(self._world_size):
-                        start = i * range_size
-                        end = (i + 1) * range_size \
-                            if (i + 1) * range_size < num_embeddings \
-                            else num_embeddings
-                        if i == 0:
-                            mask = idx < end
-                        elif i + 1 == self._world_size:
-                            mask = idx >= start
-                        else:
-                            mask = th.logical_and((idx >= start), (idx < end))
-                        idx_i = idx[mask]
-                        grad_i = grad[mask]
-                        if i == self._rank:
-                            shared_emb[emb_name][0].append(idx_i)
-                            shared_emb[emb_name][1].append(grad_i)
-                        else:
-                            # currently nccl does not support Alltoallv operation
-                            # we need to use CPU shared memory to share gradient
-                            # across processes
-                            idx_i = idx_i.to(th.device('cpu'))
-                            grad_i = grad_i.to(th.device('cpu'))
-                            idx_shmem_name = 'idx_{}_{}_{}'.format(emb_name, self._rank, i)
-                            grad_shmem_name = 'grad_{}_{}_{}'.format(emb_name, self._rank, i)
-                            if idx_shmem_name not in self._shared_cache[emb_name] or \
-                                self._shared_cache[emb_name][idx_shmem_name].shape[0] \
-                                < idx_i.shape[0]:
-                                # in case idx_i.shape[0] is 0
-                                idx_shmem = create_shared_mem_array(idx_shmem_name, \
-                                    (idx_i.shape[0] * 2 + 2,), idx_dtype)
-                                grad_shmem = create_shared_mem_array(grad_shmem_name, \
-                                    (idx_i.shape[0] * 2 + 2, grad_dim), grad_dtype)
-                                self._shared_cache[emb_name][idx_shmem_name] = idx_shmem
-                                self._shared_cache[emb_name][grad_shmem_name] = grad_shmem
-                            self._shared_cache[emb_name][idx_shmem_name][:idx_i.shape[0]] \
-                                = idx_i
-                            self._shared_cache[emb_name][grad_shmem_name][:idx_i.shape[0]] \
-                                = grad_i
-                            emb.store.set(idx_shmem_name, str(idx_i.shape[0]))
-                    # gather gradients from all other processes
-                    for i in range(self._world_size):
-                        if i != self._rank:
-                            idx_shmem_name = 'idx_{}_{}_{}'.format(emb_name, i, self._rank)
-                            grad_shmem_name = 'grad_{}_{}_{}'.format(emb_name, i, self._rank)
-                            size = int(emb.store.get(idx_shmem_name))
-                            if idx_shmem_name not in self._shared_cache[emb_name] or \
-                                self._shared_cache[emb_name][idx_shmem_name].shape[0] < size:
-                                idx_shmem = get_shared_mem_array(idx_shmem_name, \
-                                    (size * 2 + 2,), idx_dtype)
-                                grad_shmem = get_shared_mem_array(grad_shmem_name, \
-                                    (size * 2 + 2, grad_dim), grad_dtype)
-                                self._shared_cache[emb_name][idx_shmem_name] = idx_shmem
-                                self._shared_cache[emb_name][grad_shmem_name] = grad_shmem
-                            idx_i = self._shared_cache[emb_name][idx_shmem_name][:size]
-                            grad_i = self._shared_cache[emb_name][grad_shmem_name][:size]
-                            shared_emb[emb_name][0].append(idx_i.to(device,
-                                non_blocking=True))
-                            shared_emb[emb_name][1].append(grad_i.to(device,
-                                non_blocking=True))
-                else:
-                    shared_emb[emb_name][0].append(idx)
-                    shared_emb[emb_name][1].append(grad)
+            # we need to combine gradients from multiple forward paths
+            idx = []
+            grad = []
+            for i, data in emb._trace:
+                idx.append(i)
+                grad.append(data.grad.data)
+            idx = th.cat(idx, dim=0)
+            grad = th.cat(grad, dim=0)
+            device = grad.device
+            idx_dtype = idx.dtype
+            grad_dtype = grad.dtype
+            grad_dim = grad.shape[1]
+            if self._world_size > 1:
+                if emb_name not in self._shared_cache:
+                    self._shared_cache[emb_name] = {}
+                # Each training process takes the responsibility of updating a range
+                # of node embeddings, thus we can parallelize the gradient update.
+                # The overall progress includes:
+                # 1. In each training process:
+                #   1.a Deciding which process a node embedding belongs to according
+                #       to the formula: process_id = node_idx mod num_of_process(N)
+                #   1.b Split the node index tensor and gradient tensor into N parts
+                #       according to step 1.a.
+                #   1.c Write each node index sub-tensor and gradient sub-tensor into
+                #       different DGL shared memory buffers.
+                # 2. Cross training process synchronization
+                # 3. In each training process:
+                #   3.a Collect node index sub-tensors and gradient sub-tensors
+                #   3.b Do gradient update
+                # 4. Done
+                idx_split = th.remainder(idx, self._world_size).long()
+                for i in range(self._world_size):
+                    mask = idx_split == i
+                    idx_i = idx[mask]
+                    grad_i = grad[mask]
+                    if i == self._rank:
+                        shared_emb[emb_name][0].append(idx_i)
+                        shared_emb[emb_name][1].append(grad_i)
+                    else:
+                        # currently nccl does not support Alltoallv operation
+                        # we need to use CPU shared memory to share gradient
+                        # across processes
+                        idx_i = idx_i.to(th.device('cpu'))
+                        grad_i = grad_i.to(th.device('cpu'))
+                        idx_shmem_name = 'idx_{}_{}_{}'.format(emb_name, self._rank, i)
+                        grad_shmem_name = 'grad_{}_{}_{}'.format(emb_name, self._rank, i)
+                        # Create shared memory to hold the temporary index and gradient
+                        # tensors for cross-process send and recv.
+                        if idx_shmem_name not in self._shared_cache[emb_name] or \
+                            self._shared_cache[emb_name][idx_shmem_name].shape[0] \
+                            < idx_i.shape[0]:
+                            if idx_shmem_name in self._shared_cache[emb_name]:
+                                shmem_ptr_holder.append(
+                                    self._shared_cache[emb_name][idx_shmem_name])
+                                shmem_ptr_holder.append(
+                                    self._shared_cache[emb_name][grad_shmem_name])
+                            # in case idx_i.shape[0] is 0
+                            idx_shmem = create_shared_mem_array(idx_shmem_name, \
+                                (idx_i.shape[0] * 2 + 2,), idx_dtype)
+                            grad_shmem = create_shared_mem_array(grad_shmem_name, \
+                                (idx_i.shape[0] * 2 + 2, grad_dim), grad_dtype)
+                            self._shared_cache[emb_name][idx_shmem_name] = idx_shmem
+                            self._shared_cache[emb_name][grad_shmem_name] = grad_shmem
+                        # Fill the shared memory with the temporary index and gradient tensors
+                        self._shared_cache[emb_name][idx_shmem_name][:idx_i.shape[0]] \
+                            = idx_i
+                        self._shared_cache[emb_name][grad_shmem_name][:idx_i.shape[0]] \
+                            = grad_i
+                        self._opt_meta[emb_name][self._rank][i] = idx_i.shape[0]
+            else:
+                shared_emb[emb_name][0].append(idx)
+                shared_emb[emb_name][1].append(grad)
+
+        # make sure the idx shape is passed to each process through opt_meta
+        if self._world_size > 1:
+            th.distributed.barrier()
+        for emb in self._params: # pylint: disable=too-many-nested-blocks
+            emb_name = emb.name
+            if self._world_size > 1:
+                # gather gradients from all other processes
+                for i in range(self._world_size):
+                    if i != self._rank:
+                        idx_shmem_name = 'idx_{}_{}_{}'.format(emb_name, i, self._rank)
+                        grad_shmem_name = 'grad_{}_{}_{}'.format(emb_name, i, self._rank)
+                        size = self._opt_meta[emb_name][i][self._rank]
+                        # Retrieve the shared memory holding the temporary index and gradient
+                        # tensors that were sent to the current training process
+                        if idx_shmem_name not in self._shared_cache[emb_name] or \
+                            self._shared_cache[emb_name][idx_shmem_name].shape[0] < size:
+                            idx_shmem = get_shared_mem_array(idx_shmem_name, \
+                                (size * 2 + 2,), idx_dtype)
+                            grad_shmem = get_shared_mem_array(grad_shmem_name, \
+                                (size * 2 + 2, grad_dim), grad_dtype)
+                            self._shared_cache[emb_name][idx_shmem_name] = idx_shmem
+                            self._shared_cache[emb_name][grad_shmem_name] = grad_shmem
+                            # make sure shared memory is released in the child process first.
+                            # This will not be called frequently.
+                            # TODO(xiangsx) Provide API to munmap shared memory directly
+                            gc.collect()
+                        idx_i = self._shared_cache[emb_name][idx_shmem_name][:size]
+                        grad_i = self._shared_cache[emb_name][grad_shmem_name][:size]
+                        shared_emb[emb_name][0].append(idx_i.to(device,
+                            non_blocking=True))
+                        shared_emb[emb_name][1].append(grad_i.to(device,
+                            non_blocking=True))
         if self._clean_grad:
             # clean gradient track
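The partitioning rule in the rewritten `step()` is worth a concrete look: gradients from all forward paths are concatenated first and then routed to the process whose id equals `node_idx mod world_size`, instead of being split by contiguous index ranges. A toy, self-contained sketch of step 1 from the comment block above (plain tensors, no shared memory):

```python
# Toy illustration of the mod-based routing rule used in step 1 above.
import torch as th

world_size = 3
idx = th.tensor([0, 1, 2, 3, 4, 5, 7, 9])
grad = th.randn(idx.shape[0], 4)          # stand-in gradient rows, dim = 4

owner = th.remainder(idx, world_size).long()
for i in range(world_size):
    mask = owner == i
    idx_i, grad_i = idx[mask], grad[mask]
    print(f"rank {i} owns nodes {idx_i.tolist()}")
# rank 0 owns nodes [0, 3, 9], rank 1 owns [1, 4, 7], rank 2 owns [2, 5]
```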
@@ -205,21 +269,12 @@ class SparseAdagrad(SparseGradOptimizer):
             assert isinstance(emb, NodeEmbedding), \
                 'SparseAdagrad only supports dgl.nn.NodeEmbedding'
-            if self._rank is None:
-                self._rank = emb.rank
-                self._world_size = emb.world_size
-            else:
-                assert self._rank == emb.rank, \
-                    'MultiGPU rank for each embedding should be same.'
-                assert self._world_size == emb.world_size, \
-                    'MultiGPU world_size for each embedding should be same.'
             if self._rank <= 0:
                 emb_name = emb.name
                 state = create_shared_mem_array(emb_name+'_state', \
                     emb.emb_tensor.shape, th.float32).zero_()
             if self._rank == 0:
-                for _ in range(1, world_size):
-                    # send embs
+                if self._world_size > 1:
                     emb.store.set(emb_name+'_opt', emb_name)
             elif self._rank > 0:
                 # receive
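The SparseAdagrad fix removes the duplicated rank/world-size checks (they now live in the base class constructor) and replaces the old announcement loop, which iterated over a bare `world_size` name rather than `self._world_size` and re-set the same key several times, with a single `store.set` guarded by `if self._world_size > 1`. A hedged sketch of the resulting state handshake, using the `create_shared_mem_array`/`get_shared_mem_array` helpers imported at the top of this file (the `dgl.utils` import path is an assumption and may differ between DGL versions):

```python
# Hedged sketch of the optimizer-state handshake, mirroring the calls in the
# diff. Assumption: the shared-memory helpers are importable from dgl.utils.
import torch as th
from dgl.utils import create_shared_mem_array, get_shared_mem_array

def setup_state(emb_name, shape, rank, world_size, store):
    if rank <= 0:
        # rank 0 (or a single-process run) allocates the zeroed optimizer state
        state = create_shared_mem_array(emb_name + '_state', shape, th.float32).zero_()
        if rank == 0 and world_size > 1:
            # announce once; repeating the set only overwrites the same key
            store.set(emb_name + '_opt', emb_name)
    else:
        # other ranks block until the announcement, then attach to the array
        store.wait([emb_name + '_opt'])
        state = get_shared_mem_array(emb_name + '_state', shape, th.float32)
    return state
```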
@@ -318,14 +373,6 @@ class SparseAdam(SparseGradOptimizer):
             assert isinstance(emb, NodeEmbedding), \
                 'SparseAdam only supports dgl.nn.NodeEmbedding'
-            if self._rank is None:
-                self._rank = emb.rank
-                self._world_size = emb.world_size
-            else:
-                assert self._rank == emb.rank, \
-                    'MultiGPU rank for each embedding should be same.'
-                assert self._world_size == emb.world_size, \
-                    'MultiGPU world_size for each embedding should be same.'
             if self._rank <= 0:
                 emb_name = emb.name
                 state_step = create_shared_mem_array(emb_name+'_step', \
@@ -335,10 +382,8 @@ class SparseAdam(SparseGradOptimizer):
                 state_power = create_shared_mem_array(emb_name+'_power', \
                     emb.emb_tensor.shape, th.float32).zero_()
             if self._rank == 0:
-                state = (state_step, state_mem, state_power)
                 emb_name = emb.name
-                for _ in range(1, self._world_size):
-                    # send embs
+                if self._world_size > 1:
                     emb.store.set(emb_name+'_opt', emb_name)
             elif self._rank > 0:
                 # receive
......
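Taken together, the rank/world-size bookkeeping now lives once in `SparseGradOptimizer.__init__`, and both optimizers announce their shared state through the embedding's store. For orientation, a hedged usage sketch of how these pieces are driven from user code, based on the public DGL API of this era (`dgl.nn.NodeEmbedding`, `dgl.optim.SparseAdam`); exact signatures may differ between versions:

```python
# Hedged single-process usage sketch; dgl.nn.NodeEmbedding and
# dgl.optim.SparseAdam are the documented entry points, but signatures may
# vary across DGL versions.
import torch as th
import dgl

num_nodes, dim = 100, 16
emb = dgl.nn.NodeEmbedding(num_nodes, dim, name='node_emb')
opt = dgl.optim.SparseAdam(params=[emb], lr=0.01)

nids = th.tensor([0, 5, 7])
feat = emb(nids, th.device('cpu'))   # fetch rows; gradients are traced
loss = feat.pow(2).sum()
loss.backward()
opt.step()                           # sparse update of only the touched rows
```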