Unverified Commit 17d604b5 authored by nv-dlasalle, committed by GitHub

[Feature] Allow using NCCL for communication in dgl.NodeEmbedding and dgl.SparseOptimizer (#2824)



* Split from NCCL PR

* Fix typo in comment

* Expand documentation for sparse_all_to_all_push

* Restore previous behavior in example

* Re-work optimizer to use NCCL based on gradient location

* Allow for running with embedding on CPU but using NCCL for gradient exchange

* Optimize single partition case

* Fix pylint errors

* Add missing include

* fix gradient indexing

* Fix line continuation

* Migrate 'first_step'

* Skip tests without enough GPUs to run NCCL

* Improve empty tensor handling for pytorch 1.5

* Fix indentation

* Allow multiple NCCL communicators to coexist

* Improve handling of empty message

* Update python/dgl/nn/pytorch/sparse_emb.py
Co-authored-by: xiang song(charlie.song) <classicxsong@gmail.com>

* Update python/dgl/nn/pytorch/sparse_emb.py
Co-authored-by: xiang song(charlie.song) <classicxsong@gmail.com>

* Keep empty tensor dimensionless

* th.empty -> th.tensor

* Preserve shape for empty non-zero dimension tensors

* Use shared state, when embedding is shared

* Add support for gathering an embedding

* Fix typo

* Fix more typos

* Fix backend call

* Use NodeDataLoader to take advantage of ddp

* Update training script to share memory

* Only squeeze last dimension

* Better handle empty message

* Keep embedding on the target GPU device if dgl_sparse is false in the RGCN example

* Fix typo in comment

* Add asserts

* Improve documentation in example
Co-authored-by: xiang song(charlie.song) <classicxsong@gmail.com>
parent 9497a9be
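For context, a minimal hedged sketch of how the new NCCL-backed path is meant to be exercised end to end. `local_rank`, `num_nodes`, `initializer`, `model`, `labels`, and the dataloader are placeholders, and it assumes a CUDA build of DGL from this branch with `torch.distributed` initialized using the NCCL backend (as the RGCN example below does).

```python
import torch as th
import dgl

dev = th.device('cuda', local_rank)                 # assumed per-process GPU
emb = dgl.nn.NodeEmbedding(num_nodes, 128, 'node_emb',
                           init_func=initializer,   # assumed init callback
                           device=dev)              # new: store the shard on the GPU
optimizer = dgl.optim.SparseAdam([emb], lr=0.01)

for input_nodes, seeds, blocks in dataloader:       # assumed mini-batch loop
    feats = emb(input_nodes, dev)                   # gathers rows; NCCL pull when sharded on GPUs
    logits = model(blocks, feats)                   # assumed GNN forward pass
    loss = th.nn.functional.cross_entropy(logits, labels[seeds])
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()                                # exchanges sparse grads; NCCL push when on GPU
```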
......@@ -123,54 +123,7 @@ def gen_norm(g):
norm = norm.unsqueeze(1)
g.edata['norm'] = norm
class NeighborSampler:
"""Neighbor sampler
Parameters
----------
g : DGLHeterograph
Full graph
target_idx : tensor
The target training node IDs in g
fanouts : list of int
Fanout of each hop starting from the seed nodes. If a fanout is None,
sample full neighbors.
"""
def __init__(self, g, target_idx, fanouts):
self.g = g
self.target_idx = target_idx
self.fanouts = fanouts
"""Do neighbor sample
Parameters
----------
seeds :
Seed nodes
Returns
-------
tensor
Seed nodes, also known as target nodes
blocks
Sampled subgraphs
"""
def sample_blocks(self, seeds):
blocks = []
etypes = []
norms = []
ntypes = []
seeds = th.tensor(seeds).long()
cur = self.target_idx[seeds]
for fanout in self.fanouts:
if fanout is None or fanout == -1:
frontier = dgl.in_subgraph(self.g, cur)
else:
frontier = dgl.sampling.sample_neighbors(self.g, cur, fanout)
block = dgl.to_block(frontier, cur)
gen_norm(block)
cur = block.srcdata[dgl.NID]
blocks.insert(0, block)
return seeds, blocks
def evaluate(model, embed_layer, eval_loader, node_feats):
def evaluate(model, embed_layer, eval_loader, node_feats, inv_target):
model.eval()
embed_layer.eval()
eval_logits = []
......@@ -179,7 +132,11 @@ def evaluate(model, embed_layer, eval_loader, node_feats):
with th.no_grad():
th.cuda.empty_cache()
for sample_data in tqdm.tqdm(eval_loader):
seeds, blocks = sample_data
inputs, seeds, blocks = sample_data
seeds = inv_target[seeds]
for block in blocks:
gen_norm(block)
feats = embed_layer(blocks[0].srcdata[dgl.NID],
blocks[0].srcdata['ntype'],
......@@ -197,7 +154,7 @@ def evaluate(model, embed_layer, eval_loader, node_feats):
def run(proc_id, n_gpus, n_cpus, args, devices, dataset, split, queue=None):
dev_id = devices[proc_id] if devices[proc_id] != 'cpu' else -1
g, node_feats, num_of_ntype, num_classes, num_rels, target_idx, \
train_idx, val_idx, test_idx, labels = dataset
inv_target, train_idx, val_idx, test_idx, labels = dataset
if split is not None:
train_seed, val_seed, test_seed = split
train_idx = train_idx[train_seed]
......@@ -206,28 +163,6 @@ def run(proc_id, n_gpus, n_cpus, args, devices, dataset, split, queue=None):
fanouts = [int(fanout) for fanout in args.fanout.split(',')]
node_tids = g.ndata[dgl.NTYPE]
sampler = NeighborSampler(g, target_idx, fanouts)
loader = DataLoader(dataset=train_idx.numpy(),
batch_size=args.batch_size,
collate_fn=sampler.sample_blocks,
shuffle=True,
num_workers=args.num_workers)
# validation sampler
val_sampler = NeighborSampler(g, target_idx, fanouts)
val_loader = DataLoader(dataset=val_idx.numpy(),
batch_size=args.batch_size,
collate_fn=val_sampler.sample_blocks,
shuffle=False,
num_workers=args.num_workers)
# test sampler
test_sampler = NeighborSampler(g, target_idx, [None] * args.n_layers)
test_loader = DataLoader(dataset=test_idx.numpy(),
batch_size=args.eval_batch_size,
collate_fn=test_sampler.sample_blocks,
shuffle=False,
num_workers=args.num_workers)
world_size = n_gpus
if n_gpus > 1:
......@@ -235,19 +170,56 @@ def run(proc_id, n_gpus, n_cpus, args, devices, dataset, split, queue=None):
master_ip='127.0.0.1', master_port='12345')
backend = 'nccl'
# using sparse embedding or usig mix_cpu_gpu model (embedding model can not be stored in GPU)
if args.dgl_sparse is False:
# using sparse embedding or using mix_cpu_gpu model (embedding model can not be stored in GPU)
if dev_id < 0 or args.dgl_sparse is False:
backend = 'gloo'
print("backend using {}".format(backend))
th.distributed.init_process_group(backend=backend,
init_method=dist_init_method,
world_size=world_size,
rank=dev_id)
rank=proc_id)
sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts)
loader = dgl.dataloading.NodeDataLoader(
g,
target_idx[train_idx],
sampler,
use_ddp=n_gpus > 1,
batch_size=args.batch_size,
shuffle=True,
drop_last=False,
num_workers=args.num_workers)
# validation sampler
val_loader = dgl.dataloading.NodeDataLoader(
g,
target_idx[val_idx],
sampler,
use_ddp=n_gpus > 1,
batch_size=args.batch_size,
shuffle=False,
drop_last=False,
num_workers=args.num_workers)
# test sampler
test_sampler = dgl.dataloading.MultiLayerNeighborSampler([None] * args.n_layers)
test_loader = dgl.dataloading.NodeDataLoader(
g,
target_idx[test_idx],
test_sampler,
use_ddp=n_gpus > 1,
batch_size=args.eval_batch_size,
shuffle=False,
drop_last=False,
num_workers=args.num_workers)
# node features
# None for one-hot feature, if not none, it should be the feature tensor.
#
embed_layer = RelGraphEmbedLayer(dev_id,
embed_layer = RelGraphEmbedLayer(dev_id if args.embedding_gpu or not args.dgl_sparse else -1,
dev_id,
g.number_of_nodes(),
node_tids,
num_of_ntype,
......@@ -279,7 +251,8 @@ def run(proc_id, n_gpus, n_cpus, args, devices, dataset, split, queue=None):
if n_gpus > 1:
labels = labels.to(dev_id)
model.cuda(dev_id)
if dev_id >= 0:
model.cuda(dev_id)
model = DistributedDataParallel(model, device_ids=[dev_id], output_device=dev_id)
if args.dgl_sparse:
embed_layer.cuda(dev_id)
......@@ -331,7 +304,14 @@ def run(proc_id, n_gpus, n_cpus, args, devices, dataset, split, queue=None):
embed_layer.train()
for i, sample_data in enumerate(loader):
seeds, blocks = sample_data
input_nodes, seeds, blocks = sample_data
# map the seed nodes back to their type-specific ids, so that they
# can be used to look up their respective labels
seeds = inv_target[seeds]
for block in blocks:
gen_norm(block)
t0 = time.time()
feats = embed_layer(blocks[0].srcdata[dgl.NID],
blocks[0].srcdata['ntype'],
......@@ -353,7 +333,7 @@ def run(proc_id, n_gpus, n_cpus, args, devices, dataset, split, queue=None):
forward_time.append(t1 - t0)
backward_time.append(t2 - t1)
train_acc = th.sum(logits.argmax(dim=1) == labels[seeds]).item() / len(seeds)
if i % 100 and proc_id == 0:
if i % 100 == 0 and proc_id == 0:
print("Train Accuracy: {:.4f} | Train Loss: {:.4f}".
format(train_acc, loss.item()))
gc.collect()
......@@ -379,7 +359,8 @@ def run(proc_id, n_gpus, n_cpus, args, devices, dataset, split, queue=None):
vstart = time.time()
if (queue is not None) or (proc_id == 0):
val_logits, val_seeds = evaluate(model, embed_layer, val_loader, node_feats)
val_logits, val_seeds = evaluate(model, embed_layer, val_loader,
node_feats, inv_target)
if queue is not None:
queue.put((val_logits, val_seeds))
......@@ -407,7 +388,9 @@ def run(proc_id, n_gpus, n_cpus, args, devices, dataset, split, queue=None):
if epoch > 0 and do_test:
tstart = time.time()
if (queue is not None) or (proc_id == 0):
test_logits, test_seeds = evaluate(model, embed_layer, test_loader, node_feats)
test_logits, test_seeds = evaluate(model, embed_layer,
test_loader, node_feats,
inv_target)
if queue is not None:
queue.put((test_logits, test_seeds))
......@@ -532,6 +515,17 @@ def main(args, devices):
train_idx.share_memory_()
val_idx.share_memory_()
test_idx.share_memory_()
# This is a graph with multiple node types, so we want a way to map
# our target nodes from their global node numbering back to their
# numbering within their type. This is used when taking the nodes in a
# mini-batch and looking up their type-specific labels.
inv_target = th.empty(node_ids.shape,
dtype=node_ids.dtype)
inv_target.share_memory_()
inv_target[target_idx] = th.arange(0, target_idx.shape[0],
dtype=inv_target.dtype)
# Create csr/coo/csc formats before launching training processes with multi-gpu.
# This avoids creating certain formats in each sub-process, which saves memory and CPU.
g.create_formats_()
......@@ -542,12 +536,12 @@ def main(args, devices):
if devices[0] == -1:
run(0, 0, n_cpus, args, ['cpu'],
(g, node_feats, num_of_ntype, num_classes, num_rels, target_idx,
train_idx, val_idx, test_idx, labels), None, None)
inv_target, train_idx, val_idx, test_idx, labels), None, None)
# gpu
elif n_gpus == 1:
run(0, n_gpus, n_cpus, args, devices,
(g, node_feats, num_of_ntype, num_classes, num_rels, target_idx,
train_idx, val_idx, test_idx, labels), None, None)
inv_target, train_idx, val_idx, test_idx, labels), None, None)
# multi gpu
else:
queue = mp.Queue(n_gpus)
......@@ -577,8 +571,10 @@ def main(args, devices):
if (proc_id + 1) * tstseeds_per_proc < num_test_seeds \
else num_test_seeds]
p = mp.Process(target=run, args=(proc_id, n_gpus, n_cpus // n_gpus, args, devices,
(g, node_feats, num_of_ntype, num_classes, num_rels, target_idx,
train_idx, val_idx, test_idx, labels),
(g, node_feats, num_of_ntype,
num_classes, num_rels, target_idx,
inv_target, train_idx, val_idx,
test_idx, labels),
(proc_train_seeds, proc_valid_seeds, proc_test_seeds),
queue))
p.start()
......@@ -626,6 +622,8 @@ def config():
help="Whether use low mem RelGraphCov")
parser.add_argument("--dgl-sparse", default=False, action='store_true',
help='Use sparse embedding for node embeddings.')
parser.add_argument("--embedding-gpu", default=False, action='store_true',
help='Store the node embeddings on the GPU.')
parser.add_argument('--node-feats', default=False, action='store_true',
help='Whether use node features')
parser.add_argument('--layer-norm', default=False, action='store_true',
......
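The `inv_target` bookkeeping introduced above is easiest to see on a toy case. The following standalone sketch (hedged, plain PyTorch, not part of the example script) reproduces the mapping from global node IDs back to type-local label indices:

```python
import torch as th

# Suppose the target node type occupies global IDs 3, 7 and 9 in the heterograph.
target_idx = th.tensor([3, 7, 9])
node_ids = th.arange(10)

# Build the inverse map exactly as the script does: inv_target[global_id] -> local id.
# Entries for non-target nodes are left uninitialized, just as in the script.
inv_target = th.empty(node_ids.shape, dtype=node_ids.dtype)
inv_target[target_idx] = th.arange(0, target_idx.shape[0], dtype=inv_target.dtype)

seeds = th.tensor([9, 3])        # seeds come back from the loader as global IDs
print(inv_target[seeds])         # tensor([2, 0]) -> rows into the type-specific labels
```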
......@@ -58,8 +58,10 @@ class RelGraphEmbedLayer(nn.Module):
r"""Embedding layer for featureless heterograph.
Parameters
----------
dev_id : int
Device to run the layer.
storage_dev_id : int
The device to store the weights of the layer.
out_dev_id : int
Device to return the output embeddings on.
num_nodes : int
Number of nodes.
node_tides : tensor
......@@ -75,7 +77,8 @@ class RelGraphEmbedLayer(nn.Module):
If true, use dgl.nn.NodeEmbedding otherwise use torch.nn.Embedding
"""
def __init__(self,
dev_id,
storage_dev_id,
out_dev_id,
num_nodes,
node_tids,
num_of_ntype,
......@@ -83,7 +86,9 @@ class RelGraphEmbedLayer(nn.Module):
embed_size,
dgl_sparse=False):
super(RelGraphEmbedLayer, self).__init__()
self.dev_id = th.device(dev_id if dev_id >= 0 else 'cpu')
self.storage_dev_id = th.device( \
storage_dev_id if storage_dev_id >= 0 else 'cpu')
self.out_dev_id = th.device(out_dev_id if out_dev_id >= 0 else 'cpu')
self.embed_size = embed_size
self.num_nodes = num_nodes
self.dgl_sparse = dgl_sparse
......@@ -97,14 +102,16 @@ class RelGraphEmbedLayer(nn.Module):
if isinstance(input_size[ntype], int):
if dgl_sparse:
self.node_embeds[str(ntype)] = dgl.nn.NodeEmbedding(input_size[ntype], embed_size, name=str(ntype),
init_func=initializer)
init_func=initializer, device=self.storage_dev_id)
else:
sparse_emb = th.nn.Embedding(input_size[ntype], embed_size, sparse=True)
sparse_emb.cuda(self.storage_dev_id)
nn.init.uniform_(sparse_emb.weight, -1.0, 1.0)
self.node_embeds[str(ntype)] = sparse_emb
else:
input_emb_size = input_size[ntype].shape[1]
embed = nn.Parameter(th.Tensor(input_emb_size, self.embed_size))
embed = nn.Parameter(th.empty([input_emb_size, self.embed_size],
device=self.storage_dev_id))
nn.init.xavier_uniform_(embed)
self.embeds[str(ntype)] = embed
......@@ -136,16 +143,24 @@ class RelGraphEmbedLayer(nn.Module):
tensor
embeddings as the input of the next layer
"""
tsd_ids = node_ids.to(self.dev_id)
embeds = th.empty(node_ids.shape[0], self.embed_size, device=self.dev_id)
embeds = th.empty(node_ids.shape[0], self.embed_size, device=self.out_dev_id)
# transfer input to the correct device
type_ids = type_ids.to(self.storage_dev_id)
node_tids = node_tids.to(self.storage_dev_id)
# build locs first
locs = [None for i in range(self.num_of_ntype)]
for ntype in range(self.num_of_ntype):
locs[ntype] = (node_tids == ntype).nonzero().squeeze(-1)
for ntype in range(self.num_of_ntype):
loc = node_tids == ntype
loc = locs[ntype]
if isinstance(features[ntype], int):
if self.dgl_sparse:
embeds[loc] = self.node_embeds[str(ntype)](type_ids[loc], self.dev_id)
embeds[loc] = self.node_embeds[str(ntype)](type_ids[loc], self.out_dev_id)
else:
embeds[loc] = self.node_embeds[str(ntype)](type_ids[loc]).to(self.dev_id)
embeds[loc] = self.node_embeds[str(ntype)](type_ids[loc]).to(self.out_dev_id)
else:
embeds[loc] = features[ntype][type_ids[loc]].to(self.dev_id) @ self.embeds[str(ntype)].to(self.dev_id)
embeds[loc] = features[ntype][type_ids[loc]].to(self.out_dev_id) @ self.embeds[str(ntype)].to(self.out_dev_id)
return embeds
......@@ -338,6 +338,13 @@ def zerocopy_from_dgl_ndarray(data):
# The issue will be fixed in v1.6 and later.
return th.tensor([], dtype=getattr(th, data.dtype),
device=to_backend_ctx(data.ctx))
elif len(data.shape) == 0 or builtins.min(data.shape) == 0:
# Work around the same issue as above, but preserve the shape of the
# empty tensor. This is needed by the sparse optimizer when one of the
# processors may receive no gradients to apply, but we still want to
# keep the dimension of the embedding.
return th.empty(data.shape, dtype=getattr(th, data.dtype),
device=to_backend_ctx(data.ctx))
else:
return dlpack.from_dlpack(data.to_dlpack())
......
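A quick illustration of why the new branch preserves the shape instead of returning a flat empty tensor (hedged; plain PyTorch, independent of the DGL workaround itself):

```python
import torch as th

# A worker that receives no gradients still needs a (0, dim) tensor so that
# downstream code can rely on the embedding dimension being present.
flat = th.tensor([])              # shape (0,)    -- the feature dimension is lost
kept = th.empty((0, 16))          # shape (0, 16) -- the feature dimension is kept
print(flat.shape, kept.shape)
print(th.cat([kept, th.randn(3, 16)], dim=0).shape)   # concatenation still works
```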
......@@ -88,7 +88,7 @@ class Communicator(object):
The 1D set of indices to send to other processors.
value : tensor
The multi-dimension set of values to send to other processors.
The 0th dimension must match that of `idx`.
The first dimension must match that of `idx`.
partition : NDArrayPartition
The object containing information for assigning indices to
processors.
......@@ -137,7 +137,7 @@ class Communicator(object):
def sparse_all_to_all_pull(self, req_idx, value, partition):
""" Perform an all-to-all-v operation, where by all processors request
the values corresponding to ther set of indices.
the values corresponding to their set of indices.
Parameters
----------
......
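A hedged sketch of the push/pull pair as it is used later in this PR. It runs a single-rank communicator (the same construction the code uses when `rank < 0`), so it needs a GPU and a CUDA build of DGL from this branch; with one rank every index is routed back to the caller, which makes the semantics easy to inspect.

```python
import torch as th
from dgl.cuda import nccl
from dgl.partition import NDArrayPartition

# Single-process communicator (rank 0 of 1) and a trivial remainder partition.
comm = nccl.Communicator(1, 0, nccl.UniqueId())
part = NDArrayPartition(100, 1, mode='remainder')

idx = th.tensor([3, 7, 7], dtype=th.int64, device='cuda')
grad = th.randn(3, 16, device='cuda')

# Push: route (index, value) pairs to the process owning each row; returns
# the indices and values received from all ranks (here, just our own).
recv_idx, recv_grad = comm.sparse_all_to_all_push(idx, grad, partition=part)

# Pull: request rows of a locally partitioned tensor by index and get the
# values back from their owning ranks.
local = th.randn(100, 16, device='cuda')
rows = comm.sparse_all_to_all_pull(idx, local, part)
print(recv_idx.shape, recv_grad.shape, rows.shape)
```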
......@@ -3,8 +3,11 @@ from datetime import timedelta
import torch as th
from ...backend import pytorch as F
from ...utils import get_shared_mem_array, create_shared_mem_array
from ...cuda import nccl
from ...partition import NDArrayPartition
_STORE = None
_COMM = None
class NodeEmbedding: # NodeEmbedding
'''Class for storing node embeddings.
......@@ -36,6 +39,11 @@ class NodeEmbedding: # NodeEmbedding
init_func : callable, optional
The function to create the initial data. If the init function is not provided,
the values of the embeddings are initialized to zero.
device : th.device
Device to store the embeddings on.
partition : NDArrayPartition
The partition to use to distribute the embeddings between
processes.
Examples
--------
......@@ -58,8 +66,12 @@ class NodeEmbedding: # NodeEmbedding
'''
def __init__(self, num_embeddings, embedding_dim, name,
init_func=None):
init_func=None, device=None, partition=None):
global _STORE
global _COMM
if device is None:
device = th.device('cpu')
# Check whether it is multi-gpu training or not.
if th.distributed.is_initialized():
......@@ -70,32 +82,76 @@ class NodeEmbedding: # NodeEmbedding
world_size = 0
self._rank = rank
self._world_size = world_size
self._store = None
self._comm = None
self._partition = partition
host_name = '127.0.0.1'
port = 12346
if rank <= 0:
emb = create_shared_mem_array(name, (num_embeddings, embedding_dim), th.float32)
if init_func is not None:
emb = init_func(emb)
if rank == 0: # the master gpu process
if rank >= 0:
# for multi-gpu training, setup a TCPStore for
# embedding status synchronization across GPU processes
if _STORE is None:
_STORE = th.distributed.TCPStore(
host_name, port, world_size, True, timedelta(seconds=10*60))
for _ in range(1, world_size):
# send embs
_STORE.set(name, name)
elif rank > 0:
# receive
if _STORE is None:
_STORE = th.distributed.TCPStore(
host_name, port, world_size, False, timedelta(seconds=10*60))
_STORE.wait([name])
emb = get_shared_mem_array(name, (num_embeddings, embedding_dim), th.float32)
host_name, port, world_size, rank == 0, timedelta(seconds=10*60))
self._store = _STORE
# embeddings are stored in CPU memory.
if th.device(device) == th.device('cpu'):
if rank <= 0:
emb = create_shared_mem_array(name, (num_embeddings, embedding_dim), th.float32)
if init_func is not None:
emb = init_func(emb)
if rank == 0: # the master gpu process
for _ in range(1, world_size):
# send embs
self._store.set(name, name)
elif rank > 0:
# receive
self._store.wait([name])
emb = get_shared_mem_array(name, (num_embeddings, embedding_dim), th.float32)
self._tensor = emb
else: # embeddings are stored in GPU memory.
# setup nccl communicator
if _COMM is None:
if rank < 0:
_COMM = nccl.Communicator(1, 0, nccl.UniqueId())
else:
# needs to be set for nccl to work
th.cuda.set_device(device)
if rank == 0:
# root process broadcasts nccl id
nccl_id = nccl.UniqueId()
self._store.set('nccl_root_id', str(nccl_id))
else:
nccl_id = nccl.UniqueId(self._store.get('nccl_root_id'))
_COMM = nccl.Communicator(self._world_size, self._rank,
nccl_id)
if self._rank == 0:
# clear the store entry for future communicators
self._store.delete_key('nccl_root_id')
th.distributed.barrier()
self._comm = _COMM
if not self._partition:
# for communication we need a partition
self._partition = NDArrayPartition(
num_embeddings,
self._world_size if self._world_size > 0 else 1,
mode='remainder')
# create local tensors for the weights
local_size = self._partition.local_size(self._comm.rank())
# TODO(dlasalle): support 16-bit/half embeddings
emb = th.empty([local_size, embedding_dim], dtype=th.float32,
requires_grad=False, device=device)
if init_func:
emb = init_func(emb)
self._tensor = emb
self._store = _STORE
self._tensor = emb
self._num_embeddings = num_embeddings
self._embedding_dim = embedding_dim
self._name = name
......@@ -109,10 +165,19 @@ class NodeEmbedding: # NodeEmbedding
device : th.device
Target device to put the collected embeddings.
"""
emb = self._tensor[node_ids].to(device)
if not self._comm or self._comm.size() == 1:
emb = self._tensor[node_ids].to(device)
else:
if self.world_size > 0:
emb = self._comm.sparse_all_to_all_pull(
node_ids, self._tensor, self._partition)
else:
emb = self._tensor[node_ids]
emb = emb.to(device)
if F.is_recording():
emb = F.attach_grad(emb)
self._trace.append((node_ids.to(device, non_blocking=True), emb))
self._trace.append((node_ids.to(device), emb))
return emb
@property
......@@ -127,6 +192,31 @@ class NodeEmbedding: # NodeEmbedding
"""
return self._store
@property
def comm(self):
"""Return dgl.cuda.nccl.Communicator for data
sharing across processes.
Returns
-------
dgl.cuda.nccl.Communicator
Communicator used for data sharing.
"""
return self._comm
@property
def partition(self):
"""Return the partition identifying how the tensor is split across
processes.
Returns
-------
NDArrayPartition
The partition describing how rows are assigned to processes.
"""
return self._partition
@property
def rank(self):
"""Return rank of current process.
......@@ -244,3 +334,47 @@ class NodeEmbedding: # NodeEmbedding
The tensor storing the node embeddings
"""
return self._tensor
def gather_embedding(self):
"""Return a copy of the embedding stored in CPU memory. If this is a
multi-processing instance, the tensor will be returned in shared
memory. If the embedding is currently stored on multiple GPUs, all
processes must call this method in the same order.
Returns
-------
torch.Tensor
The tensor storing the node embeddings.
"""
if self._partition:
if self._world_size == 0:
# non-multiprocessing
return self._tensor.to(th.device('cpu'))
else:
# create a shared memory tensor
shared_name = self._name + "_gather"
if self._rank == 0:
# root process creates shared memory
emb = create_shared_mem_array(
shared_name,
(self._num_embeddings, self._embedding_dim),
self._tensor.dtype)
self._store.set(shared_name, shared_name)
else:
self._store.wait([shared_name])
emb = get_shared_mem_array(
shared_name, (self._num_embeddings, self._embedding_dim),
self._tensor.dtype)
# need to map indices and slice into existing tensor
idxs = self._partition.map_to_global(
F.arange(0, self._tensor.shape[0],
ctx=F.context(self._tensor)),
self._rank).to(emb.device)
emb[idxs] = self._tensor.to(emb.device)
# wait for all processes to finish
th.distributed.barrier()
return emb
else:
# already stored in CPU memory
return self._tensor
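A hedged single-process sketch of the new GPU-resident storage and `gather_embedding()`. Without `torch.distributed` initialized the rank is -1, so no TCPStore or NCCL broadcast is involved and the whole table lives on the given device; it assumes a CUDA build of DGL from this branch.

```python
import torch as th
import dgl

def init_func(tensor):
    return th.nn.init.uniform_(tensor, -1.0, 1.0)

emb = dgl.nn.NodeEmbedding(1000, 64, 'demo_emb',
                           init_func=init_func,
                           device=th.device('cuda:0'))   # new: store on the GPU

rows = emb(th.tensor([1, 5, 9], device='cuda:0'), th.device('cuda:0'))
full = emb.gather_embedding()       # CPU copy of the whole embedding table
print(rows.shape, full.shape)       # (3, 64) and (1000, 64)
```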
......@@ -5,6 +5,8 @@ import torch as th
from ...utils import get_shared_mem_array, create_shared_mem_array
from ...nn.pytorch import NodeEmbedding
from ...cuda import nccl
from ...partition import NDArrayPartition
class SparseGradOptimizer(abc.ABC):
r''' The abstract sparse optimizer.
......@@ -26,10 +28,14 @@ class SparseGradOptimizer(abc.ABC):
self._shared_cache = {}
self._clean_grad = False
self._opt_meta = {}
self._comm = None
self._first_step = True
self._device = None
# hold released shared memory to let other process to munmap it first
# otherwise it will crash the training
self.shmem_buffer_holder = []
# if we are using shared memory for communication
for emb in params:
assert isinstance(emb, NodeEmbedding), \
'DGL SparseOptimizer only supports dgl.nn.NodeEmbedding'
......@@ -42,11 +48,86 @@ class SparseGradOptimizer(abc.ABC):
'MultiGPU rank for each embedding should be same.'
assert self._world_size == emb.world_size, \
'MultiGPU world_size for each embedding should be same.'
assert not self._rank is None
assert not self._world_size is None
def step(self):
''' The step function.
The step function is invoked at the end of every batch to update embeddings
'''
# on the first step, check to see if the grads are on the GPU
if self._first_step:
for emb in self._params:
for _, data in emb._trace:
if data.grad.data.device.type == 'cuda':
# create a communicator
if self._device:
assert self._device == data.grad.device, \
"All gradients must be on the same device"
else:
self._device = data.grad.device
else:
assert not self._device, \
"All gradients must be on the same device"
if self._device:
# device is only set if the grads are on a GPU
self._comm_setup()
else:
self._shared_setup()
self.setup(self._params)
self._first_step = False
if self._comm:
self._comm_step()
else:
self._shared_step()
def setup(self, params):
''' This is the function where subclasses can perform any setup they
need to. It will be called during the first step, and communicators or
shared memory will have been set up before this call.
Parameters
----------
params : list of NodeEmbedding
The list of NodeEmbeddings.
'''
def _comm_setup(self):
# find a store to communicate the unique id through
if len(self._params) > 0:
store = self._params[0].store
if self._rank < 0:
self._comm = nccl.Communicator(1, 0, nccl.UniqueId())
else:
th.cuda.set_device(self._device)
if self._rank == 0:
# root process broadcasts nccl id
nccl_id = nccl.UniqueId()
uid = str(nccl_id)
store.set('nccl_root_id', uid)
else:
uid = store.get('nccl_root_id')
nccl_id = nccl.UniqueId(uid)
# needs to be set for nccl to work
self._comm = nccl.Communicator(self._world_size,
self._rank,
nccl_id)
if self._rank == 0:
# clear the store entry for future communicators
store.delete_key('nccl_root_id')
th.distributed.barrier()
def _shared_setup(self):
for emb in self._params:
emb_name = emb.name
if self._rank == 0: # the master gpu process
opt_meta = create_shared_mem_array(emb_name+'_opt_meta', \
(self._world_size, self._world_size), th.int32).zero_()
if self._rank == 0:
emb.store.set(emb_name+'_opt_meta', emb_name)
self._opt_meta[emb_name] = opt_meta
......@@ -57,11 +138,64 @@ class SparseGradOptimizer(abc.ABC):
(self._world_size, self._world_size), th.int32)
self._opt_meta[emb_name] = opt_meta
def step(self):
''' The step function.
def _comm_step(self):
comm = self._comm
with th.no_grad():
idx_in = {}
grad_in = {}
for emb in self._params: # pylint: disable=too-many-nested-blocks
emb_name = emb.name
partition = emb.partition
The step function is invoked at the end of every batch to update embeddings
'''
if not partition:
# use default partitioning
partition = NDArrayPartition(
emb.num_embeddings,
self._world_size if self._world_size > 0 else 1,
mode='remainder')
# we need to combine gradients from multiple forward paths
if len(emb._trace) == 0:
idx = th.zeros((0,), dtype=th.long, device=self._device)
grad = th.zeros((0, emb.embedding_dim),
dtype=th.float32,
device=self._device)
elif len(emb._trace) == 1:
# the special case where we can use the tensors as is
# without any memcpy's
idx, grad = emb._trace[0]
grad = grad.grad.data
else:
idx = []
grad = []
for i, data in emb._trace:
idx.append(i)
grad.append(data.grad.data)
idx = th.cat(idx, dim=0)
grad = th.cat(grad, dim=0)
idx_in[emb_name], grad_in[emb_name] = \
comm.sparse_all_to_all_push(
idx, grad, partition=partition)
if emb.partition:
# if the embedding is partitioned, map back to indexes
# into the local tensor
idx_in[emb_name] = partition.map_to_local(idx_in[emb_name])
if self._clean_grad:
# clean gradient track
for emb in self._params:
emb.reset_trace()
self._clean_grad = False
for emb in self._params:
emb_name = emb.name
idx = idx_in[emb_name]
grad = grad_in[emb_name]
self.update(idx, grad, emb)
def _shared_step(self):
with th.no_grad():
# Frequently alloc and free shared memory to hold intermediate tensor is expensive
# We cache shared memory buffers in shared_emb.
......@@ -280,24 +414,33 @@ class SparseAdagrad(SparseGradOptimizer):
def __init__(self, params, lr, eps=1e-10):
super(SparseAdagrad, self).__init__(params, lr)
self._eps = eps
def setup(self, params):
# We need to register a state sum for each embedding in the kvstore.
for emb in params:
assert isinstance(emb, NodeEmbedding), \
'SparseAdagrad only supports dgl.nn.NodeEmbedding'
if self._rank <= 0:
emb_name = emb.name
state = create_shared_mem_array(emb_name+'_state', \
emb.weight.shape, th.float32).zero_()
if self._rank == 0:
if self._world_size > 1:
emb.store.set(emb_name+'_opt', emb_name)
elif self._rank > 0:
# receive
emb_name = emb.name
emb.store.wait([emb_name+'_opt'])
state = get_shared_mem_array(emb_name+'_state', \
emb.weight.shape, th.float32)
emb_name = emb.name
if th.device(emb.emb_tensor.device) == th.device('cpu'):
# if our embedding is on the CPU, our state also has to be
if self._rank <= 0:
state = create_shared_mem_array(emb_name+'_state', \
emb.weight.shape, th.float32).zero_()
if self._rank == 0:
if self._world_size > 1:
emb.store.set(emb_name+'_opt', emb_name)
elif self._rank > 0:
# receive
emb.store.wait([emb_name+'_opt'])
state = get_shared_mem_array(emb_name+'_state', \
emb.weight.shape, th.float32)
else:
# distributed state on gpu
state = th.empty(
emb.emb_tensor.shape,
dtype=th.float32,
device=emb.emb_tensor.device).zero_()
emb.set_optm_state(state)
def update(self, idx, grad, emb):
......@@ -386,34 +529,48 @@ class SparseAdam(SparseGradOptimizer):
self._beta1 = betas[0]
self._beta2 = betas[1]
self._eps = eps
def setup(self, params):
# We need to register a state sum for each embedding in the kvstore.
for emb in params:
assert isinstance(emb, NodeEmbedding), \
'SparseAdam only supports dgl.nn.NodeEmbedding'
if self._rank <= 0:
emb_name = emb.name
state_step = create_shared_mem_array(emb_name+'_step', \
(emb.weight.shape[0],), th.float32).zero_()
state_mem = create_shared_mem_array(emb_name+'_mem', \
emb.weight.shape, th.float32).zero_()
state_power = create_shared_mem_array(emb_name+'_power', \
emb.weight.shape, th.float32).zero_()
if self._rank == 0:
emb_name = emb.name
if self._world_size > 1:
emb.store.set(emb_name+'_opt', emb_name)
elif self._rank > 0:
# receive
emb_name = emb.name
emb.store.wait([emb_name+'_opt'])
state_step = get_shared_mem_array(emb_name+'_step', \
(emb.weight.shape[0],), th.float32)
state_mem = get_shared_mem_array(emb_name+'_mem', \
emb.weight.shape, th.float32)
state_power = get_shared_mem_array(emb_name+'_power', \
emb.weight.shape, th.float32)
emb_name = emb.name
if th.device(emb.emb_tensor.device) == th.device('cpu'):
# if our embedding is on the CPU, our state also has to be
if self._rank <= 0:
state_step = create_shared_mem_array(emb_name+'_step', \
(emb.weight.shape[0],), th.float32).zero_()
state_mem = create_shared_mem_array(emb_name+'_mem', \
emb.weight.shape, th.float32).zero_()
state_power = create_shared_mem_array(emb_name+'_power', \
emb.weight.shape, th.float32).zero_()
if self._rank == 0:
if self._world_size > 1:
emb.store.set(emb_name+'_opt', emb_name)
elif self._rank > 0:
# receive
emb.store.wait([emb_name+'_opt'])
state_step = get_shared_mem_array(emb_name+'_step', \
(emb.weight.shape[0],), th.float32)
state_mem = get_shared_mem_array(emb_name+'_mem', \
emb.weight.shape, th.float32)
state_power = get_shared_mem_array(emb_name+'_power', \
emb.weight.shape, th.float32)
else:
# distributed state on gpu
state_step = th.empty(
[emb.emb_tensor.shape[0]],
dtype=th.float32,
device=emb.emb_tensor.device).zero_()
state_mem = th.empty(
emb.emb_tensor.shape,
dtype=th.float32,
device=emb.emb_tensor.device).zero_()
state_power = th.empty(
emb.emb_tensor.shape,
dtype=th.float32,
device=emb.emb_tensor.device).zero_()
state = (state_step, state_mem, state_power)
emb.set_optm_state(state)
......
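The dispatch the optimizer now performs on its first `step()` can be distilled as follows. This is a hedged restatement of the logic above, not the actual class method; `traces` stands for the `(node_ids, emb)` pairs collected in `emb._trace`.

```python
def choose_exchange_path(traces):
    """Return 'nccl' if every traced gradient sits on one CUDA device,
    or 'shared-memory' if they are all on the CPU; mixing is an error."""
    device = None
    for _, data in traces:
        if data.grad.device.type == 'cuda':
            assert device is None or device == data.grad.device, \
                'All gradients must be on the same device'
            device = data.grad.device
        else:
            assert device is None, \
                'All gradients must be on the same device'
    return 'nccl' if device is not None else 'shared-memory'
```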
......@@ -420,5 +420,23 @@ class NDArrayPartition(object):
"""
return self._partition
def local_size(self, part):
""" Get the number of rows/items assigned to the given part.
"""
return _CAPI_DGLNDArrayPartitionGetPartSize(self._partition, part)
def map_to_local(self, idxs):
""" Convert the set of global indices to local indices
"""
return F.zerocopy_from_dgl_ndarray(_CAPI_DGLNDArrayPartitionMapToLocal(
self._partition,
F.zerocopy_to_dgl_ndarray(idxs)))
def map_to_global(self, idxs, part_id):
""" Convert the set of local indices ot global indices
"""
return F.zerocopy_from_dgl_ndarray(_CAPI_DGLNDArrayPartitionMapToGlobal(
self._partition, F.zerocopy_to_dgl_ndarray(idxs), part_id))
_init_api("dgl.partition")
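A hedged round trip through the new mapping helpers for a remainder partition. It requires a GPU (the remainder kernels are CUDA-only in this PR) and a build of DGL from this branch; global row `g` belongs to part `g % num_parts` with local index `g // num_parts`.

```python
import torch as th
from dgl.partition import NDArrayPartition

part = NDArrayPartition(10, 3, mode='remainder')     # 10 rows over 3 parts

g = th.tensor([0, 3, 6, 9], device='cuda')           # global rows owned by part 0
local = part.map_to_local(g)                         # tensor([0, 1, 2, 3])
back = part.map_to_global(local, 0)                  # tensor([0, 3, 6, 9])
print(part.local_size(0), local, back)               # part 0 holds 4 rows
```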
......@@ -23,6 +23,7 @@ template<typename IdType> __global__ void _MapProcByRemainder(
const int64_t num_index,
const int64_t num_proc,
IdType * const proc_id) {
assert(num_index <= gridDim.x*blockDim.x);
const int64_t idx = blockDim.x*static_cast<int64_t>(blockIdx.x)+threadIdx.x;
if (idx < num_index) {
......@@ -36,6 +37,7 @@ __global__ void _MapProcByMaskRemainder(
const int64_t num_index,
const IdType mask,
IdType * const proc_id) {
assert(num_index <= gridDim.x*blockDim.x);
const int64_t idx = blockDim.x*static_cast<int64_t>(blockIdx.x)+threadIdx.x;
if (idx < num_index) {
......@@ -49,6 +51,7 @@ __global__ void _MapLocalIndexByRemainder(
IdType * const out,
const int64_t num_items,
const int comm_size) {
assert(num_items <= gridDim.x*blockDim.x);
const int64_t idx = threadIdx.x+blockDim.x*blockIdx.x;
if (idx < num_items) {
......@@ -56,6 +59,23 @@ __global__ void _MapLocalIndexByRemainder(
}
}
template<typename IdType>
__global__ void _MapGlobalIndexByRemainder(
const IdType * const in,
IdType * const out,
const int part_id,
const int64_t num_items,
const int comm_size) {
assert(num_items <= gridDim.x*blockDim.x);
const int64_t idx = threadIdx.x+blockDim.x*blockIdx.x;
assert(part_id < comm_size);
if (idx < num_items) {
out[idx] = (in[idx] * comm_size) + part_id;
}
}
template <DLDeviceType XPU, typename IdType>
std::pair<IdArray, NDArray>
GeneratePermutationFromRemainder(
......@@ -80,6 +100,7 @@ GeneratePermutationFromRemainder(
return result;
}
result.first = aten::NewIdArray(num_in, ctx, sizeof(IdType)*8);
result.second = aten::Full(0, num_parts, sizeof(int64_t)*8, ctx);
int64_t * out_counts = static_cast<int64_t*>(result.second->data);
if (num_in == 0) {
......@@ -117,7 +138,6 @@ GeneratePermutationFromRemainder(
// then create a permutation array that groups processors together by
// performing a radix sort
Workspace<IdType> proc_id_out(device, ctx, num_in);
result.first = aten::NewIdArray(num_in, ctx, sizeof(IdType)*8);
IdType * perm_out = static_cast<IdType*>(result.first->data);
{
IdArray perm_in = aten::Range(0, num_in, sizeof(IdType)*8, ctx);
......@@ -199,24 +219,29 @@ IdArray MapToLocalFromRemainder(
const auto& ctx = global_idx->ctx;
cudaStream_t stream = CUDAThreadEntry::ThreadLocal()->stream;
IdArray local_idx = aten::NewIdArray(global_idx->shape[0], ctx,
sizeof(IdType)*8);
const dim3 block(128);
const dim3 grid((global_idx->shape[0] +block.x-1)/block.x);
CUDA_KERNEL_CALL(
_MapLocalIndexByRemainder,
grid,
block,
0,
stream,
static_cast<const IdType*>(global_idx->data),
static_cast<IdType*>(local_idx->data),
global_idx->shape[0],
num_parts);
return local_idx;
if (num_parts > 1) {
IdArray local_idx = aten::NewIdArray(global_idx->shape[0], ctx,
sizeof(IdType)*8);
const dim3 block(128);
const dim3 grid((global_idx->shape[0] +block.x-1)/block.x);
CUDA_KERNEL_CALL(
_MapLocalIndexByRemainder,
grid,
block,
0,
stream,
static_cast<const IdType*>(global_idx->data),
static_cast<IdType*>(local_idx->data),
global_idx->shape[0],
num_parts);
return local_idx;
} else {
// no mapping to be done
return global_idx;
}
}
template IdArray
......@@ -228,6 +253,57 @@ MapToLocalFromRemainder<kDLGPU, int64_t>(
int num_parts,
IdArray in_idx);
template <DLDeviceType XPU, typename IdType>
IdArray MapToGlobalFromRemainder(
const int num_parts,
IdArray local_idx,
const int part_id) {
CHECK_LT(part_id, num_parts) << "Invalid partition id " << part_id <<
"/" << num_parts;
CHECK_GE(part_id, 0) << "Invalid partition id " << part_id <<
"/" << num_parts;
const auto& ctx = local_idx->ctx;
cudaStream_t stream = CUDAThreadEntry::ThreadLocal()->stream;
if (num_parts > 1) {
IdArray global_idx = aten::NewIdArray(local_idx->shape[0], ctx,
sizeof(IdType)*8);
const dim3 block(128);
const dim3 grid((local_idx->shape[0] +block.x-1)/block.x);
CUDA_KERNEL_CALL(
_MapGlobalIndexByRemainder,
grid,
block,
0,
stream,
static_cast<const IdType*>(local_idx->data),
static_cast<IdType*>(global_idx->data),
part_id,
global_idx->shape[0],
num_parts);
return global_idx;
} else {
// no mapping to be done
return local_idx;
}
}
template IdArray
MapToGlobalFromRemainder<kDLGPU, int32_t>(
int num_parts,
IdArray in_idx,
int part_id);
template IdArray
MapToGlobalFromRemainder<kDLGPU, int64_t>(
int num_parts,
IdArray in_idx,
int part_id);
} // namespace impl
} // namespace partition
......
......@@ -78,6 +78,31 @@ class RemainderPartition : public NDArrayPartition {
// should be unreachable
return IdArray{};
}
IdArray MapToGlobal(
IdArray in_idx,
const int part_id) const override {
auto ctx = in_idx->ctx;
#ifdef DGL_USE_CUDA
if (ctx.device_type == kDLGPU) {
ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
return impl::MapToGlobalFromRemainder<kDLGPU, IdType>(
NumParts(), in_idx, part_id);
});
}
#endif
LOG(FATAL) << "Remainder based partitioning for the CPU is not yet "
"implemented.";
// should be unreachable
return IdArray{};
}
int64_t PartSize(const int part_id) const override {
CHECK_LT(part_id, NumParts()) << "Invalid part ID (" << part_id << ") for "
"partition of size " << NumParts() << ".";
return ArraySize() / NumParts() + (part_id < ArraySize() % NumParts());
}
};
NDArrayPartitionRef CreatePartitionRemainderBased(
......@@ -95,5 +120,31 @@ DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionCreateRemainderBased")
*rv = CreatePartitionRemainderBased(array_size, num_parts);
});
DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionGetPartSize")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
NDArrayPartitionRef part = args[0];
int part_id = args[1];
*rv = part->PartSize(part_id);
});
DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionMapToLocal")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
NDArrayPartitionRef part = args[0];
IdArray idxs = args[1];
*rv = part->MapToLocal(idxs);
});
DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionMapToGlobal")
.set_body([] (DGLArgs args, DGLRetValue* rv) {
NDArrayPartitionRef part = args[0];
IdArray idxs = args[1];
const int part_id = args[2];
*rv = part->MapToGlobal(idxs, part_id);
});
} // namespace partition
} // namespace dgl
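As a hedged illustration of `PartSize`, the formula hands the remainder rows to the leading parts; for example, 10 rows over 3 parts gives sizes 4, 3, 3:

```python
def part_size(array_size, num_parts, part_id):
    # Mirrors RemainderPartition::PartSize: integer division, plus one extra
    # row for each of the first (array_size % num_parts) parts.
    return array_size // num_parts + (part_id < array_size % num_parts)

print([part_size(10, 3, p) for p in range(3)])       # [4, 3, 3]
```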
......@@ -9,6 +9,7 @@
#define DGL_PARTITION_NDARRAY_PARTITION_H_
#include <dgl/runtime/object.h>
#include <dgl/packed_func_ext.h>
#include <dgl/array.h>
#include <utility>
......@@ -64,6 +65,29 @@ class NDArrayPartition : public runtime::Object {
virtual IdArray MapToLocal(
IdArray in_idx) const = 0;
/**
* @brief Generate the global indices (the numbering unique across all
* processors) from a set of local indices.
*
* @param in_idx The local indices.
* @param part_id The part id.
*
* @return The global indices.
*/
virtual IdArray MapToGlobal(
IdArray in_idx,
int part_id) const = 0;
/**
* @brief Get the number of rows/items assigned to the given part.
*
* @param part_id The part id.
*
* @return The size.
*/
virtual int64_t PartSize(
int part_id) const = 0;
/**
* @brief Get the first dimension of the partitioned array.
*
......
......@@ -51,6 +51,25 @@ IdArray MapToLocalFromRemainder(
int num_parts,
IdArray global_idx);
/**
* @brief Generate the set of global indices from the local indices, using
* remainder. That is, for each index `i` in `local_idx`, the global index
* is computed as `local_idx[i] * num_parts + part_id`.
*
* @tparam XPU The type of device to run on.
* @tparam IdType The type of the index.
* @param num_parts The number of parts the array is divided into.
* @param local_idx The array of local indices to map.
* @param part_id The id of the current part.
*
* @return The array of global indices.
*/
template <DLDeviceType XPU, typename IdType>
IdArray MapToGlobalFromRemainder(
int num_parts,
IdArray local_idx,
int part_id);
} // namespace impl
} // namespace partition
......
......@@ -138,10 +138,6 @@ std::pair<IdArray, NDArray> SparsePush(
IdArray in_idx,
NDArray in_value,
NDArrayPartitionRef part) {
CHECK_EQ(in_idx->shape[0], in_value->shape[0]) <<
"Leading dimension of indices (" << in_idx->shape[0] << ") must match "
"leading dimension of values (" << in_value->shape[0] << ").";
const auto& ctx = in_idx->ctx;
CHECK_EQ(ctx, in_value->ctx) << "Indices and values must be on the same "
"device";
......@@ -150,9 +146,15 @@ std::pair<IdArray, NDArray> SparsePush(
// TODO(dlasalle): Get the stream from the device context.
cudaStream_t stream = 0;
CHECK_EQ(in_idx->ndim, 1) << "Indices must be 1-dimensional";
CHECK_LE(in_idx->ndim, 1) << "The tensor of sending indices must be of "
"dimension one (or empty).";
const int64_t num_in = in_idx->ndim > 0 ? in_idx->shape[0] : 0;
CHECK_EQ(num_in, in_value->ndim > 0 ? in_value->shape[0] : 0) <<
"Leading dimension of indices (" << num_in << ") must match "
"leading dimension of values (" <<
(in_value->ndim > 0 ? in_value->shape[0] : 0) << ").";
const int64_t num_in = in_idx->shape[0];
int64_t num_feat = 1;
for (int d = 1; d < in_value->ndim; ++d) {
num_feat *= in_value->shape[d];
......@@ -297,10 +299,9 @@ NDArray SparsePull(
cudaStream_t stream = CUDAThreadEntry::ThreadLocal()->stream;
CHECK_EQ(req_idx->ndim, 1) << "The tensor of requested indices must be of "
"dimension one.";
const int64_t num_in = req_idx->shape[0];
CHECK_LE(req_idx->ndim, 1) << "The tensor of requested indices must be of "
"dimension one (or empty).";
const int64_t num_in = req_idx->ndim > 0 ? req_idx->shape[0] : 0;
int64_t num_feat = 1;
for (int d = 1; d < local_tensor->ndim; ++d) {
num_feat *= local_tensor->shape[d];
......@@ -328,7 +329,7 @@ NDArray SparsePull(
static_cast<const int64_t*>(part_perm.second->data);
// permute requests
{
if (num_in > 0) {
const dim3 block(256);
const dim3 grid((num_in+block.x-1)/block.x);
......
#include <gtest/gtest.h>
#include "../../src/partition/ndarray_partition.h"
#include "./common.h"
using namespace dgl;
......@@ -7,17 +8,16 @@ using namespace dgl::partition;
template<DLDeviceType XPU, typename IdType>
void _TestRemainder()
{
void _TestRemainder_GeneratePermutation() {
const int64_t size = 160000;
const int num_parts = 7;
NDArrayPartitionRef part = CreatePartitionRemainderBased(
size, num_parts);
IdArray idxs = aten::Range(0, size/10, sizeof(IdType)*8,
DGLContext{XPU, 0});
DGLContext{XPU, 0});
std::pair<IdArray, IdArray> result = part->GeneratePermutation(idxs);
std::pair<IdArray, IdArray> result = part->GeneratePermutation(idxs);
// first part of result should be the permutation
IdArray perm = result.first.CopyTo(DGLContext{kDLCPU, 0});
......@@ -48,10 +48,43 @@ void _TestRemainder()
}
}
template<DLDeviceType XPU, typename IdType>
void _TestRemainder_MapToX() {
const int64_t size = 160000;
const int num_parts = 7;
NDArrayPartitionRef part = CreatePartitionRemainderBased(
size, num_parts);
for (int part_id = 0; part_id < num_parts; ++part_id) {
IdArray local = aten::Range(0, part->PartSize(part_id), sizeof(IdType)*8,
DGLContext{XPU, 0});
IdArray global = part->MapToGlobal(local, part_id);
IdArray act_local = part->MapToLocal(global).CopyTo(CPU);
// every global index should have the same remainder as the part id
ASSERT_EQ(global->shape[0], local->shape[0]);
global = global.CopyTo(CPU);
for (size_t i = 0; i < global->shape[0]; ++i) {
EXPECT_EQ(Ptr<IdType>(global)[i] % num_parts, part_id) << "i=" << i <<
", num_parts=" << num_parts << ", part_id=" << part_id;
}
// the remapped local indices should match the original
local = local.CopyTo(CPU);
ASSERT_EQ(local->shape[0], act_local->shape[0]);
for (size_t i = 0; i < act_local->shape[0]; ++i) {
EXPECT_EQ(Ptr<IdType>(local)[i], Ptr<IdType>(act_local)[i]);
}
}
}
TEST(PartitionTest, TestRemainderPartition) {
#ifdef DGL_USE_CUDA
_TestRemainder<kDLGPU, int32_t>();
_TestRemainder<kDLGPU, int64_t>();
_TestRemainder_GeneratePermutation<kDLGPU, int32_t>();
_TestRemainder_GeneratePermutation<kDLGPU, int64_t>();
_TestRemainder_MapToX<kDLGPU, int32_t>();
_TestRemainder_MapToX<kDLGPU, int64_t>();
#endif
// CPU is not implemented
......
......@@ -144,6 +144,9 @@ def start_sparse_adam_worker(rank, world_size, has_zero_grad=False, num_embs=128
@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
@pytest.mark.parametrize("num_workers", [2, 4, 8])
def test_multiprocess_sparse_adam(num_workers):
if F.ctx().type == 'cuda' and th.cuda.device_count() < num_workers:
pytest.skip("Not enough GPUs to run test.")
worker_list = []
ctx = mp.get_context('spawn')
......@@ -159,6 +162,9 @@ def test_multiprocess_sparse_adam(num_workers):
@unittest.skipIf(os.name == 'nt', reason='Do not support windows yet')
@pytest.mark.parametrize("num_workers", [2, 4, 8])
def test_multiprocess_sparse_adam_zero_step(num_workers):
if F.ctx().type == 'cuda' and th.cuda.device_count() < num_workers:
pytest.skip("Not enough GPUs to run test.")
worker_list = []
ctx = mp.get_context('spawn')
......