Unverified commit 8d5d8962 authored by Xin Yao, committed by GitHub

[Refactor] Replace third_party/nccl with PyTorch's NCCL backend (#4989)

* expose GeneratePermutation

* add sparse_all_to_all_push

* add sparse_all_to_all_pull

* add unit test

* handle world_size=1

* remove python nccl wrapper

* remove the nccl dependency

* use pinned memory to speedup D2H copy

* fix lint

* resolve comments

* fix lint

* fix ut

* resolve comments
parent b1ec112e
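At a glance, callers now rely on an already-initialized `torch.distributed` NCCL process group instead of the removed `dgl.cuda.nccl.Communicator` wrapper. Below is a minimal single-process sketch of the new calling convention, adapted from the unit tests added in this change; the rendezvous address, tensor sizes, and shapes are illustrative only.

```python
import torch
import torch.distributed as dist

from dgl.cuda import nccl
from dgl.partition import NDArrayPartition

# Single-process setup, mirroring the updated unit tests.
torch.cuda.set_device("cuda:0")
dist.init_process_group(
    backend="nccl",
    init_method="tcp://127.0.0.1:12345",
    world_size=1,
    rank=0,
)

num_nodes = 10000
part = NDArrayPartition(num_nodes, dist.get_world_size(), mode="remainder")

# Push (index, gradient) pairs to the ranks that own them.
idx = torch.randint(0, num_nodes, (1000,), device="cuda:0")
grad = torch.rand(1000, 16, device="cuda:0")
recv_idx, recv_grad = nccl.sparse_all_to_all_push(idx, grad, part)

# Pull rows of a locally stored tensor for a set of requested global indices.
# With world_size == 1 the local tensor covers all node ids.
local_feat = torch.rand(num_nodes, 16, device="cuda:0")
feats = nccl.sparse_all_to_all_pull(idx, local_feat, part)

dist.destroy_process_group()
```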
......@@ -22,9 +22,6 @@
[submodule "third_party/nanoflann"]
path = third_party/nanoflann
url = https://github.com/jlblancoc/nanoflann
[submodule "third_party/nccl"]
path = third_party/nccl
url = https://github.com/nvidia/nccl
[submodule "third_party/libxsmm"]
path = third_party/libxsmm
url = https://github.com/hfp/libxsmm.git
......
......@@ -23,8 +23,6 @@ endif()
# and add set(OPTION VALUE) to override these build options.
# Alternatively, use cmake -DOPTION=VALUE through command-line.
dgl_option(USE_CUDA "Build with CUDA" OFF)
dgl_option(USE_NCCL "Build with NCCL support" OFF)
dgl_option(USE_SYSTEM_NCCL "Build using system's NCCL library" OFF)
dgl_option(USE_OPENMP "Build with OpenMP" ON)
dgl_option(USE_AVX "Build with AVX optimization" OFF)
dgl_option(USE_LIBXSMM "Build with LIBXSMM library optimization" ON)
......@@ -171,25 +169,7 @@ list(APPEND DGL_SRC ${DGL_RPC_SRC})
if(USE_CUDA)
dgl_config_cuda(DGL_CUDA_SRC)
list(APPEND DGL_SRC ${DGL_CUDA_SRC})
if(USE_NCCL)
add_definitions(-DDGL_USE_NCCL)
if (USE_SYSTEM_NCCL)
include(cmake/util/FindNccl.cmake)
include_directories(${NCCL_INCLUDE_DIR})
else()
include(cmake/modules/NCCL.cmake)
cuda_include_directories(BEFORE ${NCCL_INCLUDE_DIR})
endif()
endif(USE_NCCL)
list(APPEND DGL_LINKER_LIBS ${NCCL_LIBRARY})
endif(USE_CUDA)
if(USE_CUDA)
cuda_add_library(dgl SHARED ${DGL_SRC})
if (USE_NCCL AND NOT USE_SYSTEM_NCCL)
add_dependencies(dgl nccl_external)
endif()
else(USE_CUDA)
add_library(dgl SHARED ${DGL_SRC})
endif(USE_CUDA)
......
include(ExternalProject)
# set path to submodule
set(NCCL_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/nccl")
# NCCL doesn't have CMAKE, so build externally
ExternalProject_Add(nccl_external
SOURCE_DIR ${PROJECT_SOURCE_DIR}/third_party/nccl
BUILD_IN_SOURCE 1
CONFIGURE_COMMAND ""
BUILD_COMMAND
env
make
"src.build"
"-j"
"BUILDDIR=${NCCL_BUILD_DIR}"
BUILD_BYPRODUCTS "${NCCL_BUILD_DIR}/lib/libnccl_static.a"
INSTALL_COMMAND ""
)
# set output variables
set(NCCL_FOUND TRUE)
set(NCCL_LIBRARY "${NCCL_BUILD_DIR}/lib/libnccl_static.a")
set(NCCL_INCLUDE_DIR "${NCCL_BUILD_DIR}/include")
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Tries to find NCCL headers and libraries.
#
# Usage of this module as follows:
#
# find_package(NCCL)
#
# Variables used by this module, they can change the default behaviour and need
# to be set before calling find_package:
#
# NCCL_ROOT - When set, this path is inspected instead of standard library
# locations as the root of the NCCL installation.
# The environment variable NCCL_ROOT overrides this variable.
#
# This module defines
# Nccl_FOUND, whether nccl has been found
# NCCL_INCLUDE_DIR, directory containing header
# NCCL_LIBRARY, directory containing nccl library
# NCCL_LIB_NAME, nccl library name
# USE_NCCL_LIB_PATH, when set, NCCL_LIBRARY path is also inspected for the
# location of the nccl library. This would disable
# switching between static and shared.
#
# This module assumes that the user has already called find_package(CUDA)
#
# This file is from https://github.com/dmlc/xgboost, with modifications to
# check the version.
if (NCCL_LIBRARY)
if(NOT USE_NCCL_LIB_PATH)
# Don't cache NCCL_LIBRARY to enable switching between static and shared.
unset(NCCL_LIBRARY CACHE)
endif(NOT USE_NCCL_LIB_PATH)
endif()
if (BUILD_WITH_SHARED_NCCL)
# libnccl.so
set(NCCL_LIB_NAME nccl)
else ()
# libnccl_static.a
set(NCCL_LIB_NAME nccl_static)
endif (BUILD_WITH_SHARED_NCCL)
find_path(NCCL_INCLUDE_DIR
NAMES nccl.h
PATHS $ENV{NCCL_ROOT}/include ${NCCL_ROOT}/include)
# make sure it has point to point support
file(STRINGS "${NCCL_INCLUDE_DIR}/nccl.h" NCCL_VERSION_CODE REGEX "^#define[ \t]+NCCL_VERSION_CODE[ \t]+[0-9]+.*$" LIMIT_COUNT 1)
string(REGEX REPLACE "^.*NCCL_VERSION_CODE[ \t]+([0-9]+).*$" "\\1" NCCL_VERSION "${NCCL_VERSION_CODE}")
find_library(NCCL_LIBRARY
NAMES ${NCCL_LIB_NAME}
PATHS $ENV{NCCL_ROOT}/lib/ ${NCCL_ROOT}/lib)
if ("${NCCL_VERSION}" LESS "2700")
message(FATAL_ERROR "Require nccl >= 2700, but found ${NCCL_LIBRARY}==${NCCL_VERSION}")
else()
message(STATUS "Using nccl library: ${NCCL_LIBRARY} ${NCCL_VERSION}")
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Nccl DEFAULT_MSG
NCCL_INCLUDE_DIR NCCL_LIBRARY)
mark_as_advanced(
NCCL_INCLUDE_DIR
NCCL_LIBRARY
)
""" CUDA wrappers """
from . import nccl
from .. import backend as F
if F.get_preferred_backend() == "pytorch":
from . import nccl
"""API creating NCCL communicators."""
"""API wrapping NCCL primitives."""
from .. import backend as F
from .._ffi.function import _init_api
import torch
import torch.distributed as dist
_COMM_MODES_MAP = {"remainder": 0}
class UniqueId(object):
"""Class for allowing python code to create and communicate NCCL Unique
IDs, needed for creating communicators.
"""
def __init__(self, id_str=None):
"""Create an object reference the current NCCL unique id."""
if id_str:
if isinstance(id_str, bytes):
id_str = id_str.decode("utf-8")
self._handle = _CAPI_DGLNCCLUniqueIdFromString(id_str)
else:
self._handle = _CAPI_DGLNCCLGetUniqueId()
def get(self):
"""Get the C-handle for this object."""
return self._handle
def __str__(self):
return _CAPI_DGLNCCLUniqueIdToString(self._handle)
def __repr__(self):
return "UniqueId[{}]".format(str(self))
def __eq__(self, other):
return str(self) == str(other)
class Communicator(object):
"""High-level wrapper for NCCL communication."""
def __init__(self, size, rank, unique_id):
"""Create a new NCCL communicator.
Parameters
----------
size : int
The number of processes in the communicator.
rank : int
The rank of the current process in the communicator.
unique_id : NCCLUniqueId
The unique id of the root process (rank=0).
Examples
--------
>>> from dgl.cuda.nccl import Communicator, UniqueId
The root process will generate a unique NCCL id and communicate it
to the other processes.
>>> uid = UniqueId()
>>> store.set('nccl_root_id', str(uid))
And all other processes create the unique id from the root process.
>>> uid = UniqueId(store.get('nccl_root_id'))
Then, all processes should create the communicator.
>>> comm = Communicator(world_size, rank, uid)
"""
assert rank < size, (
"The rank of a process must be less than the "
"size of the communicator."
)
self._handle = _CAPI_DGLNCCLCreateComm(size, rank, unique_id.get())
self._rank = rank
self._size = size
def sparse_all_to_all_push(self, idx, value, partition):
def sparse_all_to_all_push(idx, value, partition):
"""Perform an all-to-all-v operation, where by all processors send out
a set of indices and corresponding values. Indices and values,
corresponding to the current process, will copied into the output
arrays.
Note: This method requires 'torch.distributed.get_backend() == "nccl"'.
Parameters
----------
idx : tensor
idx : torch.Tensor
The 1D set of indices to send to other processors.
value : tensor
value : torch.Tensor
The multi-dimension set of values to send to other processors.
The first dimension must match that of `idx`.
partition : NDArrayPartition
......@@ -95,9 +25,9 @@ class Communicator(object):
Returns
-------
tensor
torch.Tensor
The 1D tensor of the received indices.
tensor
torch.Tensor
The set of received values.
Examples
......@@ -108,7 +38,7 @@ class Communicator(object):
striped across processes can be generated via:
>>> from dgl.partition import NDArrayPartition
>>> part = NDArrayPartition(g.num_nodes(), comm.size(), mode='remainder' )
>>> part = NDArrayPartition(g.num_nodes(), world_size, mode='remainder')
With this partition, each processor can send values to be associated
with vertices in the graph. So if we have an array `global_idxs` of all of
......@@ -116,7 +46,7 @@ class Communicator(object):
`global_values` containing the new values associated with the neighbors,
we communicate them to the owning processes via:
>>> my_idxs, my_values = comm.sparse_all_to_all_push(global_idxs, global_values, part)
>>> my_idxs, my_values = nccl.sparse_all_to_all_push(global_idxs, global_values, part)
This communication pattern is common when communicating gradient
updates for node embeddings.
......@@ -128,26 +58,54 @@ class Communicator(object):
indices for process 0 of '[0, 8, 10, 0, 2, 4, 8, 8]', and for
process 1 of '[3, 9, 5, 9]'.
"""
out_idx, out_value = _CAPI_DGLNCCLSparseAllToAllPush(
self.get(),
F.zerocopy_to_dgl_ndarray(idx),
F.zerocopy_to_dgl_ndarray(value),
partition.get(),
)
return (
F.zerocopy_from_dgl_ndarray(out_idx),
F.zerocopy_from_dgl_ndarray(out_value),
if not dist.is_initialized() or dist.get_world_size() == 1:
return idx, value
assert (
dist.get_backend() == "nccl"
), "requires NCCL backend to communicate CUDA tensors."
perm, send_splits = partition.generate_permutation(idx)
perm = perm.long()
# Get receive splits.
recv_splits = torch.empty_like(send_splits)
dist.all_to_all_single(recv_splits, send_splits)
# Use pinned memory to speed up the D2H copy.
recv_splits = recv_splits.to("cpu", non_blocking=True)
send_splits = send_splits.to("cpu", non_blocking=True)
send_idx = idx[perm]
send_value = value[perm]
# Wait for the D2H copy to finish.
torch.cuda.current_stream().synchronize()
recv_sum = recv_splits.sum()
recv_splits = recv_splits.tolist()
send_splits = send_splits.tolist()
# Send idx.
recv_idx = torch.empty((recv_sum,), dtype=idx.dtype, device=idx.device)
dist.all_to_all_single(recv_idx, send_idx, recv_splits, send_splits)
# Send value.
recv_value = torch.empty(
(recv_sum, *value.shape[1:]), dtype=value.dtype, device=value.device
)
dist.all_to_all_single(recv_value, send_value, recv_splits, send_splits)
return recv_idx, recv_value
def sparse_all_to_all_pull(self, req_idx, value, partition):
def sparse_all_to_all_pull(req_idx, value, partition):
"""Perform an all-to-all-v operation, where by all processors request
the values corresponding to their set of indices.
Note: This method requires 'torch.distributed.get_backend() == "nccl"'.
Parameters
----------
req_idx : IdArray
req_idx : torch.Tensor
The set of indices this processor is requesting.
value : NDArray
value : torch.Tensor
The multi-dimension set of values that can be requested from
this processor.
partition : NDArrayPartition
......@@ -156,7 +114,7 @@ class Communicator(object):
Returns
-------
tensor
torch.Tensor
The set of received values, corresponding to `req_idx`.
Examples
......@@ -167,7 +125,7 @@ class Communicator(object):
striped across processes can be generated via:
>>> from dgl.partition import NDArrayPartition
>>> part = NDArrayPartition(g.num_nodes(), comm.size(), mode='remainder' )
>>> part = NDArrayPartition(g.num_nodes(), world_size, mode='remainder')
With this partition, each processor can request values/features
associated with vertices in the graph. So in the case where we have
......@@ -175,7 +133,7 @@ class Communicator(object):
has a tensor 'node_feat' storing the features of nodes it owns in
the partition, the features can be requested via:
>>> nbr_values = comm.sparse_all_to_all_pull(nbr_idxs, node_feat, part)
>>> nbr_values = nccl.sparse_all_to_all_pull(nbr_idxs, node_feat, part)
Then the two arrays 'nbr_idxs' and 'nbr_values' form the sparse
set of features, where 'nbr_idxs[i]' is the global node id, and
......@@ -183,48 +141,49 @@ class Communicator(object):
communication pattern is useful for node features or node
embeddings.
"""
out_value = _CAPI_DGLNCCLSparseAllToAllPull(
self.get(),
F.zerocopy_to_dgl_ndarray(req_idx),
F.zerocopy_to_dgl_ndarray(value),
partition.get(),
if not dist.is_initialized() or dist.get_world_size() == 1:
return value[req_idx.long()]
assert (
dist.get_backend() == "nccl"
), "requires NCCL backend to communicate CUDA tensors."
perm, req_splits = partition.generate_permutation(req_idx)
perm = perm.long()
# Get response splits.
resp_splits = torch.empty_like(req_splits)
dist.all_to_all_single(resp_splits, req_splits)
# Use pinned memory to speed up the D2H copy.
resp_splits = resp_splits.to("cpu", non_blocking=True)
req_splits = req_splits.to("cpu", non_blocking=True)
req_idx = req_idx[perm]
# Wait for the D2H copy to finish.
torch.cuda.current_stream().synchronize()
resp_sum = resp_splits.sum()
resp_splits = resp_splits.tolist()
req_splits = req_splits.tolist()
# Gather requested indices.
resp_idx = torch.empty(
(resp_sum,), dtype=req_idx.dtype, device=req_idx.device
)
return F.zerocopy_from_dgl_ndarray(out_value)
def get(self):
"""Get the C-Handle for this object."""
return self._handle
def rank(self):
"""Get the rank of this process in this communicator.
dist.all_to_all_single(resp_idx, req_idx, resp_splits, req_splits)
Returns
-------
int
The rank of this process.
"""
return self._rank
# Convert requested indices to local indices depending on partition.
if resp_sum > 0:
resp_idx = partition.map_to_local(resp_idx)
def size(self):
"""Get the size of this communicator.
Returns
-------
int
The number of processes in this communicator.
"""
return self._size
def is_supported():
"""Check if DGL was built with NCCL support.
Returns
-------
bool
True if NCCL support was built in.
"""
return _CAPI_DGLNCCLHasSupport()
# Collect the requested values.
req_value = torch.empty(
(req_idx.size(0), *value.shape[1:]),
dtype=value.dtype,
device=value.device,
)
dist.all_to_all_single(req_value, value[resp_idx], req_splits, resp_splits)
# Permute the value back into the requested order.
return_value = torch.empty_like(req_value)
return_value[perm] = req_value
_init_api("dgl.cuda.nccl")
return return_value
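The docstring's two-process push example can be exercised end to end with `torch.multiprocessing`; the sketch below is a rough illustration only, assuming two visible GPUs and a free local port, and the order of the received pairs within each rank is not guaranteed.

```python
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

from dgl.cuda import nccl
from dgl.partition import NDArrayPartition


def run(rank, world_size):
    torch.cuda.set_device(rank)
    dist.init_process_group(
        backend="nccl",
        init_method="tcp://127.0.0.1:29500",
        world_size=world_size,
        rank=rank,
    )
    # Remainder partition over 10 nodes: rank 0 owns even ids, rank 1 odd ids.
    part = NDArrayPartition(10, world_size, mode="remainder")
    idx = torch.tensor([[0, 3, 7], [2, 5, 6]][rank], device=f"cuda:{rank}")
    grad = torch.full((3, 4), float(rank + 1), device=f"cuda:{rank}")
    # Each rank receives only the (idx, grad) pairs for nodes it owns,
    # e.g. rank 0 ends up with ids {0, 2, 6} and rank 1 with {3, 5, 7}.
    recv_idx, recv_grad = nccl.sparse_all_to_all_push(idx, grad, part)
    print(rank, recv_idx.tolist())
    dist.destroy_process_group()


if __name__ == "__main__":
    mp.spawn(run, args=(2,), nprocs=2)
```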
......@@ -9,7 +9,6 @@ from ...partition import NDArrayPartition
from ...utils import create_shared_mem_array, get_shared_mem_array
_STORE = None
_COMM = None
class NodeEmbedding: # NodeEmbedding
......@@ -78,7 +77,6 @@ class NodeEmbedding: # NodeEmbedding
partition=None,
):
global _STORE
global _COMM
if device is None:
device = th.device("cpu")
......@@ -132,25 +130,7 @@ class NodeEmbedding: # NodeEmbedding
)
self._tensor = emb
else: # embeddings is stored in GPU memory.
# setup nccl communicator
if _COMM is None:
if rank < 0:
_COMM = nccl.Communicator(1, 0, nccl.UniqueId())
else:
# needs to be set for nccl to work
th.cuda.set_device(device)
if rank == 0:
# root process broadcasts nccl id
nccl_id = nccl.UniqueId()
self._store.set("nccl_root_id_sparse_emb", str(nccl_id))
else:
nccl_id = nccl.UniqueId(
self._store.get("nccl_root_id_sparse_emb")
)
_COMM = nccl.Communicator(
self._world_size, self._rank, nccl_id
)
self._comm = _COMM
self._comm = True
if not self._partition:
# for communication we need a partition
......@@ -161,7 +141,7 @@ class NodeEmbedding: # NodeEmbedding
)
# create local tensors for the weights
local_size = self._partition.local_size(self._comm.rank())
local_size = self._partition.local_size(max(self._rank, 0))
# TODO(dlasalle): support 16-bit/half embeddings
emb = th.empty(
......@@ -187,15 +167,15 @@ class NodeEmbedding: # NodeEmbedding
device : th.device
Target device to put the collected embeddings.
"""
if not self._comm or self._comm.size() == 1:
if not self._comm:
# embeddings are stored on the CPU
emb = self._tensor[node_ids].to(device)
else:
if self.world_size > 0:
emb = self._comm.sparse_all_to_all_pull(
# embeddings are stored on the GPU
# the following method also covers self._world_size = 0 or 1
emb = nccl.sparse_all_to_all_pull(
node_ids, self._tensor, self._partition
)
else:
emb = self._tensor[node_ids]
emb = emb.to(device)
if F.is_recording():
emb = F.attach_grad(emb)
......@@ -215,18 +195,6 @@ class NodeEmbedding: # NodeEmbedding
"""
return self._store
@property
def comm(self):
"""Return dgl.cuda.nccl.Communicator for data
sharing across processes.
Returns
-------
dgl.cuda.nccl.Communicator
Communicator used for data sharing.
"""
return self._comm
@property
def partition(self):
"""Return the partition identifying how the tensor is split across
......@@ -361,7 +329,8 @@ class NodeEmbedding: # NodeEmbedding
if self._partition:
idxs = F.copy_to(
self._partition.get_local_indices(
self._comm.rank(), ctx=F.context(self._tensor)
max(self._rank, 0),
ctx=F.context(self._tensor),
),
F.context(values),
)
......
......@@ -63,7 +63,6 @@ class SparseGradOptimizer(abc.ABC):
), "MultiGPU world_size for each embedding should be same."
assert not self._rank is None
assert not self._world_size is None
self._nccl_root_id = "SparseGradOptimizer.nccl_root_id"
def step(self):
"""The step function.
......@@ -74,7 +73,7 @@ class SparseGradOptimizer(abc.ABC):
if self._first_step:
for emb in self._params:
for _, data in emb._trace:
if data.grad.data.device.type == "cuda":
if data.grad.device.type == "cuda":
# create a communicator
if self._device:
assert (
......@@ -116,27 +115,7 @@ class SparseGradOptimizer(abc.ABC):
"""
def _comm_setup(self):
# find a store to communicate the unique id through
if len(self._params) > 0:
store = self._params[0].store
if self._rank < 0:
self._comm = nccl.Communicator(1, 0, nccl.UniqueId())
else:
th.cuda.set_device(self._device)
if self._rank == 0:
# root process broadcasts nccl id
nccl_id = nccl.UniqueId()
uid = str(nccl_id)
store.set(self._nccl_root_id, uid)
else:
uid = store.get(self._nccl_root_id)
nccl_id = nccl.UniqueId(uid)
# needs to be set for nccl to work
self._comm = nccl.Communicator(
self._world_size, self._rank, nccl_id
)
th.distributed.barrier()
self._comm = True
def _shared_setup(self):
for emb in self._params:
......@@ -162,7 +141,6 @@ class SparseGradOptimizer(abc.ABC):
self._opt_meta[emb_name] = opt_meta
def _comm_step(self):
comm = self._comm
with th.no_grad():
idx_in = {}
grad_in = {}
......@@ -203,7 +181,7 @@ class SparseGradOptimizer(abc.ABC):
(
idx_in[emb_name],
grad_in[emb_name],
) = comm.sparse_all_to_all_push(idx, grad, partition=partition)
) = nccl.sparse_all_to_all_push(idx, grad, partition=partition)
if emb.partition:
# if the embedding is partitioned, map back to indexes
# into the local tensor
......
......@@ -592,5 +592,44 @@ class NDArrayPartition(object):
)
)
def generate_permutation(self, idxs):
"""Produce a scheme that maps the given indices to separate partitions
and the counts of how many indices are in each partition.
Parameters
----------
idxs: torch.Tensor.
A tensor with shape (`num_indices`,), representing global indices.
Returns
-------
torch.Tensor.
A tensor with shape (`num_indices`,), representing the permutation
to re-order the indices by partition.
torch.Tensor.
A tensor with shape (`num_partition`,), representing the number of
indices per partition.
Examples
--------
>>> import torch
>>> from dgl.partition import NDArrayPartition
>>> part = NDArrayPartition(10, 2, mode="remainder")
>>> idx = torch.tensor([0, 2, 4, 5, 8, 8, 9], device="cuda:0")
>>> perm, splits_sum = part.generate_permutation(idx)
>>> perm
tensor([0, 1, 2, 4, 5, 3, 6], device='cuda:0')
>>> splits_sum
tensor([5, 2], device='cuda:0')
"""
ret = _CAPI_DGLNDArrayPartitionGeneratePermutation(
self._partition, F.zerocopy_to_dgl_ndarray(idxs)
)
return F.zerocopy_from_dgl_ndarray(ret(0)), F.zerocopy_from_dgl_ndarray(
ret(1)
)
_init_api("dgl.partition")
/**
* Copyright (c) 2018 by Contributors
* @file c_runtime_api.cc
* @file c_api_common.cc
* @brief DGL C API common implementations
*/
#include "c_api_common.h"
......
......@@ -12,6 +12,7 @@
#include <memory>
#include <utility>
#include "../c_api_common.h"
#include "partition_op.h"
using namespace dgl::runtime;
......@@ -251,5 +252,15 @@ DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionMapToGlobal")
*rv = part->MapToGlobal(idxs, part_id);
});
DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionGeneratePermutation")
.set_body([](DGLArgs args, DGLRetValue* rv) {
NDArrayPartitionRef part = args[0];
IdArray idxs = args[1];
std::pair<IdArray, NDArray> part_perm = part->GeneratePermutation(idxs);
*rv =
ConvertNDArrayVectorToPackedFunc({part_perm.first, part_perm.second});
});
} // namespace partition
} // namespace dgl
/**
* Copyright (c) 2021-2022 by Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* @file nccl_api.cu
* @brief Implementation of wrapper around NCCL routines.
*/
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <dgl/array.h>
#include <dgl/aten/array_ops.h>
#include <dgl/packed_func_ext.h>
#include <dgl/runtime/container.h>
#include <dgl/runtime/device_api.h>
#include <dgl/runtime/registry.h>
#include <algorithm>
#include <cmath>
#include <iomanip>
#include <limits>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
#include "../../array/cuda/array_index_select.cuh"
#include "../../array/cuda/dgl_cub.cuh"
#include "../../partition/ndarray_partition.h"
#include "../../runtime/workspace.h"
#include "cuda_common.h"
#include "nccl_api.h"
#define NCCL_CALL(func) \
{ \
ncclResult_t result = func; \
if (result != ncclSuccess) { \
LOG(FATAL) << "NCCLError: " #func " failed with error: " << result; \
} \
}
namespace dgl {
using namespace partition;
namespace runtime {
namespace cuda {
namespace {
#ifdef DGL_USE_NCCL
template <typename T>
ncclDataType_t NCCLType();
template <>
ncclDataType_t NCCLType<int32_t>() {
return ncclInt32;
}
template <>
ncclDataType_t NCCLType<int64_t>() {
return ncclInt64;
}
template <>
ncclDataType_t NCCLType<__half>() {
return ncclHalf;
}
template <>
ncclDataType_t NCCLType<float>() {
return ncclFloat32;
}
template <>
ncclDataType_t NCCLType<double>() {
return ncclFloat64;
}
#endif // DGL_USE_NCCL
template <typename IdType, typename DType>
__global__ void _DualPermKernel(
const IdType* const in_idx, const DType* const in_value,
const IdType* const perm, const int64_t num_in, const int64_t num_feat,
IdType* const out_idx, DType* const out_value) {
// set index permutation
const int64_t tidx =
blockDim.x * static_cast<int64_t>(blockIdx.x) + threadIdx.x;
if (tidx < num_in) {
const IdType perm_idx = perm[tidx];
assert(perm_idx < num_in);
out_idx[tidx] = in_idx[perm_idx];
}
if (num_feat > 1) {
for (int d = 0; d < blockDim.x; ++d) {
const int64_t bidx = blockDim.x * static_cast<int64_t>(blockIdx.x) + d;
if (bidx < num_in) {
const IdType perm_idx = perm[bidx];
for (int64_t f = threadIdx.x; f < num_feat; f += blockDim.x) {
out_value[bidx * num_feat + f] = in_value[perm_idx * num_feat + f];
}
}
}
} else {
if (tidx < num_in) {
const IdType perm_idx = perm[tidx];
out_value[tidx] = in_value[perm_idx];
}
}
}
template <typename DType, typename IdType>
__global__ void _InversePermKernel(
const DType* const array, const int64_t num_feat, int64_t length,
const IdType* const perm, DType* const out) {
int64_t in_row = blockIdx.x * blockDim.y + threadIdx.y;
const int64_t stride = blockDim.y * gridDim.x;
while (in_row < length) {
int64_t col = threadIdx.x;
const int64_t out_row = perm[in_row];
while (col < num_feat) {
out[out_row * num_feat + col] = array[in_row * num_feat + col];
col += blockDim.x;
}
in_row += stride;
}
}
template <typename IdType, typename DType>
std::pair<IdArray, NDArray> SparsePush(
NCCLCommunicatorRef comm, IdArray in_idx, NDArray in_value,
NDArrayPartitionRef part) {
const auto& ctx = in_idx->ctx;
CHECK_EQ(ctx, in_value->ctx) << "Indices and values must be on the same "
"device";
auto device = DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
CHECK_LE(in_idx->ndim, 1) << "The tensor of sending indices must be of "
"dimension one (or empty).";
const int64_t num_in = in_idx->ndim > 0 ? in_idx->shape[0] : 0;
CHECK_EQ(num_in, in_value->ndim > 0 ? in_value->shape[0] : 0)
<< "Leading dimension of indices (" << num_in
<< ") must match "
"leading dimension of values ("
<< (in_value->ndim > 0 ? in_value->shape[0] : 0) << ").";
int64_t num_feat = 1;
for (int d = 1; d < in_value->ndim; ++d) {
num_feat *= in_value->shape[d];
}
const int64_t comm_size = comm->size();
if (comm_size == 1) {
// nothing to do, just return original arrays
return std::pair<IdArray, NDArray>(in_idx, in_value);
}
std::pair<IdArray, NDArray> part_perm = part->GeneratePermutation(in_idx);
const IdType* const perm = static_cast<const IdType*>(part_perm.first->data);
const int64_t* const send_sum =
static_cast<const int64_t*>(part_perm.second->data);
Workspace<IdType> send_idx(device, ctx, num_in);
Workspace<DType> send_value(device, ctx, num_in * num_feat);
// permute the indices and values
if (num_in > 0) {
const dim3 block(256);
const dim3 grid((num_in + block.x - 1) / block.x);
CUDA_KERNEL_CALL(
_DualPermKernel, grid, block, 0, stream,
static_cast<const IdType*>(in_idx->data),
static_cast<const DType*>(in_value->data), perm, num_in, num_feat,
send_idx.get(), send_value.get());
}
// compute the prefix sum of the send values
Workspace<int64_t> send_prefix(device, ctx, comm_size + 1);
{
size_t prefix_workspace_size;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
nullptr, prefix_workspace_size, send_sum, send_prefix.get(),
comm_size + 1, stream));
Workspace<void> prefix_workspace(device, ctx, prefix_workspace_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
prefix_workspace.get(), prefix_workspace_size, send_sum,
send_prefix.get(), comm_size + 1, stream));
}
std::vector<int64_t> send_prefix_host(comm_size + 1);
// copy using the same stream (local current stream), no need to sync
device->CopyDataFromTo(
send_prefix.get(), 0, send_prefix_host.data(), 0,
send_prefix_host.size() * sizeof(*send_prefix.get()), ctx,
DGLContext{kDGLCPU, 0},
DGLDataType{kDGLInt, sizeof(*send_prefix.get()) * 8, 1});
send_prefix.free();
CHECK_EQ(send_prefix_host.back(), num_in)
<< "Internal Error: "
"send_prefix_host.back() = "
<< send_prefix_host.back() << ", and num_in = " << num_in;
// communicate the amount to send
Workspace<int64_t> recv_sum(device, ctx, comm_size + 1);
comm->AllToAll(send_sum, recv_sum.get(), 1, stream);
cudaEvent_t d2h;
CUDA_CALL(cudaEventCreate(&d2h));
// compute the prefix sum of the recv values
Workspace<int64_t> recv_prefix(device, ctx, comm_size + 1);
{
size_t prefix_workspace_size;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
nullptr, prefix_workspace_size, recv_sum.get(), recv_prefix.get(),
comm_size + 1, stream));
Workspace<void> prefix_workspace(device, ctx, prefix_workspace_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
prefix_workspace.get(), prefix_workspace_size, recv_sum.get(),
recv_prefix.get(), comm_size + 1, stream));
}
recv_sum.free();
// finally copy the prefix sum down to the host
std::vector<int64_t> recv_prefix_host(comm_size + 1);
// copy using the same stream (local current stream), no need to sync
device->CopyDataFromTo(
recv_prefix.get(), 0, recv_prefix_host.data(), 0,
recv_prefix_host.size() * sizeof(*recv_prefix.get()), ctx,
DGLContext{kDGLCPU, 0},
DGLDataType{kDGLInt, sizeof(*recv_prefix.get()) * 8, 1});
recv_prefix.free();
// use an event to track when copying is done
CUDA_CALL(cudaEventRecord(d2h, stream));
// allocate output space
CUDA_CALL(cudaEventSynchronize(d2h));
CUDA_CALL(cudaEventDestroy(d2h));
IdArray recv_idx =
aten::NewIdArray(recv_prefix_host.back(), ctx, sizeof(IdType) * 8);
std::vector<int64_t> value_shape(in_value->ndim, 0);
value_shape[0] = recv_prefix_host.back();
for (int d = 1; d < in_value->ndim; ++d) {
value_shape[d] = in_value->shape[d];
}
NDArray recv_value = NDArray::Empty(value_shape, in_value->dtype, ctx);
// send data
comm->SparseAllToAll(
send_idx.get(), send_value.get(), num_feat, send_prefix_host.data(),
static_cast<IdType*>(recv_idx->data),
static_cast<DType*>(recv_value->data), recv_prefix_host.data(), stream);
return std::pair<IdArray, NDArray>(recv_idx, recv_value);
}
template <typename IdType, typename DType>
NDArray SparsePull(
NCCLCommunicatorRef comm, IdArray req_idx, NDArray local_tensor,
NDArrayPartitionRef part) {
const auto& ctx = req_idx->ctx;
CHECK_EQ(ctx, local_tensor->ctx) << "The request indices and set of local "
"values must be on the same device";
auto device = DeviceAPI::Get(ctx);
cudaStream_t stream = runtime::getCurrentCUDAStream();
CHECK_LE(req_idx->ndim, 1) << "The tensor of requested indices must be of "
"dimension one (or empty).";
const int64_t num_in = req_idx->ndim > 0 ? req_idx->shape[0] : 0;
int64_t num_feat = 1;
for (int d = 1; d < local_tensor->ndim; ++d) {
num_feat *= local_tensor->shape[d];
}
const int64_t comm_size = comm->size();
if (comm_size == 1) {
// Just return index selection from current local_tensor
return aten::IndexSelect(local_tensor, req_idx);
}
// First we need to send our requests to other processors. This means
// re-ordering our index array to be contiguous among processors, and
// counting the number of indices we are sending each processor. For now,
// we assume a poorly partitioned graph, and that there exists the
// possibility that each processor could request data from this one.
// the buffer for us to re-order our requests in
Workspace<IdType> send_idx(device, ctx, num_in);
std::pair<IdArray, NDArray> part_perm = part->GeneratePermutation(req_idx);
const IdType* const perm = static_cast<const IdType*>(part_perm.first->data);
const int64_t* const send_sum =
static_cast<const int64_t*>(part_perm.second->data);
// permute requests
if (num_in > 0) {
const dim3 block(256);
const dim3 grid((num_in + block.x - 1) / block.x);
CUDA_KERNEL_CALL(
aten::impl::IndexSelectSingleKernel, grid, block, 0, stream,
static_cast<const IdType*>(req_idx->data), perm, num_in,
req_idx->shape[0], send_idx.get());
}
// compute the prefix sum of the indexes this process is requesting
Workspace<int64_t> request_prefix(device, ctx, comm_size + 1);
{
size_t prefix_workspace_size;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
nullptr, prefix_workspace_size, send_sum, request_prefix.get(),
comm_size + 1, stream));
Workspace<void> prefix_workspace(device, ctx, prefix_workspace_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
prefix_workspace.get(), prefix_workspace_size, send_sum,
request_prefix.get(), comm_size + 1, stream));
}
cudaEvent_t d2h;
CUDA_CALL(cudaEventCreate(&d2h));
std::vector<int64_t> request_prefix_host(comm_size + 1);
// copy using the same stream (local current stream), no need to sync
device->CopyDataFromTo(
request_prefix.get(), 0, request_prefix_host.data(), 0,
request_prefix_host.size() * sizeof(*request_prefix.get()), ctx,
DGLContext{kDGLCPU, 0},
DGLDataType{kDGLInt, sizeof(*request_prefix.get()) * 8, 1});
request_prefix.free();
CHECK_EQ(request_prefix_host.back(), num_in)
<< "Internal Error: "
"request_prefix_host.back() = "
<< request_prefix_host.back() << ", num_in = " << num_in;
// communicate the amount requested
Workspace<int64_t> recv_sum(device, ctx, comm_size + 1);
comm->AllToAll(send_sum, recv_sum.get(), 1, stream);
// compute the prefix sum of the requested indexes
Workspace<int64_t> response_prefix(device, ctx, comm_size + 1);
{
size_t prefix_workspace_size;
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
nullptr, prefix_workspace_size, recv_sum.get(), response_prefix.get(),
comm_size + 1, stream));
Workspace<void> prefix_workspace(device, ctx, prefix_workspace_size);
CUDA_CALL(cub::DeviceScan::ExclusiveSum(
prefix_workspace.get(), prefix_workspace_size, recv_sum.get(),
response_prefix.get(), comm_size + 1, stream));
}
recv_sum.free();
// finally copy the prefix sum down to the host
std::vector<int64_t> response_prefix_host(comm_size + 1);
// copy using the same stream (local current stream), no need to sync
device->CopyDataFromTo(
response_prefix.get(), 0, response_prefix_host.data(), 0,
response_prefix_host.size() * sizeof(*response_prefix.get()), ctx,
DGLContext{kDGLCPU, 0},
DGLDataType{kDGLInt, sizeof(*response_prefix.get()) * 8, 1});
response_prefix.free();
// use an event to track when copying is done
CUDA_CALL(cudaEventRecord(d2h, stream));
// allocate output space
CUDA_CALL(cudaEventSynchronize(d2h));
CUDA_CALL(cudaEventDestroy(d2h));
// gather requested indexes
IdArray recv_idx =
aten::NewIdArray(response_prefix_host.back(), ctx, sizeof(IdType) * 8);
comm->AllToAllV(
send_idx.get(), request_prefix_host.data(),
static_cast<IdType*>(recv_idx->data), response_prefix_host.data(),
stream);
send_idx.free();
// convert requested indices to local indices depending on partition
if (response_prefix_host.back() > 0) {
recv_idx = part->MapToLocal(recv_idx);
}
// and then index select them into place
Workspace<DType> filled_response_value(
device, ctx, response_prefix_host.back() * num_feat);
if (response_prefix_host.back() > 0) {
dim3 block(256, 1);
while (block.x >= 2 * num_feat) {
block.x /= 2;
block.y *= 2;
}
const dim3 grid((response_prefix_host.back() + block.y - 1) / block.y);
CUDA_KERNEL_CALL(
aten::impl::IndexSelectMultiKernel, grid, block, 0, stream,
static_cast<const DType*>(local_tensor->data), num_feat,
static_cast<IdType*>(recv_idx->data), response_prefix_host.back(),
local_tensor->shape[0], filled_response_value.get());
}
// we will collect received values in this array
std::vector<int64_t> value_shape(local_tensor->ndim, 0);
value_shape[0] = request_prefix_host.back();
for (int d = 1; d < local_tensor->ndim; ++d) {
value_shape[d] = local_tensor->shape[d];
}
Workspace<DType> filled_request_value(
device, ctx, request_prefix_host.back() * num_feat);
// multiply the prefixes by the number of features being sent
for (auto& v : request_prefix_host) {
v *= num_feat;
}
for (auto& v : response_prefix_host) {
v *= num_feat;
}
// send the values
comm->AllToAllV(
filled_response_value.get(), response_prefix_host.data(),
filled_request_value.get(), request_prefix_host.data(), stream);
filled_response_value.free();
// finally, we need to permute the values back into the requested order
NDArray result = NDArray::Empty(value_shape, local_tensor->dtype, ctx);
if (num_in > 0) {
dim3 block(256, 1);
while (block.x >= 2 * num_feat) {
block.x /= 2;
block.y *= 2;
}
const dim3 grid((num_in + block.y - 1) / block.y);
CUDA_KERNEL_CALL(
_InversePermKernel, grid, block, 0, stream, filled_request_value.get(),
num_feat, num_in, perm, static_cast<DType*>(result->data));
}
return result;
}
} // namespace
/* NCCLUniqueId **************************************************************/
NCCLUniqueId::NCCLUniqueId() : id_() {
#ifdef DGL_USE_NCCL
// this ID is unique to the process, not to each call of this function
NCCL_CALL(ncclGetUniqueId(&id_));
#else
// when NCCL isn't enabled, use all zeros
std::fill(
id_.internal, id_.internal + NCCL_UNIQUE_ID_BYTES, static_cast<char>(0));
#endif
}
ncclUniqueId NCCLUniqueId::Get() const { return id_; }
std::string NCCLUniqueId::ToString() const {
std::ostringstream oss;
oss << std::hex;
for (size_t b = 0; b < NCCL_UNIQUE_ID_BYTES; ++b) {
const int num = static_cast<uint8_t>(id_.internal[b]);
oss << std::setw(2) << std::setfill('0') << num;
}
std::string result = oss.str();
CHECK_EQ(result.length(), NCCL_UNIQUE_ID_BYTES * 2)
<< "Invalid NCCL ID format: '" << result << "'";
return result;
}
void NCCLUniqueId::FromString(const std::string& str) {
// must be exactly 256 hex characters
CHECK_EQ(str.length(), NCCL_UNIQUE_ID_BYTES * 2)
<< "Invalid NCCL ID format: '" << str << "'";
for (size_t b = 0; b < NCCL_UNIQUE_ID_BYTES; ++b) {
id_.internal[b] = std::strtol(str.substr(b * 2, 2).c_str(), nullptr, 16);
}
}
/* NCCLCommunicator **********************************************************/
NCCLCommunicator::NCCLCommunicator(
const int size, const int rank, ncclUniqueId id)
: comm_(), size_(size), rank_(rank) {
CHECK_LT(rank, size) << "The rank (" << rank
<< ") must be smaller than "
"the size of the communicator ("
<< size << ").";
CHECK_GE(rank, 0) << "The rank (" << rank
<< ") must be greater than or "
"equal to 0.";
#ifdef DGL_USE_NCCL
NCCL_CALL(ncclCommInitRank(&comm_, size_, id, rank_));
#else
CHECK_EQ(size, 1)
<< "Cannot create a communicator of size " << size
<< ". "
"To use a communicator size greater than 1, compile DGL with NCCL "
"support.";
#endif
}
NCCLCommunicator::~NCCLCommunicator() {
#ifdef DGL_USE_NCCL
ncclCommDestroy(comm_);
#endif
}
ncclComm_t NCCLCommunicator::Get() { return comm_; }
template <typename DType>
void NCCLCommunicator::AllToAllV(
const DType* const send, const int64_t* const send_prefix,
DType* const recv, const int64_t* const recv_prefix, cudaStream_t stream) {
#ifdef DGL_USE_NCCL
const ncclDataType_t type = NCCLType<DType>();
NCCL_CALL(ncclGroupStart());
for (int r = 0; r < size_; ++r) {
const int64_t send_size = send_prefix[r + 1] - send_prefix[r];
if (send_size > 0) {
NCCL_CALL(
ncclSend(send + send_prefix[r], send_size, type, r, comm_, stream));
}
const int64_t recv_size = recv_prefix[r + 1] - recv_prefix[r];
if (recv_size > 0) {
NCCL_CALL(
ncclRecv(recv + recv_prefix[r], recv_size, type, r, comm_, stream));
}
}
NCCL_CALL(ncclGroupEnd());
#else
CHECK_EQ(send_prefix[1] - send_prefix[0], recv_prefix[1] - recv_prefix[0])
<< "Send message size must equal receive message size.";
int dev_id;
CUDA_CALL(cudaGetDevice(&dev_id));
DGLContext ctx{kDGLCUDA, dev_id};
auto device = runtime::DeviceAPI::Get(ctx);
auto dtype = DGLDataTypeTraits<DType>::dtype;
// copy using the same stream (local current stream), no need to sync
device->CopyDataFromTo(
send, send_prefix[0], recv, recv_prefix[0],
sizeof(DType) * send_prefix[1] - send_prefix[0], ctx, ctx, dtype);
#endif
}
template void NCCLCommunicator::AllToAllV<int32_t>(
const int32_t* const send, const int64_t* send_prefix, int32_t* const recv,
const int64_t* recv_prefix, cudaStream_t stream);
template void NCCLCommunicator::AllToAllV<int64_t>(
const int64_t* const send, const int64_t* send_prefix, int64_t* const recv,
const int64_t* recv_prefix, cudaStream_t stream);
template void NCCLCommunicator::AllToAllV<float>(
const float* const send, const int64_t* send_prefix, float* const recv,
const int64_t* recv_prefix, cudaStream_t stream);
template void NCCLCommunicator::AllToAllV<__half>(
const __half* const send, const int64_t* send_prefix, __half* const recv,
const int64_t* recv_prefix, cudaStream_t stream);
template <typename IdType>
void NCCLCommunicator::AllToAll(
const IdType* const send, IdType* const recv, const int64_t count,
cudaStream_t stream) {
#ifdef DGL_USE_NCCL
const ncclDataType_t type = NCCLType<IdType>();
NCCL_CALL(ncclGroupStart());
for (int r = 0; r < size_; ++r) {
NCCL_CALL(ncclSend(send + (r * count), count, type, r, comm_, stream));
NCCL_CALL(ncclRecv(recv + (r * count), count, type, r, comm_, stream));
}
NCCL_CALL(ncclGroupEnd());
#else
int dev_id;
CUDA_CALL(cudaGetDevice(&dev_id));
DGLContext ctx{kDGLCUDA, dev_id};
auto device = runtime::DeviceAPI::Get(ctx);
auto dtype = DGLDataTypeTraits<IdType>::dtype;
// copy using the same stream (local current stream), no need to sync
device->CopyDataFromTo(send, 0, recv, 0, count, ctx, ctx, dtype);
#endif
}
template void NCCLCommunicator::AllToAll<int32_t>(
const int32_t* const send, int32_t* const recv, const int64_t count,
cudaStream_t stream);
template void NCCLCommunicator::AllToAll<int64_t>(
const int64_t* const send, int64_t* const recv, const int64_t count,
cudaStream_t stream);
template <typename IdType, typename DType>
void NCCLCommunicator::SparseAllToAll(
const IdType* const send_idx, const DType* const send_value,
const int64_t num_feat, const int64_t* const send_prefix,
IdType* const recv_idx, DType* const recv_value,
const int64_t* const recv_prefix, cudaStream_t stream) {
// idxs
AllToAllV(send_idx, send_prefix, recv_idx, recv_prefix, stream);
// scale prefixes by number of features
std::vector<int64_t> value_send_prefix(size_ + 1);
for (int r = 0; r < size_ + 1; ++r) {
value_send_prefix[r] = send_prefix[r] * num_feat;
}
std::vector<int64_t> value_recv_prefix(size_ + 1);
for (int r = 0; r < size_ + 1; ++r) {
value_recv_prefix[r] = recv_prefix[r] * num_feat;
}
AllToAllV(
send_value, value_send_prefix.data(), recv_value,
value_recv_prefix.data(), stream);
}
template void NCCLCommunicator::SparseAllToAll<int32_t, __half>(
const int32_t* const send_idx, const __half* const send_value,
const int64_t num_feat, const int64_t* const send_prefix,
int32_t* const recv_idx, __half* const recv_value,
const int64_t* const recv_prefix, cudaStream_t stream);
template void NCCLCommunicator::SparseAllToAll<int64_t, __half>(
const int64_t* const send_idx, const __half* const send_value,
const int64_t num_feat, const int64_t* const send_prefix,
int64_t* const recv_idx, __half* const recv_value,
const int64_t* const recv_prefix, cudaStream_t stream);
int NCCLCommunicator::size() const { return size_; }
int NCCLCommunicator::rank() const { return rank_; }
/* CAPI **********************************************************************/
DGL_REGISTER_GLOBAL("cuda.nccl._CAPI_DGLNCCLGetUniqueId")
.set_body([](DGLArgs args, DGLRetValue* rv) {
*rv = NCCLUniqueIdRef(std::make_shared<NCCLUniqueId>());
});
DGL_REGISTER_GLOBAL("cuda.nccl._CAPI_DGLNCCLUniqueIdToString")
.set_body([](DGLArgs args, DGLRetValue* rv) {
NCCLUniqueIdRef idObj = args[0];
*rv = idObj->ToString();
});
DGL_REGISTER_GLOBAL("cuda.nccl._CAPI_DGLNCCLUniqueIdFromString")
.set_body([](DGLArgs args, DGLRetValue* rv) {
const std::string str = args[0];
NCCLUniqueIdRef ref(std::make_shared<NCCLUniqueId>());
ref->FromString(str);
*rv = ref;
});
DGL_REGISTER_GLOBAL("cuda.nccl._CAPI_DGLNCCLCreateComm")
.set_body([](DGLArgs args, DGLRetValue* rv) {
const int size = args[0];
const int rank = args[1];
NCCLUniqueIdRef idObj = args[2];
*rv = NCCLCommunicatorRef(
std::make_shared<NCCLCommunicator>(size, rank, idObj->Get()));
});
DGL_REGISTER_GLOBAL("cuda.nccl._CAPI_DGLNCCLSparseAllToAllPush")
.set_body([](DGLArgs args, DGLRetValue* rv) {
NCCLCommunicatorRef comm = args[0];
IdArray in_idx = args[1];
NDArray in_values = args[2];
NDArrayPartitionRef part = args[3];
List<ObjectRef> ret;
ATEN_ID_TYPE_SWITCH(in_idx->dtype, IdType, {
ATEN_DTYPE_SWITCH(in_values->dtype, DType, "values", {
auto result =
SparsePush<IdType, DType>(comm, in_idx, in_values, part);
ret.push_back(Value(MakeValue(result.first)));
ret.push_back(Value(MakeValue(result.second)));
});
});
*rv = ret;
});
DGL_REGISTER_GLOBAL("cuda.nccl._CAPI_DGLNCCLSparseAllToAllPull")
.set_body([](DGLArgs args, DGLRetValue* rv) {
NCCLCommunicatorRef comm = args[0];
// the indexes this process is requesting from others
IdArray req_idx = args[1];
// the tensor this process has to fulfill other requests
NDArray tensor = args[2];
NDArrayPartitionRef part = args[3];
ATEN_ID_TYPE_SWITCH(req_idx->dtype, IdType, {
ATEN_DTYPE_SWITCH(tensor->dtype, DType, "values", {
*rv = SparsePull<IdType, DType>(comm, req_idx, tensor, part);
});
});
});
DGL_REGISTER_GLOBAL("cuda.nccl._CAPI_DGLNCCLHasSupport")
.set_body([](DGLArgs args, DGLRetValue* rv) {
#ifndef DGL_USE_NCCL
return false;
#else
return true;
#endif
});
} // namespace cuda
} // namespace runtime
} // namespace dgl
/**
* Copyright (c) 2021-2022 by Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* @file nccl_api.h
* @brief Wrapper around NCCL routines.
*/
#ifndef DGL_RUNTIME_CUDA_NCCL_API_H_
#define DGL_RUNTIME_CUDA_NCCL_API_H_
#ifdef DGL_USE_NCCL
#include "nccl.h"
#else
// if not compiling with NCCL, this class will only support communicators of
// size 1.
#define NCCL_UNIQUE_ID_BYTES 128
typedef struct {
char internal[NCCL_UNIQUE_ID_BYTES];
} ncclUniqueId;
typedef int ncclComm_t;
#endif
#include <dgl/runtime/object.h>
#include <string>
namespace dgl {
namespace runtime {
namespace cuda {
class NCCLUniqueId : public runtime::Object {
public:
NCCLUniqueId();
static constexpr const char* _type_key = "cuda.NCCLUniqueId";
DGL_DECLARE_OBJECT_TYPE_INFO(NCCLUniqueId, Object);
ncclUniqueId Get() const;
std::string ToString() const;
void FromString(const std::string& str);
private:
ncclUniqueId id_;
};
DGL_DEFINE_OBJECT_REF(NCCLUniqueIdRef, NCCLUniqueId);
class NCCLCommunicator : public runtime::Object {
public:
NCCLCommunicator(int size, int rank, ncclUniqueId id);
~NCCLCommunicator();
// disable copying
NCCLCommunicator(const NCCLCommunicator& other) = delete;
NCCLCommunicator& operator=(const NCCLCommunicator& other) = delete;
ncclComm_t Get();
/**
* @brief Perform an all-to-all communication.
*
* @param send The contiguous array of data to send.
* @param recv The contiguous array of data to receive.
* @param count The size of data to send to each rank.
* @param stream The stream to operate on.
*/
template <typename IdType>
void AllToAll(
const IdType* send, IdType* recv, int64_t count, cudaStream_t stream);
/**
* @brief Perform an all-to-all variable sized communication.
*
* @tparam DType The type of value to send.
* @param send The arrays of data to send.
* @param send_prefix The prefix of each array to send.
* @param recv The arrays of data to receive.
* @param recv_prefix The prefix of each array to receive.
* @param type The type of data to send.
* @param stream The stream to operate on.
*/
template <typename DType>
void AllToAllV(
const DType* const send, const int64_t* send_prefix, DType* const recv,
const int64_t* recv_prefix, cudaStream_t stream);
/**
* @brief Perform an all-to-all with sparse data (idx and value pairs). By
* necessity, the sizes of each message are variable.
*
* @tparam IdType The type of index.
* @tparam DType The type of value.
* @param send_idx The set of indexes to send on the device.
* @param send_value The set of values to send on the device.
* @param num_feat The number of values per index.
* @param send_prefix The exclusive prefix sum of elements to send on the
* host.
* @param recv_idx The set of indexes to receive on the device.
* @param recv_value The set of values to receive on the device.
* @param recv_prefix The exclusive prefix sum of the number of elements to
* receive on the host.
* @param stream The stream to communicate on.
*/
template <typename IdType, typename DType>
void SparseAllToAll(
const IdType* send_idx, const DType* send_value, const int64_t num_feat,
const int64_t* send_prefix, IdType* recv_idx, DType* recv_value,
const int64_t* recv_prefix, cudaStream_t stream);
int size() const;
int rank() const;
static constexpr const char* _type_key = "cuda.NCCLCommunicator";
DGL_DECLARE_OBJECT_TYPE_INFO(NCCLCommunicator, Object);
private:
ncclComm_t comm_;
int size_;
int rank_;
};
DGL_DEFINE_OBJECT_REF(NCCLCommunicatorRef, NCCLCommunicator);
} // namespace cuda
} // namespace runtime
} // namespace dgl
#endif // DGL_RUNTIME_CUDA_NCCL_API_H_
......@@ -24,11 +24,13 @@ def test_get_node_partition_from_book(idtype):
assert partition.num_parts() == 3
assert partition.array_size() == 11
# Test map_to_local
test_ids = F.copy_to(F.tensor([0, 2, 6, 7, 10], dtype=idtype), F.ctx())
act_ids = partition.map_to_local(test_ids)
exp_ids = F.copy_to(F.tensor([0, 2, 0, 1, 4], dtype=idtype), F.ctx())
assert F.array_equal(act_ids, exp_ids)
# Test map_to_global
test_ids = F.copy_to(F.tensor([0, 2], dtype=idtype), F.ctx())
act_ids = partition.map_to_global(test_ids, 0)
exp_ids = F.copy_to(F.tensor([0, 2], dtype=idtype), F.ctx())
......@@ -43,3 +45,11 @@ def test_get_node_partition_from_book(idtype):
act_ids = partition.map_to_global(test_ids, 2)
exp_ids = F.copy_to(F.tensor([6, 7, 10], dtype=idtype), F.ctx())
assert F.array_equal(act_ids, exp_ids)
# Test generate_permutation
test_ids = F.copy_to(F.tensor([6, 0, 7, 2, 10], dtype=idtype), F.ctx())
perm, split_sum = partition.generate_permutation(test_ids)
exp_perm = F.copy_to(F.tensor([1, 3, 0, 2, 4], dtype=idtype), F.ctx())
exp_sum = F.copy_to(F.tensor([2, 0, 3]), F.ctx())
assert F.array_equal(perm, exp_perm)
assert F.array_equal(split_sum, exp_sum)
import unittest
import backend as F
import torch
import torch.distributed as dist
from dgl.cuda import nccl
from dgl.partition import NDArrayPartition
def gen_test_id():
return "{:0256x}".format(78236728318467363)
@unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU."
)
def test_nccl_id():
nccl_id = nccl.UniqueId()
text = str(nccl_id)
nccl_id2 = nccl.UniqueId(id_str=text)
assert nccl_id == nccl_id2
nccl_id2 = nccl.UniqueId(gen_test_id())
assert nccl_id2 != nccl_id
nccl_id3 = nccl.UniqueId(str(nccl_id2))
assert nccl_id2 == nccl_id3
@unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU."
)
def test_nccl_sparse_push_single_remainder():
nccl_id = nccl.UniqueId()
comm = nccl.Communicator(1, 0, nccl_id)
torch.cuda.set_device("cuda:0")
dist.init_process_group(
backend="nccl",
init_method="tcp://127.0.0.1:12345",
world_size=1,
rank=0,
)
index = F.randint([10000], F.int32, F.ctx(), 0, 10000)
value = F.uniform([10000, 100], F.float32, F.ctx(), -1.0, 1.0)
part = NDArrayPartition(10000, 1, "remainder")
ri, rv = comm.sparse_all_to_all_push(index, value, part)
ri, rv = nccl.sparse_all_to_all_push(index, value, part)
assert F.array_equal(ri, index)
assert F.array_equal(rv, value)
dist.destroy_process_group()
@unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU."
)
def test_nccl_sparse_pull_single_remainder():
nccl_id = nccl.UniqueId()
comm = nccl.Communicator(1, 0, nccl_id)
torch.cuda.set_device("cuda:0")
dist.init_process_group(
backend="nccl",
init_method="tcp://127.0.0.1:12345",
world_size=1,
rank=0,
)
req_index = F.randint([10000], F.int64, F.ctx(), 0, 100000)
value = F.uniform([100000, 100], F.float32, F.ctx(), -1.0, 1.0)
part = NDArrayPartition(100000, 1, "remainder")
rv = comm.sparse_all_to_all_pull(req_index, value, part)
rv = nccl.sparse_all_to_all_pull(req_index, value, part)
exp_rv = F.gather_row(value, req_index)
assert F.array_equal(rv, exp_rv)
dist.destroy_process_group()
@unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU."
)
def test_nccl_sparse_push_single_range():
nccl_id = nccl.UniqueId()
comm = nccl.Communicator(1, 0, nccl_id)
torch.cuda.set_device("cuda:0")
dist.init_process_group(
backend="nccl",
init_method="tcp://127.0.0.1:12345",
world_size=1,
rank=0,
)
index = F.randint([10000], F.int32, F.ctx(), 0, 10000)
value = F.uniform([10000, 100], F.float32, F.ctx(), -1.0, 1.0)
......@@ -78,17 +76,24 @@ def test_nccl_sparse_push_single_range():
)
part = NDArrayPartition(10000, 1, "range", part_ranges=part_ranges)
ri, rv = comm.sparse_all_to_all_push(index, value, part)
ri, rv = nccl.sparse_all_to_all_push(index, value, part)
assert F.array_equal(ri, index)
assert F.array_equal(rv, value)
dist.destroy_process_group()
@unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU."
)
def test_nccl_sparse_pull_single_range():
nccl_id = nccl.UniqueId()
comm = nccl.Communicator(1, 0, nccl_id)
torch.cuda.set_device("cuda:0")
dist.init_process_group(
backend="nccl",
init_method="tcp://127.0.0.1:12345",
world_size=1,
rank=0,
)
req_index = F.randint([10000], F.int64, F.ctx(), 0, 100000)
value = F.uniform([100000, 100], F.float32, F.ctx(), -1.0, 1.0)
......@@ -98,21 +103,15 @@ def test_nccl_sparse_pull_single_range():
)
part = NDArrayPartition(100000, 1, "range", part_ranges=part_ranges)
rv = comm.sparse_all_to_all_pull(req_index, value, part)
rv = nccl.sparse_all_to_all_pull(req_index, value, part)
exp_rv = F.gather_row(value, req_index)
assert F.array_equal(rv, exp_rv)
@unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU."
)
def test_nccl_support():
# this is just a smoke test, as we don't have any other way to know
# if NCCL support is compiled in right now.
nccl.is_supported()
dist.destroy_process_group()
if __name__ == "__main__":
test_nccl_id()
test_nccl_sparse_push_single()
test_nccl_sparse_pull_single()
test_nccl_sparse_push_single_remainder()
test_nccl_sparse_pull_single_remainder()
test_nccl_sparse_push_single_range()
test_nccl_sparse_pull_single_range()
......@@ -28,7 +28,7 @@ if [[ $arch == *"x86"* ]]; then
fi
if [[ $1 != "cpu" ]]; then
CMAKE_VARS="-DUSE_CUDA=ON -DUSE_NCCL=ON $CMAKE_VARS"
CMAKE_VARS="-DUSE_CUDA=ON $CMAKE_VARS"
fi
if [ -d build ]; then
......
Subproject commit e11238b3029795d33f958b5868d47c90c4f22628