Unverified Commit 8d5d8962 authored by Xin Yao, committed by GitHub

[Refactor] Replace third_party/nccl with PyTorch's NCCL backend (#4989)

* expose GeneratePermutation

* add sparse_all_to_all_push

* add sparse_all_to_all_pull

* add unit test

* handle world_size=1

* remove python nccl wrapper

* remove the nccl dependency

* use pinned memory to speed up D2H copy (see the sketch after this list)

* fix lint

* resolve comments

* fix lint

* fix ut

* resolve comments
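
For context, the new implementation replaces the custom NCCL communicator with `torch.distributed` collectives. A minimal sketch of the underlying pattern — exchange per-rank split sizes first, then issue an uneven `all_to_all_single` — is shown below (illustrative only; `uneven_all_to_all` and its tensor names are not part of this change, and an already-initialized NCCL process group is assumed):

```python
import torch
import torch.distributed as dist

def uneven_all_to_all(send, send_splits):
    """Exchange a 1D CUDA tensor whose per-rank chunk sizes differ across ranks."""
    # 1. Fixed-size exchange: tell every rank how many elements to expect from us.
    recv_splits = torch.empty_like(send_splits)
    dist.all_to_all_single(recv_splits, send_splits)

    # 2. Move both split tensors to the host; all_to_all_single expects Python
    #    lists of sizes for the variable-sized exchange.
    recv_splits_cpu = recv_splits.to("cpu", non_blocking=True)
    send_splits_cpu = send_splits.to("cpu", non_blocking=True)
    torch.cuda.current_stream().synchronize()  # wait for the D2H copies

    # 3. Variable-sized exchange of the payload itself.
    recv = torch.empty(
        int(recv_splits_cpu.sum()), dtype=send.dtype, device=send.device
    )
    dist.all_to_all_single(
        recv, send, recv_splits_cpu.tolist(), send_splits_cpu.tolist()
    )
    return recv
```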
parent b1ec112e
...@@ -22,9 +22,6 @@ ...@@ -22,9 +22,6 @@
[submodule "third_party/nanoflann"] [submodule "third_party/nanoflann"]
path = third_party/nanoflann path = third_party/nanoflann
url = https://github.com/jlblancoc/nanoflann url = https://github.com/jlblancoc/nanoflann
[submodule "third_party/nccl"]
path = third_party/nccl
url = https://github.com/nvidia/nccl
[submodule "third_party/libxsmm"] [submodule "third_party/libxsmm"]
path = third_party/libxsmm path = third_party/libxsmm
url = https://github.com/hfp/libxsmm.git url = https://github.com/hfp/libxsmm.git
......
...@@ -23,8 +23,6 @@ endif() ...@@ -23,8 +23,6 @@ endif()
# and add set(OPTION VALUE) to override these build options. # and add set(OPTION VALUE) to override these build options.
# Alternatively, use cmake -DOPTION=VALUE through command-line. # Alternatively, use cmake -DOPTION=VALUE through command-line.
dgl_option(USE_CUDA "Build with CUDA" OFF) dgl_option(USE_CUDA "Build with CUDA" OFF)
dgl_option(USE_NCCL "Build with NCCL support" OFF)
dgl_option(USE_SYSTEM_NCCL "Build using system's NCCL library" OFF)
dgl_option(USE_OPENMP "Build with OpenMP" ON) dgl_option(USE_OPENMP "Build with OpenMP" ON)
dgl_option(USE_AVX "Build with AVX optimization" OFF) dgl_option(USE_AVX "Build with AVX optimization" OFF)
dgl_option(USE_LIBXSMM "Build with LIBXSMM library optimization" ON) dgl_option(USE_LIBXSMM "Build with LIBXSMM library optimization" ON)
...@@ -171,25 +169,7 @@ list(APPEND DGL_SRC ${DGL_RPC_SRC}) ...@@ -171,25 +169,7 @@ list(APPEND DGL_SRC ${DGL_RPC_SRC})
if(USE_CUDA) if(USE_CUDA)
dgl_config_cuda(DGL_CUDA_SRC) dgl_config_cuda(DGL_CUDA_SRC)
list(APPEND DGL_SRC ${DGL_CUDA_SRC}) list(APPEND DGL_SRC ${DGL_CUDA_SRC})
if(USE_NCCL)
add_definitions(-DDGL_USE_NCCL)
if (USE_SYSTEM_NCCL)
include(cmake/util/FindNccl.cmake)
include_directories(${NCCL_INCLUDE_DIR})
else()
include(cmake/modules/NCCL.cmake)
cuda_include_directories(BEFORE ${NCCL_INCLUDE_DIR})
endif()
endif(USE_NCCL)
list(APPEND DGL_LINKER_LIBS ${NCCL_LIBRARY})
endif(USE_CUDA)
if(USE_CUDA)
cuda_add_library(dgl SHARED ${DGL_SRC}) cuda_add_library(dgl SHARED ${DGL_SRC})
if (USE_NCCL AND NOT USE_SYSTEM_NCCL)
add_dependencies(dgl nccl_external)
endif()
else(USE_CUDA) else(USE_CUDA)
add_library(dgl SHARED ${DGL_SRC}) add_library(dgl SHARED ${DGL_SRC})
endif(USE_CUDA) endif(USE_CUDA)
......
include(ExternalProject)
# set path to submodule
set(NCCL_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/nccl")
# NCCL doesn't have CMAKE, so build externally
ExternalProject_Add(nccl_external
SOURCE_DIR ${PROJECT_SOURCE_DIR}/third_party/nccl
BUILD_IN_SOURCE 1
CONFIGURE_COMMAND ""
BUILD_COMMAND
env
make
"src.build"
"-j"
"BUILDDIR=${NCCL_BUILD_DIR}"
BUILD_BYPRODUCTS "${NCCL_BUILD_DIR}/lib/libnccl_static.a"
INSTALL_COMMAND ""
)
# set output variables
set(NCCL_FOUND TRUE)
set(NCCL_LIBRARY "${NCCL_BUILD_DIR}/lib/libnccl_static.a")
set(NCCL_INCLUDE_DIR "${NCCL_BUILD_DIR}/include")
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Tries to find NCCL headers and libraries.
#
# Usage of this module as follows:
#
# find_package(NCCL)
#
# Variables used by this module, they can change the default behaviour and need
# to be set before calling find_package:
#
# NCCL_ROOT - When set, this path is inspected instead of standard library
# locations as the root of the NCCL installation.
# The environment variable NCCL_ROOT overrides this variable.
#
# This module defines
# Nccl_FOUND, whether nccl has been found
# NCCL_INCLUDE_DIR, directory containing header
# NCCL_LIBRARY, directory containing nccl library
# NCCL_LIB_NAME, nccl library name
# USE_NCCL_LIB_PATH, when set, NCCL_LIBRARY path is also inspected for the
# location of the nccl library. This would disable
# switching between static and shared.
#
# This module assumes that the user has already called find_package(CUDA)
#
# This file is from https://github.com/dmlc/xgboost, with modifications to
# check the version.
if (NCCL_LIBRARY)
if(NOT USE_NCCL_LIB_PATH)
# Don't cache NCCL_LIBRARY to enable switching between static and shared.
unset(NCCL_LIBRARY CACHE)
endif(NOT USE_NCCL_LIB_PATH)
endif()
if (BUILD_WITH_SHARED_NCCL)
# libnccl.so
set(NCCL_LIB_NAME nccl)
else ()
# libnccl_static.a
set(NCCL_LIB_NAME nccl_static)
endif (BUILD_WITH_SHARED_NCCL)
find_path(NCCL_INCLUDE_DIR
NAMES nccl.h
PATHS $ENV{NCCL_ROOT}/include ${NCCL_ROOT}/include)
# make sure it has point-to-point support
file(STRINGS "${NCCL_INCLUDE_DIR}/nccl.h" NCCL_VERSION_CODE REGEX "^#define[ \t]+NCCL_VERSION_CODE[ \t]+[0-9]+.*$" LIMIT_COUNT 1)
string(REGEX REPLACE "^.*NCCL_VERSION_CODE[ \t]+([0-9]+).*$" "\\1" NCCL_VERSION "${NCCL_VERSION_CODE}")
find_library(NCCL_LIBRARY
NAMES ${NCCL_LIB_NAME}
PATHS $ENV{NCCL_ROOT}/lib/ ${NCCL_ROOT}/lib)
if ("${NCCL_VERSION}" LESS "2700")
message(FATAL_ERROR "Require nccl >= 2700, but found ${NCCL_LIBRARY}==${NCCL_VERSION}")
else()
message(STATUS "Using nccl library: ${NCCL_LIBRARY} ${NCCL_VERSION}")
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Nccl DEFAULT_MSG
NCCL_INCLUDE_DIR NCCL_LIBRARY)
mark_as_advanced(
NCCL_INCLUDE_DIR
NCCL_LIBRARY
)
""" CUDA wrappers """ """ CUDA wrappers """
from . import nccl from .. import backend as F
if F.get_preferred_backend() == "pytorch":
from . import nccl
"""API creating NCCL communicators.""" """API wrapping NCCL primitives."""
from .. import backend as F import torch
from .._ffi.function import _init_api import torch.distributed as dist
_COMM_MODES_MAP = {"remainder": 0}
def sparse_all_to_all_push(idx, value, partition):
class UniqueId(object):
"""Class for allowing python code to create and communicate NCCL Unique
IDs, needed for creating communicators.
"""
def __init__(self, id_str=None):
"""Create an object reference the current NCCL unique id."""
if id_str:
if isinstance(id_str, bytes):
id_str = id_str.decode("utf-8")
self._handle = _CAPI_DGLNCCLUniqueIdFromString(id_str)
else:
self._handle = _CAPI_DGLNCCLGetUniqueId()
def get(self):
"""Get the C-handle for this object."""
return self._handle
def __str__(self):
return _CAPI_DGLNCCLUniqueIdToString(self._handle)
def __repr__(self):
return "UniqueId[{}]".format(str(self))
def __eq__(self, other):
return str(self) == str(other)
class Communicator(object):
"""High-level wrapper for NCCL communication."""
def __init__(self, size, rank, unique_id):
"""Create a new NCCL communicator.
Parameters
----------
size : int
The number of processes in the communicator.
rank : int
The rank of the current process in the communicator.
unique_id : NCCLUniqueId
The unique id of the root process (rank=0).
Examples
--------
>>> from dgl.cuda.nccl import Communicator, UniqueId
The root process will generate a unique NCCL id and communicate it
to the other processes.
>>> uid = UniqueId()
>>> store.set('nccl_root_id', str(uid))
And all other processes create unique ids from the root process.
>>> uid = UniqueId(store.get('nccl_root_id'))
Then, all processes should create the communicator.
>>> comm = Communicator(world_size, rank, uid)
"""
assert rank < size, (
"The rank of a process must be less than the "
"size of the communicator."
)
self._handle = _CAPI_DGLNCCLCreateComm(size, rank, unique_id.get())
self._rank = rank
self._size = size
def sparse_all_to_all_push(self, idx, value, partition):
"""Perform an all-to-all-v operation, where by all processors send out """Perform an all-to-all-v operation, where by all processors send out
a set of indices and corresponding values. Indices and values, a set of indices and corresponding values. Indices and values,
corresponding to the current process, will copied into the output corresponding to the current process, will copied into the output
arrays. arrays.
Note: This function requires 'torch.distributed.get_backend() == "nccl"'.
Parameters Parameters
---------- ----------
idx : tensor idx : torch.Tensor
The 1D set of indices to send to other processors. The 1D set of indices to send to other processors.
value : tensor value : torch.Tensor
The multi-dimension set of values to send to other processors. The multi-dimension set of values to send to other processors.
The first dimension must match that of `idx`. The first dimension must match that of `idx`.
partition : NDArrayPartition partition : NDArrayPartition
...@@ -95,9 +25,9 @@ class Communicator(object): ...@@ -95,9 +25,9 @@ class Communicator(object):
Returns Returns
------- -------
tensor torch.Tensor
The 1D tensor of the received indices. The 1D tensor of the received indices.
tensor torch.Tensor
The set of received values. The set of received values.
Examples Examples
...@@ -108,7 +38,7 @@ class Communicator(object): ...@@ -108,7 +38,7 @@ class Communicator(object):
striped across processes can be generated via: striped across processes can be generated via:
>>> from dgl.partition import NDArrayPartition >>> from dgl.partition import NDArrayPartition
>>> part = NDArrayPartition(g.num_nodes(), comm.size(), mode='remainder' ) >>> part = NDArrayPartition(g.num_nodes(), world_size, mode='remainder')
With this partition, each processor can send values to be associated With this partition, each processor can send values to be associated
with vertices in the graph. So if we have an array `global_idxs` of all of with vertices in the graph. So if we have an array `global_idxs` of all of
...@@ -116,7 +46,7 @@ class Communicator(object): ...@@ -116,7 +46,7 @@ class Communicator(object):
`global_values` containing the new values associated with the neighbors, `global_values` containing the new values associated with the neighbors,
we communicate them to the owning processes via: we communicate them to the owning processes via:
>>> my_idxs, my_values = comm.sparse_all_to_all_push(global_idxs, global_values, part) >>> my_idxs, my_values = nccl.sparse_all_to_all_push(global_idxs, global_values, part)
This communication pattern is common when communicating gradient This communication pattern is common when communicating gradient
updates for node embeddings. updates for node embeddings.
...@@ -128,26 +58,54 @@ class Communicator(object): ...@@ -128,26 +58,54 @@ class Communicator(object):
indices for process 0 of '[0, 8, 10, 0, 2, 4, 8, 8]', and for indices for process 0 of '[0, 8, 10, 0, 2, 4, 8, 8]', and for
process 1 of '[3, 9, 5, 9]'. process 1 of '[3, 9, 5, 9]'.
""" """
out_idx, out_value = _CAPI_DGLNCCLSparseAllToAllPush( if not dist.is_initialized() or dist.get_world_size() == 1:
self.get(), return idx, value
F.zerocopy_to_dgl_ndarray(idx), assert (
F.zerocopy_to_dgl_ndarray(value), dist.get_backend() == "nccl"
partition.get(), ), "requires NCCL backend to communicate CUDA tensors."
)
return ( perm, send_splits = partition.generate_permutation(idx)
F.zerocopy_from_dgl_ndarray(out_idx), perm = perm.long()
F.zerocopy_from_dgl_ndarray(out_value),
# Get receive splits.
recv_splits = torch.empty_like(send_splits)
dist.all_to_all_single(recv_splits, send_splits)
# Use pinned memory to speed up D2H copy.
recv_splits = recv_splits.to("cpu", non_blocking=True)
send_splits = send_splits.to("cpu", non_blocking=True)
send_idx = idx[perm]
send_value = value[perm]
# Wait for the D2H copy to finish.
torch.cuda.current_stream().synchronize()
recv_sum = recv_splits.sum()
recv_splits = recv_splits.tolist()
send_splits = send_splits.tolist()
# Send idx.
recv_idx = torch.empty((recv_sum,), dtype=idx.dtype, device=idx.device)
dist.all_to_all_single(recv_idx, send_idx, recv_splits, send_splits)
# Send value.
recv_value = torch.empty(
(recv_sum, *value.shape[1:]), dtype=value.dtype, device=value.device
) )
dist.all_to_all_single(recv_value, send_value, recv_splits, send_splits)
return recv_idx, recv_value
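
A usage sketch of the new module-level function under `torch.distributed` (hypothetical two-GPU `torchrun` launch; the script name, tensor sizes, and `num_nodes` are illustrative):

```python
# Hypothetical launch: torchrun --nproc_per_node=2 push_example.py
import torch
import torch.distributed as dist
from dgl.cuda import nccl
from dgl.partition import NDArrayPartition

dist.init_process_group(backend="nccl")
rank = dist.get_rank()
torch.cuda.set_device(rank)  # single-node launch assumed, so rank == local rank

num_nodes = 10
part = NDArrayPartition(num_nodes, dist.get_world_size(), mode="remainder")

# Each rank pushes (index, gradient) pairs for arbitrary global node ids; after
# the call it holds only the pairs that fall into its own partition.
idx = torch.randint(0, num_nodes, (6,), device="cuda")
grad = torch.randn(6, 4, device="cuda")
my_idx, my_grad = nccl.sparse_all_to_all_push(idx, grad, part)

dist.destroy_process_group()
```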
def sparse_all_to_all_pull(self, req_idx, value, partition): def sparse_all_to_all_pull(req_idx, value, partition):
"""Perform an all-to-all-v operation, where by all processors request """Perform an all-to-all-v operation, where by all processors request
the values corresponding to their set of indices. the values corresponding to their set of indices.
Note: This function requires 'torch.distributed.get_backend() == "nccl"'.
Parameters Parameters
---------- ----------
req_idx : IdArray req_idx : torch.Tensor
The set of indices this processor is requesting. The set of indices this processor is requesting.
value : NDArray value : torch.Tensor
The multi-dimension set of values that can be requested from The multi-dimension set of values that can be requested from
this processor. this processor.
partition : NDArrayPartition partition : NDArrayPartition
...@@ -156,7 +114,7 @@ class Communicator(object): ...@@ -156,7 +114,7 @@ class Communicator(object):
Returns Returns
------- -------
tensor torch.Tensor
The set of received values, corresponding to `req_idx`. The set of received values, corresponding to `req_idx`.
Examples Examples
...@@ -167,7 +125,7 @@ class Communicator(object): ...@@ -167,7 +125,7 @@ class Communicator(object):
striped across processes can be generated via: striped across processes can be generated via:
>>> from dgl.partition import NDArrayPartition >>> from dgl.partition import NDArrayPartition
>>> part = NDArrayPartition(g.num_nodes(), comm.size(), mode='remainder' ) >>> part = NDArrayPartition(g.num_nodes(), world_size, mode='remainder')
With this partition, each processor can request values/features With this partition, each processor can request values/features
associated with vertices in the graph. So in the case where we have associated with vertices in the graph. So in the case where we have
...@@ -175,7 +133,7 @@ class Communicator(object): ...@@ -175,7 +133,7 @@ class Communicator(object):
has a tensor 'node_feat' storing the features of nodes it owns in has a tensor 'node_feat' storing the features of nodes it owns in
the partition, the features can be requested via: the partition, the features can be requested via:
>>> nbr_values = comm.sparse_all_to_all_pull(nbr_idxs, node_feat, part) >>> nbr_values = nccl.sparse_all_to_all_pull(nbr_idxs, node_feat, part)
Then the two arrays 'nbr_idxs' and 'nbr_values' form the sparse Then the two arrays 'nbr_idxs' and 'nbr_values' form the sparse
set of features, where 'nbr_idxs[i]' is the global node id, and set of features, where 'nbr_idxs[i]' is the global node id, and
...@@ -183,48 +141,49 @@ class Communicator(object): ...@@ -183,48 +141,49 @@ class Communicator(object):
communication pattern is useful for node features or node communication pattern is useful for node features or node
embeddings. embeddings.
""" """
out_value = _CAPI_DGLNCCLSparseAllToAllPull( if not dist.is_initialized() or dist.get_world_size() == 1:
self.get(), return value[req_idx.long()]
F.zerocopy_to_dgl_ndarray(req_idx), assert (
F.zerocopy_to_dgl_ndarray(value), dist.get_backend() == "nccl"
partition.get(), ), "requires NCCL backend to communicate CUDA tensors."
perm, req_splits = partition.generate_permutation(req_idx)
perm = perm.long()
# Get response splits.
resp_splits = torch.empty_like(req_splits)
dist.all_to_all_single(resp_splits, req_splits)
# Use pinned memory to speed up D2H copy.
resp_splits = resp_splits.to("cpu", non_blocking=True)
req_splits = req_splits.to("cpu", non_blocking=True)
req_idx = req_idx[perm]
# Wait for the D2H copy to finish.
torch.cuda.current_stream().synchronize()
resp_sum = resp_splits.sum()
resp_splits = resp_splits.tolist()
req_splits = req_splits.tolist()
# Gather requested indices.
resp_idx = torch.empty(
(resp_sum,), dtype=req_idx.dtype, device=req_idx.device
) )
return F.zerocopy_from_dgl_ndarray(out_value) dist.all_to_all_single(resp_idx, req_idx, resp_splits, req_splits)
def get(self):
"""Get the C-Handle for this object."""
return self._handle
def rank(self):
"""Get the rank of this process in this communicator.
Returns # Convert requested indices to local indices depending on partition.
------- if resp_sum > 0:
int resp_idx = partition.map_to_local(resp_idx)
The rank of this process.
"""
return self._rank
def size(self): # Collect the request value.
"""Get the size of this communicator. req_value = torch.empty(
(req_idx.size(0), *value.shape[1:]),
Returns dtype=value.dtype,
------- device=value.device,
int )
The number of processes in this communicator. dist.all_to_all_single(req_value, value[resp_idx], req_splits, resp_splits)
"""
return self._size
def is_supported():
"""Check if DGL was built with NCCL support.
Returns
-------
bool
True if NCCL support was built in.
"""
return _CAPI_DGLNCCLHasSupport()
# Permute the value back into the requested order.
return_value = torch.empty_like(req_value)
return_value[perm] = req_value
_init_api("dgl.cuda.nccl") return return_value
...@@ -9,7 +9,6 @@ from ...partition import NDArrayPartition ...@@ -9,7 +9,6 @@ from ...partition import NDArrayPartition
from ...utils import create_shared_mem_array, get_shared_mem_array from ...utils import create_shared_mem_array, get_shared_mem_array
_STORE = None _STORE = None
_COMM = None
class NodeEmbedding: # NodeEmbedding class NodeEmbedding: # NodeEmbedding
...@@ -78,7 +77,6 @@ class NodeEmbedding: # NodeEmbedding ...@@ -78,7 +77,6 @@ class NodeEmbedding: # NodeEmbedding
partition=None, partition=None,
): ):
global _STORE global _STORE
global _COMM
if device is None: if device is None:
device = th.device("cpu") device = th.device("cpu")
...@@ -132,25 +130,7 @@ class NodeEmbedding: # NodeEmbedding ...@@ -132,25 +130,7 @@ class NodeEmbedding: # NodeEmbedding
) )
self._tensor = emb self._tensor = emb
else: # embeddings is stored in GPU memory. else: # embeddings is stored in GPU memory.
# setup nccl communicator self._comm = True
if _COMM is None:
if rank < 0:
_COMM = nccl.Communicator(1, 0, nccl.UniqueId())
else:
# needs to be set for nccl to work
th.cuda.set_device(device)
if rank == 0:
# root process broadcasts nccl id
nccl_id = nccl.UniqueId()
self._store.set("nccl_root_id_sparse_emb", str(nccl_id))
else:
nccl_id = nccl.UniqueId(
self._store.get("nccl_root_id_sparse_emb")
)
_COMM = nccl.Communicator(
self._world_size, self._rank, nccl_id
)
self._comm = _COMM
if not self._partition: if not self._partition:
# for communication we need a partition # for communication we need a partition
...@@ -161,7 +141,7 @@ class NodeEmbedding: # NodeEmbedding ...@@ -161,7 +141,7 @@ class NodeEmbedding: # NodeEmbedding
) )
# create local tensors for the weights # create local tensors for the weights
local_size = self._partition.local_size(self._comm.rank()) local_size = self._partition.local_size(max(self._rank, 0))
# TODO(dlasalle): support 16-bit/half embeddings # TODO(dlasalle): support 16-bit/half embeddings
emb = th.empty( emb = th.empty(
...@@ -187,15 +167,15 @@ class NodeEmbedding: # NodeEmbedding ...@@ -187,15 +167,15 @@ class NodeEmbedding: # NodeEmbedding
device : th.device device : th.device
Target device to put the collected embeddings. Target device to put the collected embeddings.
""" """
if not self._comm or self._comm.size() == 1: if not self._comm:
# embeddings are stored on the CPU
emb = self._tensor[node_ids].to(device) emb = self._tensor[node_ids].to(device)
else: else:
if self.world_size > 0: # embeddings are stored on the GPU
emb = self._comm.sparse_all_to_all_pull( # the following method also covers self._world_size = 0 or 1
emb = nccl.sparse_all_to_all_pull(
node_ids, self._tensor, self._partition node_ids, self._tensor, self._partition
) )
else:
emb = self._tensor[node_ids]
emb = emb.to(device) emb = emb.to(device)
if F.is_recording(): if F.is_recording():
emb = F.attach_grad(emb) emb = F.attach_grad(emb)
...@@ -215,18 +195,6 @@ class NodeEmbedding: # NodeEmbedding ...@@ -215,18 +195,6 @@ class NodeEmbedding: # NodeEmbedding
""" """
return self._store return self._store
@property
def comm(self):
"""Return dgl.cuda.nccl.Communicator for data
sharing across processes.
Returns
-------
dgl.cuda.nccl.Communicator
Communicator used for data sharing.
"""
return self._comm
@property @property
def partition(self): def partition(self):
"""Return the partition identifying how the tensor is split across """Return the partition identifying how the tensor is split across
...@@ -361,7 +329,8 @@ class NodeEmbedding: # NodeEmbedding ...@@ -361,7 +329,8 @@ class NodeEmbedding: # NodeEmbedding
if self._partition: if self._partition:
idxs = F.copy_to( idxs = F.copy_to(
self._partition.get_local_indices( self._partition.get_local_indices(
self._comm.rank(), ctx=F.context(self._tensor) max(self._rank, 0),
ctx=F.context(self._tensor),
), ),
F.context(values), F.context(values),
) )
......
...@@ -63,7 +63,6 @@ class SparseGradOptimizer(abc.ABC): ...@@ -63,7 +63,6 @@ class SparseGradOptimizer(abc.ABC):
), "MultiGPU world_size for each embedding should be same." ), "MultiGPU world_size for each embedding should be same."
assert not self._rank is None assert not self._rank is None
assert not self._world_size is None assert not self._world_size is None
self._nccl_root_id = "SparseGradOptimizer.nccl_root_id"
def step(self): def step(self):
"""The step function. """The step function.
...@@ -74,7 +73,7 @@ class SparseGradOptimizer(abc.ABC): ...@@ -74,7 +73,7 @@ class SparseGradOptimizer(abc.ABC):
if self._first_step: if self._first_step:
for emb in self._params: for emb in self._params:
for _, data in emb._trace: for _, data in emb._trace:
if data.grad.data.device.type == "cuda": if data.grad.device.type == "cuda":
# create a communicator # create a communicator
if self._device: if self._device:
assert ( assert (
...@@ -116,27 +115,7 @@ class SparseGradOptimizer(abc.ABC): ...@@ -116,27 +115,7 @@ class SparseGradOptimizer(abc.ABC):
""" """
def _comm_setup(self): def _comm_setup(self):
# find a store to communicate the unique id through self._comm = True
if len(self._params) > 0:
store = self._params[0].store
if self._rank < 0:
self._comm = nccl.Communicator(1, 0, nccl.UniqueId())
else:
th.cuda.set_device(self._device)
if self._rank == 0:
# root process broadcasts nccl id
nccl_id = nccl.UniqueId()
uid = str(nccl_id)
store.set(self._nccl_root_id, uid)
else:
uid = store.get(self._nccl_root_id)
nccl_id = nccl.UniqueId(uid)
# needs to be set for nccl to work
self._comm = nccl.Communicator(
self._world_size, self._rank, nccl_id
)
th.distributed.barrier()
def _shared_setup(self): def _shared_setup(self):
for emb in self._params: for emb in self._params:
...@@ -162,7 +141,6 @@ class SparseGradOptimizer(abc.ABC): ...@@ -162,7 +141,6 @@ class SparseGradOptimizer(abc.ABC):
self._opt_meta[emb_name] = opt_meta self._opt_meta[emb_name] = opt_meta
def _comm_step(self): def _comm_step(self):
comm = self._comm
with th.no_grad(): with th.no_grad():
idx_in = {} idx_in = {}
grad_in = {} grad_in = {}
...@@ -203,7 +181,7 @@ class SparseGradOptimizer(abc.ABC): ...@@ -203,7 +181,7 @@ class SparseGradOptimizer(abc.ABC):
( (
idx_in[emb_name], idx_in[emb_name],
grad_in[emb_name], grad_in[emb_name],
) = comm.sparse_all_to_all_push(idx, grad, partition=partition) ) = nccl.sparse_all_to_all_push(idx, grad, partition=partition)
if emb.partition: if emb.partition:
# if the embedding is partitioned, map back to indexes # if the embedding is partitioned, map back to indexes
# into the local tensor # into the local tensor
......
...@@ -592,5 +592,44 @@ class NDArrayPartition(object): ...@@ -592,5 +592,44 @@ class NDArrayPartition(object):
) )
) )
def generate_permutation(self, idxs):
"""Produce a scheme that maps the given indices to separate partitions
and the counts of how many indices are in each partition.
Parameters
----------
idxs : torch.Tensor
A tensor with shape (`num_indices`,), representing global indices.
Returns
-------
torch.Tensor
A tensor with shape (`num_indices`,), representing the permutation
to re-order the indices by partition.
torch.Tensor
A tensor with shape (`num_partition`,), representing the number of
indices per partition.
Examples
--------
>>> import torch
>>> from dgl.partition import NDArrayPartition
>>> part = NDArrayPartition(10, 2, mode="remainder")
>>> idx = torch.tensor([0, 2, 4, 5, 8, 8, 9], device="cuda:0")
>>> perm, splits_sum = part.generate_permutation(idx)
>>> perm
tensor([0, 1, 2, 4, 5, 3, 6], device='cuda:0')
>>> splits_sum
tensor([5, 2], device='cuda:0')
"""
ret = _CAPI_DGLNDArrayPartitionGeneratePermutation(
self._partition, F.zerocopy_to_dgl_ndarray(idxs)
)
return F.zerocopy_from_dgl_ndarray(ret(0)), F.zerocopy_from_dgl_ndarray(
ret(1)
)
_init_api("dgl.partition") _init_api("dgl.partition")
/** /**
* Copyright (c) 2018 by Contributors * Copyright (c) 2018 by Contributors
* @file c_runtime_api.cc * @file c_api_common.cc
* @brief DGL C API common implementations * @brief DGL C API common implementations
*/ */
#include "c_api_common.h" #include "c_api_common.h"
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#include <memory> #include <memory>
#include <utility> #include <utility>
#include "../c_api_common.h"
#include "partition_op.h" #include "partition_op.h"
using namespace dgl::runtime; using namespace dgl::runtime;
...@@ -251,5 +252,15 @@ DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionMapToGlobal") ...@@ -251,5 +252,15 @@ DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionMapToGlobal")
*rv = part->MapToGlobal(idxs, part_id); *rv = part->MapToGlobal(idxs, part_id);
}); });
DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionGeneratePermutation")
.set_body([](DGLArgs args, DGLRetValue* rv) {
NDArrayPartitionRef part = args[0];
IdArray idxs = args[1];
std::pair<IdArray, NDArray> part_perm = part->GeneratePermutation(idxs);
*rv =
ConvertNDArrayVectorToPackedFunc({part_perm.first, part_perm.second});
});
} // namespace partition } // namespace partition
} // namespace dgl } // namespace dgl
/**
* Copyright (c) 2021-2022 by Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* @file nccl_api.h
* @brief Wrapper around NCCL routines.
*/
#ifndef DGL_RUNTIME_CUDA_NCCL_API_H_
#define DGL_RUNTIME_CUDA_NCCL_API_H_
#ifdef DGL_USE_NCCL
#include "nccl.h"
#else
// if not compiling with NCCL, this class will only support communicators of
// size 1.
#define NCCL_UNIQUE_ID_BYTES 128
typedef struct {
char internal[NCCL_UNIQUE_ID_BYTES];
} ncclUniqueId;
typedef int ncclComm_t;
#endif
#include <dgl/runtime/object.h>
#include <string>
namespace dgl {
namespace runtime {
namespace cuda {
class NCCLUniqueId : public runtime::Object {
public:
NCCLUniqueId();
static constexpr const char* _type_key = "cuda.NCCLUniqueId";
DGL_DECLARE_OBJECT_TYPE_INFO(NCCLUniqueId, Object);
ncclUniqueId Get() const;
std::string ToString() const;
void FromString(const std::string& str);
private:
ncclUniqueId id_;
};
DGL_DEFINE_OBJECT_REF(NCCLUniqueIdRef, NCCLUniqueId);
class NCCLCommunicator : public runtime::Object {
public:
NCCLCommunicator(int size, int rank, ncclUniqueId id);
~NCCLCommunicator();
// disable copying
NCCLCommunicator(const NCCLCommunicator& other) = delete;
NCCLCommunicator& operator=(const NCCLCommunicator& other);
ncclComm_t Get();
/**
* @brief Perform an all-to-all communication.
*
* @param send The continuous array of data to send.
* @param recv The continuous array of data to receive.
* @param count The size of data to send to each rank.
* @param stream The stream to operate on.
*/
template <typename IdType>
void AllToAll(
const IdType* send, IdType* recv, int64_t count, cudaStream_t stream);
/**
* @brief Perform an all-to-all variable sized communication.
*
* @tparam DType The type of value to send.
* @param send The arrays of data to send.
* @param send_prefix The prefix of each array to send.
* @param recv The arrays of data to receive.
* @param recv_prefix The prefix of each array to receive.
* @param type The type of data to send.
* @param stream The stream to operate on.
*/
template <typename DType>
void AllToAllV(
const DType* const send, const int64_t* send_prefix, DType* const recv,
const int64_t* recv_prefix, cudaStream_t stream);
/**
* @brief Perform an all-to-all with sparse data (idx and value pairs). By
* necessity, the sizes of each message are variable.
*
* @tparam IdType The type of index.
* @tparam DType The type of value.
* @param send_idx The set of indexes to send on the device.
* @param send_value The set of values to send on the device.
* @param num_feat The number of values per index.
* @param send_prefix The exclusive prefix sum of elements to send on the
* host.
* @param recv_idx The set of indexes to receive on the device.
* @param recv_value The set of values to receive on the device.
* @param recv_prefix The exclusive prefix sum of the number of elements to
* receive on the host.
* @param stream The stream to communicate on.
*/
template <typename IdType, typename DType>
void SparseAllToAll(
const IdType* send_idx, const DType* send_value, const int64_t num_feat,
const int64_t* send_prefix, IdType* recv_idx, DType* recv_value,
const int64_t* recv_prefix, cudaStream_t stream);
int size() const;
int rank() const;
static constexpr const char* _type_key = "cuda.NCCLCommunicator";
DGL_DECLARE_OBJECT_TYPE_INFO(NCCLCommunicator, Object);
private:
ncclComm_t comm_;
int size_;
int rank_;
};
DGL_DEFINE_OBJECT_REF(NCCLCommunicatorRef, NCCLCommunicator);
} // namespace cuda
} // namespace runtime
} // namespace dgl
#endif // DGL_RUNTIME_CUDA_NCCL_API_H_
...@@ -24,11 +24,13 @@ def test_get_node_partition_from_book(idtype): ...@@ -24,11 +24,13 @@ def test_get_node_partition_from_book(idtype):
assert partition.num_parts() == 3 assert partition.num_parts() == 3
assert partition.array_size() == 11 assert partition.array_size() == 11
# Test map_to_local
test_ids = F.copy_to(F.tensor([0, 2, 6, 7, 10], dtype=idtype), F.ctx()) test_ids = F.copy_to(F.tensor([0, 2, 6, 7, 10], dtype=idtype), F.ctx())
act_ids = partition.map_to_local(test_ids) act_ids = partition.map_to_local(test_ids)
exp_ids = F.copy_to(F.tensor([0, 2, 0, 1, 4], dtype=idtype), F.ctx()) exp_ids = F.copy_to(F.tensor([0, 2, 0, 1, 4], dtype=idtype), F.ctx())
assert F.array_equal(act_ids, exp_ids) assert F.array_equal(act_ids, exp_ids)
# Test map_to_global
test_ids = F.copy_to(F.tensor([0, 2], dtype=idtype), F.ctx()) test_ids = F.copy_to(F.tensor([0, 2], dtype=idtype), F.ctx())
act_ids = partition.map_to_global(test_ids, 0) act_ids = partition.map_to_global(test_ids, 0)
exp_ids = F.copy_to(F.tensor([0, 2], dtype=idtype), F.ctx()) exp_ids = F.copy_to(F.tensor([0, 2], dtype=idtype), F.ctx())
...@@ -43,3 +45,11 @@ def test_get_node_partition_from_book(idtype): ...@@ -43,3 +45,11 @@ def test_get_node_partition_from_book(idtype):
act_ids = partition.map_to_global(test_ids, 2) act_ids = partition.map_to_global(test_ids, 2)
exp_ids = F.copy_to(F.tensor([6, 7, 10], dtype=idtype), F.ctx()) exp_ids = F.copy_to(F.tensor([6, 7, 10], dtype=idtype), F.ctx())
assert F.array_equal(act_ids, exp_ids) assert F.array_equal(act_ids, exp_ids)
# Test generate_permutation
test_ids = F.copy_to(F.tensor([6, 0, 7, 2, 10], dtype=idtype), F.ctx())
perm, split_sum = partition.generate_permutation(test_ids)
exp_perm = F.copy_to(F.tensor([1, 3, 0, 2, 4], dtype=idtype), F.ctx())
exp_sum = F.copy_to(F.tensor([2, 0, 3]), F.ctx())
assert F.array_equal(perm, exp_perm)
assert F.array_equal(split_sum, exp_sum)
import unittest import unittest
import backend as F import backend as F
import torch
import torch.distributed as dist
from dgl.cuda import nccl from dgl.cuda import nccl
from dgl.partition import NDArrayPartition from dgl.partition import NDArrayPartition
def gen_test_id():
return "{:0256x}".format(78236728318467363)
@unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU."
)
def test_nccl_id():
nccl_id = nccl.UniqueId()
text = str(nccl_id)
nccl_id2 = nccl.UniqueId(id_str=text)
assert nccl_id == nccl_id2
nccl_id2 = nccl.UniqueId(gen_test_id())
assert nccl_id2 != nccl_id
nccl_id3 = nccl.UniqueId(str(nccl_id2))
assert nccl_id2 == nccl_id3
@unittest.skipIf( @unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU." F._default_context_str == "cpu", reason="NCCL only runs on GPU."
) )
def test_nccl_sparse_push_single_remainder(): def test_nccl_sparse_push_single_remainder():
nccl_id = nccl.UniqueId() torch.cuda.set_device("cuda:0")
comm = nccl.Communicator(1, 0, nccl_id) dist.init_process_group(
backend="nccl",
init_method="tcp://127.0.0.1:12345",
world_size=1,
rank=0,
)
index = F.randint([10000], F.int32, F.ctx(), 0, 10000) index = F.randint([10000], F.int32, F.ctx(), 0, 10000)
value = F.uniform([10000, 100], F.float32, F.ctx(), -1.0, 1.0) value = F.uniform([10000, 100], F.float32, F.ctx(), -1.0, 1.0)
part = NDArrayPartition(10000, 1, "remainder") part = NDArrayPartition(10000, 1, "remainder")
ri, rv = comm.sparse_all_to_all_push(index, value, part) ri, rv = nccl.sparse_all_to_all_push(index, value, part)
assert F.array_equal(ri, index) assert F.array_equal(ri, index)
assert F.array_equal(rv, value) assert F.array_equal(rv, value)
dist.destroy_process_group()
@unittest.skipIf( @unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU." F._default_context_str == "cpu", reason="NCCL only runs on GPU."
) )
def test_nccl_sparse_pull_single_remainder(): def test_nccl_sparse_pull_single_remainder():
nccl_id = nccl.UniqueId() torch.cuda.set_device("cuda:0")
comm = nccl.Communicator(1, 0, nccl_id) dist.init_process_group(
backend="nccl",
init_method="tcp://127.0.0.1:12345",
world_size=1,
rank=0,
)
req_index = F.randint([10000], F.int64, F.ctx(), 0, 100000) req_index = F.randint([10000], F.int64, F.ctx(), 0, 100000)
value = F.uniform([100000, 100], F.float32, F.ctx(), -1.0, 1.0) value = F.uniform([100000, 100], F.float32, F.ctx(), -1.0, 1.0)
part = NDArrayPartition(100000, 1, "remainder") part = NDArrayPartition(100000, 1, "remainder")
rv = comm.sparse_all_to_all_pull(req_index, value, part) rv = nccl.sparse_all_to_all_pull(req_index, value, part)
exp_rv = F.gather_row(value, req_index) exp_rv = F.gather_row(value, req_index)
assert F.array_equal(rv, exp_rv) assert F.array_equal(rv, exp_rv)
dist.destroy_process_group()
@unittest.skipIf( @unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU." F._default_context_str == "cpu", reason="NCCL only runs on GPU."
) )
def test_nccl_sparse_push_single_range(): def test_nccl_sparse_push_single_range():
nccl_id = nccl.UniqueId() torch.cuda.set_device("cuda:0")
comm = nccl.Communicator(1, 0, nccl_id) dist.init_process_group(
backend="nccl",
init_method="tcp://127.0.0.1:12345",
world_size=1,
rank=0,
)
index = F.randint([10000], F.int32, F.ctx(), 0, 10000) index = F.randint([10000], F.int32, F.ctx(), 0, 10000)
value = F.uniform([10000, 100], F.float32, F.ctx(), -1.0, 1.0) value = F.uniform([10000, 100], F.float32, F.ctx(), -1.0, 1.0)
...@@ -78,17 +76,24 @@ def test_nccl_sparse_push_single_range(): ...@@ -78,17 +76,24 @@ def test_nccl_sparse_push_single_range():
) )
part = NDArrayPartition(10000, 1, "range", part_ranges=part_ranges) part = NDArrayPartition(10000, 1, "range", part_ranges=part_ranges)
ri, rv = comm.sparse_all_to_all_push(index, value, part) ri, rv = nccl.sparse_all_to_all_push(index, value, part)
assert F.array_equal(ri, index) assert F.array_equal(ri, index)
assert F.array_equal(rv, value) assert F.array_equal(rv, value)
dist.destroy_process_group()
@unittest.skipIf( @unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU." F._default_context_str == "cpu", reason="NCCL only runs on GPU."
) )
def test_nccl_sparse_pull_single_range(): def test_nccl_sparse_pull_single_range():
nccl_id = nccl.UniqueId() torch.cuda.set_device("cuda:0")
comm = nccl.Communicator(1, 0, nccl_id) dist.init_process_group(
backend="nccl",
init_method="tcp://127.0.0.1:12345",
world_size=1,
rank=0,
)
req_index = F.randint([10000], F.int64, F.ctx(), 0, 100000) req_index = F.randint([10000], F.int64, F.ctx(), 0, 100000)
value = F.uniform([100000, 100], F.float32, F.ctx(), -1.0, 1.0) value = F.uniform([100000, 100], F.float32, F.ctx(), -1.0, 1.0)
...@@ -98,21 +103,15 @@ def test_nccl_sparse_pull_single_range(): ...@@ -98,21 +103,15 @@ def test_nccl_sparse_pull_single_range():
) )
part = NDArrayPartition(100000, 1, "range", part_ranges=part_ranges) part = NDArrayPartition(100000, 1, "range", part_ranges=part_ranges)
rv = comm.sparse_all_to_all_pull(req_index, value, part) rv = nccl.sparse_all_to_all_pull(req_index, value, part)
exp_rv = F.gather_row(value, req_index) exp_rv = F.gather_row(value, req_index)
assert F.array_equal(rv, exp_rv) assert F.array_equal(rv, exp_rv)
dist.destroy_process_group()
@unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU."
)
def test_nccl_support():
# this is just a smoke test, as we don't have any other way to know
# if NCCL support is compiled in right now.
nccl.is_supported()
if __name__ == "__main__": if __name__ == "__main__":
test_nccl_id() test_nccl_sparse_push_single_remainder()
test_nccl_sparse_push_single() test_nccl_sparse_pull_single_remainder()
test_nccl_sparse_pull_single() test_nccl_sparse_push_single_range()
test_nccl_sparse_pull_single_range()
...@@ -28,7 +28,7 @@ if [[ $arch == *"x86"* ]]; then ...@@ -28,7 +28,7 @@ if [[ $arch == *"x86"* ]]; then
fi fi
if [[ $1 != "cpu" ]]; then if [[ $1 != "cpu" ]]; then
CMAKE_VARS="-DUSE_CUDA=ON -DUSE_NCCL=ON $CMAKE_VARS" CMAKE_VARS="-DUSE_CUDA=ON $CMAKE_VARS"
fi fi
if [ -d build ]; then if [ -d build ]; then
......
Subproject commit e11238b3029795d33f958b5868d47c90c4f22628