Unverified commit 8d5d8962 authored by Xin Yao, committed by GitHub

[Refactor] Replace third_party/nccl with PyTorch's NCCL backend (#4989)

* expose GeneratePermutation

* add sparse_all_to_all_push

* add sparse_all_to_all_pull

* add unit test

* handle world_size=1

* remove python nccl wrapper

* remove the nccl dependency

* use pinned memory to speed up D2H copy

* fix lint

* resolve comments

* fix lint

* fix ut

* resolve comments
parent b1ec112e
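For orientation, here is a minimal caller-side sketch of the new flow (not part of the patch): it assumes a process group created by a launcher such as torchrun, one GPU per process, and illustrative names such as num_nodes and feat_dim.

import os

import torch
import torch.distributed as dist

from dgl.cuda import nccl
from dgl.partition import NDArrayPartition

# Before this PR (removed): DGL managed its own NCCL communicator, e.g.
#   uid = nccl.UniqueId(); comm = nccl.Communicator(world_size, rank, uid)
#   my_idx, my_grad = comm.sparse_all_to_all_push(idx, grad, part)
# After this PR: the torch.distributed NCCL process group is reused instead.
dist.init_process_group(backend="nccl")
torch.cuda.set_device(int(os.environ.get("LOCAL_RANK", "0")))

num_nodes, feat_dim = 10000, 16  # illustrative sizes
part = NDArrayPartition(num_nodes, dist.get_world_size(), mode="remainder")
idx = torch.randint(0, num_nodes, (1000,), device="cuda")
grad = torch.rand(1000, feat_dim, device="cuda")
my_idx, my_grad = nccl.sparse_all_to_all_push(idx, grad, part)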
......@@ -22,9 +22,6 @@
[submodule "third_party/nanoflann"]
path = third_party/nanoflann
url = https://github.com/jlblancoc/nanoflann
[submodule "third_party/nccl"]
path = third_party/nccl
url = https://github.com/nvidia/nccl
[submodule "third_party/libxsmm"]
path = third_party/libxsmm
url = https://github.com/hfp/libxsmm.git
......
......@@ -23,8 +23,6 @@ endif()
# and add set(OPTION VALUE) to override these build options.
# Alternatively, use cmake -DOPTION=VALUE through the command line.
dgl_option(USE_CUDA "Build with CUDA" OFF)
dgl_option(USE_NCCL "Build with NCCL support" OFF)
dgl_option(USE_SYSTEM_NCCL "Build using system's NCCL library" OFF)
dgl_option(USE_OPENMP "Build with OpenMP" ON)
dgl_option(USE_AVX "Build with AVX optimization" OFF)
dgl_option(USE_LIBXSMM "Build with LIBXSMM library optimization" ON)
......@@ -171,25 +169,7 @@ list(APPEND DGL_SRC ${DGL_RPC_SRC})
if(USE_CUDA)
dgl_config_cuda(DGL_CUDA_SRC)
list(APPEND DGL_SRC ${DGL_CUDA_SRC})
if(USE_NCCL)
add_definitions(-DDGL_USE_NCCL)
if (USE_SYSTEM_NCCL)
include(cmake/util/FindNccl.cmake)
include_directories(${NCCL_INCLUDE_DIR})
else()
include(cmake/modules/NCCL.cmake)
cuda_include_directories(BEFORE ${NCCL_INCLUDE_DIR})
endif()
endif(USE_NCCL)
list(APPEND DGL_LINKER_LIBS ${NCCL_LIBRARY})
endif(USE_CUDA)
if(USE_CUDA)
cuda_add_library(dgl SHARED ${DGL_SRC})
if (USE_NCCL AND NOT USE_SYSTEM_NCCL)
add_dependencies(dgl nccl_external)
endif()
else(USE_CUDA)
add_library(dgl SHARED ${DGL_SRC})
endif(USE_CUDA)
......
include(ExternalProject)
# set path to submodule
set(NCCL_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/nccl")
# NCCL doesn't have CMAKE, so build externally
ExternalProject_Add(nccl_external
SOURCE_DIR ${PROJECT_SOURCE_DIR}/third_party/nccl
BUILD_IN_SOURCE 1
CONFIGURE_COMMAND ""
BUILD_COMMAND
env
make
"src.build"
"-j"
"BUILDDIR=${NCCL_BUILD_DIR}"
BUILD_BYPRODUCTS "${NCCL_BUILD_DIR}/lib/libnccl_static.a"
INSTALL_COMMAND ""
)
# set output variables
set(NCCL_FOUND TRUE)
set(NCCL_LIBRARY "${NCCL_BUILD_DIR}/lib/libnccl_static.a")
set(NCCL_INCLUDE_DIR "${NCCL_BUILD_DIR}/include")
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Tries to find NCCL headers and libraries.
#
# Usage of this module as follows:
#
# find_package(NCCL)
#
# Variables used by this module, they can change the default behaviour and need
# to be set before calling find_package:
#
# NCCL_ROOT - When set, this path is inspected instead of standard library
# locations as the root of the NCCL installation.
# The environment variable NCCL_ROOT overrides this variable.
#
# This module defines
# Nccl_FOUND, whether nccl has been found
# NCCL_INCLUDE_DIR, directory containing header
# NCCL_LIBRARY, directory containing nccl library
# NCCL_LIB_NAME, nccl library name
# USE_NCCL_LIB_PATH, when set, NCCL_LIBRARY path is also inspected for the
# location of the nccl library. This would disable
# switching between static and shared.
#
# This module assumes that the user has already called find_package(CUDA)
#
# This file is from https://github.com/dmlc/xgboost, with modifications to
# check the version.
if (NCCL_LIBRARY)
if(NOT USE_NCCL_LIB_PATH)
# Don't cache NCCL_LIBRARY to enable switching between static and shared.
unset(NCCL_LIBRARY CACHE)
endif(NOT USE_NCCL_LIB_PATH)
endif()
if (BUILD_WITH_SHARED_NCCL)
# libnccl.so
set(NCCL_LIB_NAME nccl)
else ()
# libnccl_static.a
set(NCCL_LIB_NAME nccl_static)
endif (BUILD_WITH_SHARED_NCCL)
find_path(NCCL_INCLUDE_DIR
NAMES nccl.h
PATHS $ENV{NCCL_ROOT}/include ${NCCL_ROOT}/include)
# make sure it has point to point support
file(STRINGS "${NCCL_INCLUDE_DIR}/nccl.h" NCCL_VERSION_CODE REGEX "^#define[ \t]+NCCL_VERSION_CODE[ \t]+[0-9]+.*$" LIMIT_COUNT 1)
string(REGEX REPLACE "^.*NCCL_VERSION_CODE[ \t]+([0-9]+).*$" "\\1" NCCL_VERSION "${NCCL_VERSION_CODE}")
find_library(NCCL_LIBRARY
NAMES ${NCCL_LIB_NAME}
PATHS $ENV{NCCL_ROOT}/lib/ ${NCCL_ROOT}/lib)
if ("${NCCL_VERSION}" LESS "2700")
message(FATAL_ERROR "Require nccl >= 2700, but found ${NCCL_LIBRARY}==${NCCL_VERSION}")
else()
message(STATUS "Using nccl library: ${NCCL_LIBRARY} ${NCCL_VERSION}")
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Nccl DEFAULT_MSG
NCCL_INCLUDE_DIR NCCL_LIBRARY)
mark_as_advanced(
NCCL_INCLUDE_DIR
NCCL_LIBRARY
)
""" CUDA wrappers """
from . import nccl
from .. import backend as F
if F.get_preferred_backend() == "pytorch":
from . import nccl
"""API creating NCCL communicators."""
"""API wrapping NCCL primitives."""
from .. import backend as F
from .._ffi.function import _init_api
import torch
import torch.distributed as dist
_COMM_MODES_MAP = {"remainder": 0}
class UniqueId(object):
"""Class for allowing python code to create and communicate NCCL Unique
IDs, needed for creating communicators.
"""
def __init__(self, id_str=None):
"""Create an object reference the current NCCL unique id."""
if id_str:
if isinstance(id_str, bytes):
id_str = id_str.decode("utf-8")
self._handle = _CAPI_DGLNCCLUniqueIdFromString(id_str)
else:
self._handle = _CAPI_DGLNCCLGetUniqueId()
def get(self):
"""Get the C-handle for this object."""
return self._handle
def __str__(self):
return _CAPI_DGLNCCLUniqueIdToString(self._handle)
def __repr__(self):
return "UniqueId[{}]".format(str(self))
def __eq__(self, other):
return str(self) == str(other)
class Communicator(object):
"""High-level wrapper for NCCL communication."""
def __init__(self, size, rank, unique_id):
"""Create a new NCCL communicator.
Parameters
----------
size : int
The number of processes in the communicator.
rank : int
The rank of the current process in the communicator.
unique_id : NCCLUniqueId
The unique id of the root process (rank=0).
Examples
--------
>>> from dgl.cuda.nccl import Communicator, UniqueId
The root process will generate a unique NCCL id and communicate it
to the other processes.
>>> uid = UniqueId()
>>> store.set('nccl_root_id', str(uid))
And all other processes create unique ids from the root processes.
>>> uid = UniqueId(store.get('nccl_root_id'))
Then, all processes should create the communicator.
>>> comm = Communicator(world_size, rank, uid)
"""
assert rank < size, (
"The rank of a process must be less than the "
"size of the communicator."
)
self._handle = _CAPI_DGLNCCLCreateComm(size, rank, unique_id.get())
self._rank = rank
self._size = size
def sparse_all_to_all_push(self, idx, value, partition):
def sparse_all_to_all_push(idx, value, partition):
"""Perform an all-to-all-v operation, where by all processors send out
a set of indices and corresponding values. Indices and values,
corresponding to the current process, will copied into the output
arrays.
Note: This method requires 'torch.distributed.get_backend() == "nccl"'.
Parameters
----------
idx : tensor
idx : torch.Tensor
The 1D set of indices to send to other processors.
value : tensor
value : torch.Tensor
The multi-dimensional set of values to send to other processors.
The first dimension must match that of `idx`.
partition : NDArrayPartition
......@@ -95,9 +25,9 @@ class Communicator(object):
Returns
-------
tensor
torch.Tensor
The 1D tensor of the received indices.
tensor
torch.Tensor
The set of received values.
Examples
......@@ -108,7 +38,7 @@ class Communicator(object):
striped across processes can be generated via:
>>> from dgl.partition import NDArrayPartition
>>> part = NDArrayPartition(g.num_nodes(), comm.size(), mode='remainder' )
>>> part = NDArrayPartition(g.num_nodes(), world_size, mode='remainder')
With this partition, each processor can send values to be associated
with vertices in the graph. So if we have an array `global_idxs` of all of
......@@ -116,7 +46,7 @@ class Communicator(object):
`global_values` containing the new values associated with the neighbors,
we communicate them to the owning processes via:
>>> my_idxs, my_values = comm.sparse_all_to_all_push(global_idxs, global_values, part)
>>> my_idxs, my_values = nccl.sparse_all_to_all_push(global_idxs, global_values, part)
This communication pattern is common when communicating gradient
updates for node embeddings.
......@@ -128,26 +58,54 @@ class Communicator(object):
indices for process 0 of '[0, 8, 10, 0, 2, 4, 8, 8]', and for
process 1 of '[3, 9, 5, 9]'.
"""
out_idx, out_value = _CAPI_DGLNCCLSparseAllToAllPush(
self.get(),
F.zerocopy_to_dgl_ndarray(idx),
F.zerocopy_to_dgl_ndarray(value),
partition.get(),
)
return (
F.zerocopy_from_dgl_ndarray(out_idx),
F.zerocopy_from_dgl_ndarray(out_value),
)
if not dist.is_initialized() or dist.get_world_size() == 1:
return idx, value
assert (
dist.get_backend() == "nccl"
), "requires NCCL backend to communicate CUDA tensors."
perm, send_splits = partition.generate_permutation(idx)
perm = perm.long()
# Get receive splits.
recv_splits = torch.empty_like(send_splits)
dist.all_to_all_single(recv_splits, send_splits)
# Use pinned memory to speed up the D2H copy.
recv_splits = recv_splits.to("cpu", non_blocking=True)
send_splits = send_splits.to("cpu", non_blocking=True)
send_idx = idx[perm]
send_value = value[perm]
# Wait for the D2H copy to finish.
torch.cuda.current_stream().synchronize()
recv_sum = recv_splits.sum()
recv_splits = recv_splits.tolist()
send_splits = send_splits.tolist()
# Send idx.
recv_idx = torch.empty((recv_sum,), dtype=idx.dtype, device=idx.device)
dist.all_to_all_single(recv_idx, send_idx, recv_splits, send_splits)
# Send value.
recv_value = torch.empty(
(recv_sum, *value.shape[1:]), dtype=value.dtype, device=value.device
)
dist.all_to_all_single(recv_value, send_value, recv_splits, send_splits)
return recv_idx, recv_value
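The implementation above relies on torch.distributed.all_to_all_single with explicit output/input split sizes. As a generic illustration of those semantics (not taken from the patch; it assumes it runs on every rank of an already-initialized NCCL process group with one GPU per rank):

import torch
import torch.distributed as dist

def uneven_all_to_all(send, send_splits):
    """send_splits[i] is the number of elements this rank sends to rank i."""
    # Exchange the split sizes first so each rank knows how much it receives.
    send_counts = torch.tensor(send_splits, device=send.device)
    recv_counts = torch.empty_like(send_counts)
    dist.all_to_all_single(recv_counts, send_counts)
    recv_splits = recv_counts.tolist()
    # Exchange the payload; arguments 3 and 4 are output/input split sizes.
    recv = torch.empty(sum(recv_splits), dtype=send.dtype, device=send.device)
    dist.all_to_all_single(recv, send, recv_splits, send_splits)
    return recv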
def sparse_all_to_all_pull(self, req_idx, value, partition):
def sparse_all_to_all_pull(req_idx, value, partition):
"""Perform an all-to-all-v operation, where by all processors request
the values corresponding to their set of indices.
Note: This method requires 'torch.distributed.get_backend() == "nccl"'.
Parameters
----------
req_idx : IdArray
req_idx : torch.Tensor
The set of indices this processor is requesting.
value : NDArray
value : torch.Tensor
The multi-dimensional set of values that can be requested from
this processor.
partition : NDArrayPartition
......@@ -156,7 +114,7 @@ class Communicator(object):
Returns
-------
tensor
torch.Tensor
The set of received values, corresponding to `req_idx`.
Examples
......@@ -167,7 +125,7 @@ class Communicator(object):
striped across processes can be generated via:
>>> from dgl.partition import NDArrayPartition
>>> part = NDArrayPartition(g.num_nodes(), comm.size(), mode='remainder' )
>>> part = NDArrayPartition(g.num_nodes(), world_size, mode='remainder')
With this partition, each processor can request values/features
associated with vertices in the graph. So in the case where we have
......@@ -175,7 +133,7 @@ class Communicator(object):
has a tensor 'node_feat' storing the features of nodes it owns in
the partition, the features can be requested via:
>>> nbr_values = comm.sparse_all_to_all_pull(nbr_idxs, node_feat, part)
>>> nbr_values = nccl.sparse_all_to_all_pull(nbr_idxs, node_feat, part)
Then the two arrays 'nbr_idxs' and 'nbr_values' form the sparse
set of features, where 'nbr_idxs[i]' is the global node id, and
......@@ -183,48 +141,49 @@ class Communicator(object):
communication pattern is useful for node features or node
embeddings.
"""
out_value = _CAPI_DGLNCCLSparseAllToAllPull(
self.get(),
F.zerocopy_to_dgl_ndarray(req_idx),
F.zerocopy_to_dgl_ndarray(value),
partition.get(),
)
if not dist.is_initialized() or dist.get_world_size() == 1:
return value[req_idx.long()]
assert (
dist.get_backend() == "nccl"
), "requires NCCL backend to communicate CUDA tensors."
perm, req_splits = partition.generate_permutation(req_idx)
perm = perm.long()
# Get response splits.
resp_splits = torch.empty_like(req_splits)
dist.all_to_all_single(resp_splits, req_splits)
# Use pinned memory to speed up the D2H copy.
resp_splits = resp_splits.to("cpu", non_blocking=True)
req_splits = req_splits.to("cpu", non_blocking=True)
req_idx = req_idx[perm]
# Wait for the D2H copy to finish.
torch.cuda.current_stream().synchronize()
resp_sum = resp_splits.sum()
resp_splits = resp_splits.tolist()
req_splits = req_splits.tolist()
# Gather requested indices.
resp_idx = torch.empty(
(resp_sum,), dtype=req_idx.dtype, device=req_idx.device
)
return F.zerocopy_from_dgl_ndarray(out_value)
def get(self):
"""Get the C-Handle for this object."""
return self._handle
def rank(self):
"""Get the rank of this process in this communicator.
dist.all_to_all_single(resp_idx, req_idx, resp_splits, req_splits)
Returns
-------
int
The rank of this process.
"""
return self._rank
# Convert requested indices to local indices depending on partition.
if resp_sum > 0:
resp_idx = partition.map_to_local(resp_idx)
def size(self):
"""Get the size of this communicator.
Returns
-------
int
The number of processes in this communicator.
"""
return self._size
def is_supported():
"""Check if DGL was built with NCCL support.
Returns
-------
bool
True if NCCL support was built in.
"""
return _CAPI_DGLNCCLHasSupport()
# Collect the requested values.
req_value = torch.empty(
(req_idx.size(0), *value.shape[1:]),
dtype=value.dtype,
device=value.device,
)
dist.all_to_all_single(req_value, value[resp_idx], req_splits, resp_splits)
# Permute the value back into the requested order.
return_value = torch.empty_like(req_value)
return_value[perm] = req_value
_init_api("dgl.cuda.nccl")
return return_value
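A single-GPU usage sketch mirroring the unit tests further down this diff (the rendezvous address and tensor sizes are placeholders):

import torch
import torch.distributed as dist

from dgl.cuda import nccl
from dgl.partition import NDArrayPartition

torch.cuda.set_device("cuda:0")
dist.init_process_group(
    backend="nccl", init_method="tcp://127.0.0.1:12345", world_size=1, rank=0
)

part = NDArrayPartition(10000, dist.get_world_size(), mode="remainder")
idx = torch.randint(0, 10000, (1000,), device="cuda:0")
grad = torch.rand(1000, 16, device="cuda:0")
feat = torch.rand(10000, 16, device="cuda:0")

# Push (index, gradient) pairs to the ranks that own them ...
own_idx, own_grad = nccl.sparse_all_to_all_push(idx, grad, part)
# ... and pull the feature rows stored for a set of requested indices.
pulled = nccl.sparse_all_to_all_pull(idx, feat, part)

dist.destroy_process_group()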
......@@ -9,7 +9,6 @@ from ...partition import NDArrayPartition
from ...utils import create_shared_mem_array, get_shared_mem_array
_STORE = None
_COMM = None
class NodeEmbedding: # NodeEmbedding
......@@ -78,7 +77,6 @@ class NodeEmbedding: # NodeEmbedding
partition=None,
):
global _STORE
global _COMM
if device is None:
device = th.device("cpu")
......@@ -132,25 +130,7 @@ class NodeEmbedding: # NodeEmbedding
)
self._tensor = emb
else: # embeddings is stored in GPU memory.
# setup nccl communicator
if _COMM is None:
if rank < 0:
_COMM = nccl.Communicator(1, 0, nccl.UniqueId())
else:
# needs to be set for nccl to work
th.cuda.set_device(device)
if rank == 0:
# root process broadcasts nccl id
nccl_id = nccl.UniqueId()
self._store.set("nccl_root_id_sparse_emb", str(nccl_id))
else:
nccl_id = nccl.UniqueId(
self._store.get("nccl_root_id_sparse_emb")
)
_COMM = nccl.Communicator(
self._world_size, self._rank, nccl_id
)
self._comm = _COMM
self._comm = True
if not self._partition:
# for communication we need a partition
......@@ -161,7 +141,7 @@ class NodeEmbedding: # NodeEmbedding
)
# create local tensors for the weights
local_size = self._partition.local_size(self._comm.rank())
local_size = self._partition.local_size(max(self._rank, 0))
# TODO(dlasalle): support 16-bit/half embeddings
emb = th.empty(
......@@ -187,15 +167,15 @@ class NodeEmbedding: # NodeEmbedding
device : th.device
Target device to put the collected embeddings.
"""
if not self._comm or self._comm.size() == 1:
if not self._comm:
# embeddings are stored on the CPU
emb = self._tensor[node_ids].to(device)
else:
if self.world_size > 0:
emb = self._comm.sparse_all_to_all_pull(
# embeddings are stored on the GPU
# the following method also covers self._world_size = 0 or 1
emb = nccl.sparse_all_to_all_pull(
node_ids, self._tensor, self._partition
)
else:
emb = self._tensor[node_ids]
emb = emb.to(device)
if F.is_recording():
emb = F.attach_grad(emb)
......@@ -215,18 +195,6 @@ class NodeEmbedding: # NodeEmbedding
"""
return self._store
@property
def comm(self):
"""Return dgl.cuda.nccl.Communicator for data
sharing across processes.
Returns
-------
dgl.cuda.nccl.Communicator
Communicator used for data sharing.
"""
return self._comm
@property
def partition(self):
"""Return the partition identifying how the tensor is split across
......@@ -361,7 +329,8 @@ class NodeEmbedding: # NodeEmbedding
if self._partition:
idxs = F.copy_to(
self._partition.get_local_indices(
self._comm.rank(), ctx=F.context(self._tensor)
max(self._rank, 0),
ctx=F.context(self._tensor),
),
F.context(values),
)
......
......@@ -63,7 +63,6 @@ class SparseGradOptimizer(abc.ABC):
), "MultiGPU world_size for each embedding should be same."
assert not self._rank is None
assert not self._world_size is None
self._nccl_root_id = "SparseGradOptimizer.nccl_root_id"
def step(self):
"""The step function.
......@@ -74,7 +73,7 @@ class SparseGradOptimizer(abc.ABC):
if self._first_step:
for emb in self._params:
for _, data in emb._trace:
if data.grad.data.device.type == "cuda":
if data.grad.device.type == "cuda":
# create a communicator
if self._device:
assert (
......@@ -116,27 +115,7 @@ class SparseGradOptimizer(abc.ABC):
"""
def _comm_setup(self):
# find a store to communicate the unique id through
if len(self._params) > 0:
store = self._params[0].store
if self._rank < 0:
self._comm = nccl.Communicator(1, 0, nccl.UniqueId())
else:
th.cuda.set_device(self._device)
if self._rank == 0:
# root process broadcasts nccl id
nccl_id = nccl.UniqueId()
uid = str(nccl_id)
store.set(self._nccl_root_id, uid)
else:
uid = store.get(self._nccl_root_id)
nccl_id = nccl.UniqueId(uid)
# needs to be set for nccl to work
self._comm = nccl.Communicator(
self._world_size, self._rank, nccl_id
)
th.distributed.barrier()
self._comm = True
def _shared_setup(self):
for emb in self._params:
......@@ -162,7 +141,6 @@ class SparseGradOptimizer(abc.ABC):
self._opt_meta[emb_name] = opt_meta
def _comm_step(self):
comm = self._comm
with th.no_grad():
idx_in = {}
grad_in = {}
......@@ -203,7 +181,7 @@ class SparseGradOptimizer(abc.ABC):
(
idx_in[emb_name],
grad_in[emb_name],
) = comm.sparse_all_to_all_push(idx, grad, partition=partition)
) = nccl.sparse_all_to_all_push(idx, grad, partition=partition)
if emb.partition:
# if the embedding is partitioned, map back to indexes
# into the local tensor
......
......@@ -592,5 +592,44 @@ class NDArrayPartition(object):
)
)
def generate_permutation(self, idxs):
"""Produce a scheme that maps the given indices to separate partitions
and the counts of how many indices are in each partition.
Parameters
----------
idxs: torch.Tensor.
A tensor with shape (`num_indices`,), representing global indices.
Returns
-------
torch.Tensor.
A tensor with shape (`num_indices`,), representing the permutation
to re-order the indices by partition.
torch.Tensor.
A tensor with shape (`num_partition`,), representing the number of
indices per partition.
Examples
--------
>>> import torch
>>> from dgl.partition import NDArrayPartition
>>> part = NDArrayPartition(10, 2, mode="remainder")
>>> idx = torch.tensor([0, 2, 4, 5, 8, 8, 9], device="cuda:0")
>>> perm, splits_sum = part.generate_permutation(idx)
>>> perm
tensor([0, 1, 2, 4, 5, 3, 6], device='cuda:0')
>>> splits_sum
tensor([5, 2], device='cuda:0')
"""
ret = _CAPI_DGLNDArrayPartitionGeneratePermutation(
self._partition, F.zerocopy_to_dgl_ndarray(idxs)
)
return F.zerocopy_from_dgl_ndarray(ret(0)), F.zerocopy_from_dgl_ndarray(
ret(1)
)
_init_api("dgl.partition")
/**
* Copyright (c) 2018 by Contributors
* @file c_runtime_api.cc
* @file c_api_common.cc
* @brief DGL C API common implementations
*/
#include "c_api_common.h"
......
......@@ -12,6 +12,7 @@
#include <memory>
#include <utility>
#include "../c_api_common.h"
#include "partition_op.h"
using namespace dgl::runtime;
......@@ -251,5 +252,15 @@ DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionMapToGlobal")
*rv = part->MapToGlobal(idxs, part_id);
});
DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionGeneratePermutation")
.set_body([](DGLArgs args, DGLRetValue* rv) {
NDArrayPartitionRef part = args[0];
IdArray idxs = args[1];
std::pair<IdArray, NDArray> part_perm = part->GeneratePermutation(idxs);
*rv =
ConvertNDArrayVectorToPackedFunc({part_perm.first, part_perm.second});
});
} // namespace partition
} // namespace dgl
/**
* Copyright (c) 2021-2022 by Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* @file nccl_api.h
* @brief Wrapper around NCCL routines.
*/
#ifndef DGL_RUNTIME_CUDA_NCCL_API_H_
#define DGL_RUNTIME_CUDA_NCCL_API_H_
#ifdef DGL_USE_NCCL
#include "nccl.h"
#else
// if not compiling with NCCL, this class will only support communicators of
// size 1.
#define NCCL_UNIQUE_ID_BYTES 128
typedef struct {
char internal[NCCL_UNIQUE_ID_BYTES];
} ncclUniqueId;
typedef int ncclComm_t;
#endif
#include <dgl/runtime/object.h>
#include <string>
namespace dgl {
namespace runtime {
namespace cuda {
class NCCLUniqueId : public runtime::Object {
public:
NCCLUniqueId();
static constexpr const char* _type_key = "cuda.NCCLUniqueId";
DGL_DECLARE_OBJECT_TYPE_INFO(NCCLUniqueId, Object);
ncclUniqueId Get() const;
std::string ToString() const;
void FromString(const std::string& str);
private:
ncclUniqueId id_;
};
DGL_DEFINE_OBJECT_REF(NCCLUniqueIdRef, NCCLUniqueId);
class NCCLCommunicator : public runtime::Object {
public:
NCCLCommunicator(int size, int rank, ncclUniqueId id);
~NCCLCommunicator();
// disable copying
NCCLCommunicator(const NCCLCommunicator& other) = delete;
NCCLCommunicator& operator=(const NCCLCommunicator& other);
ncclComm_t Get();
/**
* @brief Perform an all-to-all communication.
*
* @param send The contiguous array of data to send.
* @param recv The contiguous array of data to receive.
* @param count The size of data to send to each rank.
* @param stream The stream to operate on.
*/
template <typename IdType>
void AllToAll(
const IdType* send, IdType* recv, int64_t count, cudaStream_t stream);
/**
* @brief Perform an all-to-all variable sized communication.
*
* @tparam DType The type of value to send.
* @param send The arrays of data to send.
* @param send_prefix The prefix of each array to send.
* @param recv The arrays of data to receive.
* @param recv_prefix The prefix of each array to receive.
* @param type The type of data to send.
* @param stream The stream to operate on.
*/
template <typename DType>
void AllToAllV(
const DType* const send, const int64_t* send_prefix, DType* const recv,
const int64_t* recv_prefix, cudaStream_t stream);
/**
* @brief Perform an all-to-all with sparse data (idx and value pairs). By
* necessity, the sizes of each message are variable.
*
* @tparam IdType The type of index.
* @tparam DType The type of value.
* @param send_idx The set of indexes to send on the device.
* @param send_value The set of values to send on the device.
* @param num_feat The number of values per index.
* @param send_prefix The exclusive prefix sum of elements to send on the
* host.
* @param recv_idx The set of indexes to receive on the device.
* @param recv_value The set of values to receive on the device.
* @param recv_prefix The exclusive prefix sum of the number of elements to
* receive on the host.
* @param stream The stream to communicate on.
*/
template <typename IdType, typename DType>
void SparseAllToAll(
const IdType* send_idx, const DType* send_value, const int64_t num_feat,
const int64_t* send_prefix, IdType* recv_idx, DType* recv_value,
const int64_t* recv_prefix, cudaStream_t stream);
int size() const;
int rank() const;
static constexpr const char* _type_key = "cuda.NCCLCommunicator";
DGL_DECLARE_OBJECT_TYPE_INFO(NCCLCommunicator, Object);
private:
ncclComm_t comm_;
int size_;
int rank_;
};
DGL_DEFINE_OBJECT_REF(NCCLCommunicatorRef, NCCLCommunicator);
} // namespace cuda
} // namespace runtime
} // namespace dgl
#endif // DGL_RUNTIME_CUDA_NCCL_API_H_
......@@ -24,11 +24,13 @@ def test_get_node_partition_from_book(idtype):
assert partition.num_parts() == 3
assert partition.array_size() == 11
# Test map_to_local
test_ids = F.copy_to(F.tensor([0, 2, 6, 7, 10], dtype=idtype), F.ctx())
act_ids = partition.map_to_local(test_ids)
exp_ids = F.copy_to(F.tensor([0, 2, 0, 1, 4], dtype=idtype), F.ctx())
assert F.array_equal(act_ids, exp_ids)
# Test map_to_global
test_ids = F.copy_to(F.tensor([0, 2], dtype=idtype), F.ctx())
act_ids = partition.map_to_global(test_ids, 0)
exp_ids = F.copy_to(F.tensor([0, 2], dtype=idtype), F.ctx())
......@@ -43,3 +45,11 @@ def test_get_node_partition_from_book(idtype):
act_ids = partition.map_to_global(test_ids, 2)
exp_ids = F.copy_to(F.tensor([6, 7, 10], dtype=idtype), F.ctx())
assert F.array_equal(act_ids, exp_ids)
# Test generate_permutation
test_ids = F.copy_to(F.tensor([6, 0, 7, 2, 10], dtype=idtype), F.ctx())
perm, split_sum = partition.generate_permutation(test_ids)
exp_perm = F.copy_to(F.tensor([1, 3, 0, 2, 4], dtype=idtype), F.ctx())
exp_sum = F.copy_to(F.tensor([2, 0, 3]), F.ctx())
assert F.array_equal(perm, exp_perm)
assert F.array_equal(split_sum, exp_sum)
import unittest
import backend as F
import torch
import torch.distributed as dist
from dgl.cuda import nccl
from dgl.partition import NDArrayPartition
def gen_test_id():
return "{:0256x}".format(78236728318467363)
@unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU."
)
def test_nccl_id():
nccl_id = nccl.UniqueId()
text = str(nccl_id)
nccl_id2 = nccl.UniqueId(id_str=text)
assert nccl_id == nccl_id2
nccl_id2 = nccl.UniqueId(gen_test_id())
assert nccl_id2 != nccl_id
nccl_id3 = nccl.UniqueId(str(nccl_id2))
assert nccl_id2 == nccl_id3
@unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU."
)
def test_nccl_sparse_push_single_remainder():
nccl_id = nccl.UniqueId()
comm = nccl.Communicator(1, 0, nccl_id)
torch.cuda.set_device("cuda:0")
dist.init_process_group(
backend="nccl",
init_method="tcp://127.0.0.1:12345",
world_size=1,
rank=0,
)
index = F.randint([10000], F.int32, F.ctx(), 0, 10000)
value = F.uniform([10000, 100], F.float32, F.ctx(), -1.0, 1.0)
part = NDArrayPartition(10000, 1, "remainder")
ri, rv = comm.sparse_all_to_all_push(index, value, part)
ri, rv = nccl.sparse_all_to_all_push(index, value, part)
assert F.array_equal(ri, index)
assert F.array_equal(rv, value)
dist.destroy_process_group()
@unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU."
)
def test_nccl_sparse_pull_single_remainder():
nccl_id = nccl.UniqueId()
comm = nccl.Communicator(1, 0, nccl_id)
torch.cuda.set_device("cuda:0")
dist.init_process_group(
backend="nccl",
init_method="tcp://127.0.0.1:12345",
world_size=1,
rank=0,
)
req_index = F.randint([10000], F.int64, F.ctx(), 0, 100000)
value = F.uniform([100000, 100], F.float32, F.ctx(), -1.0, 1.0)
part = NDArrayPartition(100000, 1, "remainder")
rv = comm.sparse_all_to_all_pull(req_index, value, part)
rv = nccl.sparse_all_to_all_pull(req_index, value, part)
exp_rv = F.gather_row(value, req_index)
assert F.array_equal(rv, exp_rv)
dist.destroy_process_group()
@unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU."
)
def test_nccl_sparse_push_single_range():
nccl_id = nccl.UniqueId()
comm = nccl.Communicator(1, 0, nccl_id)
torch.cuda.set_device("cuda:0")
dist.init_process_group(
backend="nccl",
init_method="tcp://127.0.0.1:12345",
world_size=1,
rank=0,
)
index = F.randint([10000], F.int32, F.ctx(), 0, 10000)
value = F.uniform([10000, 100], F.float32, F.ctx(), -1.0, 1.0)
......@@ -78,17 +76,24 @@ def test_nccl_sparse_push_single_range():
)
part = NDArrayPartition(10000, 1, "range", part_ranges=part_ranges)
ri, rv = comm.sparse_all_to_all_push(index, value, part)
ri, rv = nccl.sparse_all_to_all_push(index, value, part)
assert F.array_equal(ri, index)
assert F.array_equal(rv, value)
dist.destroy_process_group()
@unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU."
)
def test_nccl_sparse_pull_single_range():
nccl_id = nccl.UniqueId()
comm = nccl.Communicator(1, 0, nccl_id)
torch.cuda.set_device("cuda:0")
dist.init_process_group(
backend="nccl",
init_method="tcp://127.0.0.1:12345",
world_size=1,
rank=0,
)
req_index = F.randint([10000], F.int64, F.ctx(), 0, 100000)
value = F.uniform([100000, 100], F.float32, F.ctx(), -1.0, 1.0)
......@@ -98,21 +103,15 @@ def test_nccl_sparse_pull_single_range():
)
part = NDArrayPartition(100000, 1, "range", part_ranges=part_ranges)
rv = comm.sparse_all_to_all_pull(req_index, value, part)
rv = nccl.sparse_all_to_all_pull(req_index, value, part)
exp_rv = F.gather_row(value, req_index)
assert F.array_equal(rv, exp_rv)
@unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU."
)
def test_nccl_support():
# this is just a smoke test, as we don't have any other way to know
# if NCCL support is compiled in right now.
nccl.is_supported()
dist.destroy_process_group()
if __name__ == "__main__":
test_nccl_id()
test_nccl_sparse_push_single()
test_nccl_sparse_pull_single()
test_nccl_sparse_push_single_remainder()
test_nccl_sparse_pull_single_remainder()
test_nccl_sparse_push_single_range()
test_nccl_sparse_pull_single_range()
......@@ -28,7 +28,7 @@ if [[ $arch == *"x86"* ]]; then
fi
if [[ $1 != "cpu" ]]; then
CMAKE_VARS="-DUSE_CUDA=ON -DUSE_NCCL=ON $CMAKE_VARS"
CMAKE_VARS="-DUSE_CUDA=ON $CMAKE_VARS"
fi
if [ -d build ]; then
......
Subproject commit e11238b3029795d33f958b5868d47c90c4f22628