Unverified Commit 8d5d8962 authored by Xin Yao, committed by GitHub

[Refactor] Replace third_party/nccl with PyTorch's NCCL backend (#4989)

* expose GeneratePermutation

* add sparse_all_to_all_push

* add sparse_all_to_all_pull

* add unit test

* handle world_size=1

* remove python nccl wrapper

* remove the nccl dependency

* use pinned memory to speed up D2H copy (see the sketch after this list)

* fix lint

* resolve comments

* fix lint

* fix ut

* resolve comments
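
For context, the new implementation replaces the custom NCCL communicator with `torch.distributed` collectives. A minimal sketch of the underlying pattern — exchange per-rank split sizes first, then issue an uneven `all_to_all_single` — is shown below (illustrative only; `uneven_all_to_all` and its tensor names are not part of this change, and an already-initialized NCCL process group is assumed):

```python
import torch
import torch.distributed as dist

def uneven_all_to_all(send, send_splits):
    """Exchange a 1D CUDA tensor whose per-rank chunk sizes differ across ranks."""
    # 1. Fixed-size exchange: tell every rank how many elements to expect from us.
    recv_splits = torch.empty_like(send_splits)
    dist.all_to_all_single(recv_splits, send_splits)

    # 2. Move both split tensors to the host; all_to_all_single expects Python
    #    lists of sizes for the variable-sized exchange.
    recv_splits_cpu = recv_splits.to("cpu", non_blocking=True)
    send_splits_cpu = send_splits.to("cpu", non_blocking=True)
    torch.cuda.current_stream().synchronize()  # wait for the D2H copies

    # 3. Variable-sized exchange of the payload itself.
    recv = torch.empty(
        int(recv_splits_cpu.sum()), dtype=send.dtype, device=send.device
    )
    dist.all_to_all_single(
        recv, send, recv_splits_cpu.tolist(), send_splits_cpu.tolist()
    )
    return recv
```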
parent b1ec112e
...@@ -22,9 +22,6 @@ ...@@ -22,9 +22,6 @@
[submodule "third_party/nanoflann"] [submodule "third_party/nanoflann"]
path = third_party/nanoflann path = third_party/nanoflann
url = https://github.com/jlblancoc/nanoflann url = https://github.com/jlblancoc/nanoflann
[submodule "third_party/nccl"]
path = third_party/nccl
url = https://github.com/nvidia/nccl
[submodule "third_party/libxsmm"] [submodule "third_party/libxsmm"]
path = third_party/libxsmm path = third_party/libxsmm
url = https://github.com/hfp/libxsmm.git url = https://github.com/hfp/libxsmm.git
......
...@@ -23,8 +23,6 @@ endif() ...@@ -23,8 +23,6 @@ endif()
# and add set(OPTION VALUE) to override these build options. # and add set(OPTION VALUE) to override these build options.
# Alternatively, use cmake -DOPTION=VALUE through command-line. # Alternatively, use cmake -DOPTION=VALUE through command-line.
dgl_option(USE_CUDA "Build with CUDA" OFF) dgl_option(USE_CUDA "Build with CUDA" OFF)
dgl_option(USE_NCCL "Build with NCCL support" OFF)
dgl_option(USE_SYSTEM_NCCL "Build using system's NCCL library" OFF)
dgl_option(USE_OPENMP "Build with OpenMP" ON) dgl_option(USE_OPENMP "Build with OpenMP" ON)
dgl_option(USE_AVX "Build with AVX optimization" OFF) dgl_option(USE_AVX "Build with AVX optimization" OFF)
dgl_option(USE_LIBXSMM "Build with LIBXSMM library optimization" ON) dgl_option(USE_LIBXSMM "Build with LIBXSMM library optimization" ON)
...@@ -171,25 +169,7 @@ list(APPEND DGL_SRC ${DGL_RPC_SRC}) ...@@ -171,25 +169,7 @@ list(APPEND DGL_SRC ${DGL_RPC_SRC})
if(USE_CUDA) if(USE_CUDA)
dgl_config_cuda(DGL_CUDA_SRC) dgl_config_cuda(DGL_CUDA_SRC)
list(APPEND DGL_SRC ${DGL_CUDA_SRC}) list(APPEND DGL_SRC ${DGL_CUDA_SRC})
if(USE_NCCL)
add_definitions(-DDGL_USE_NCCL)
if (USE_SYSTEM_NCCL)
include(cmake/util/FindNccl.cmake)
include_directories(${NCCL_INCLUDE_DIR})
else()
include(cmake/modules/NCCL.cmake)
cuda_include_directories(BEFORE ${NCCL_INCLUDE_DIR})
endif()
endif(USE_NCCL)
list(APPEND DGL_LINKER_LIBS ${NCCL_LIBRARY})
endif(USE_CUDA)
if(USE_CUDA)
cuda_add_library(dgl SHARED ${DGL_SRC}) cuda_add_library(dgl SHARED ${DGL_SRC})
if (USE_NCCL AND NOT USE_SYSTEM_NCCL)
add_dependencies(dgl nccl_external)
endif()
else(USE_CUDA) else(USE_CUDA)
add_library(dgl SHARED ${DGL_SRC}) add_library(dgl SHARED ${DGL_SRC})
endif(USE_CUDA) endif(USE_CUDA)
......
include(ExternalProject)
# set path to submodule
set(NCCL_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/nccl")
# NCCL doesn't have CMAKE, so build externally
ExternalProject_Add(nccl_external
SOURCE_DIR ${PROJECT_SOURCE_DIR}/third_party/nccl
BUILD_IN_SOURCE 1
CONFIGURE_COMMAND ""
BUILD_COMMAND
env
make
"src.build"
"-j"
"BUILDDIR=${NCCL_BUILD_DIR}"
BUILD_BYPRODUCTS "${NCCL_BUILD_DIR}/lib/libnccl_static.a"
INSTALL_COMMAND ""
)
# set output variables
set(NCCL_FOUND TRUE)
set(NCCL_LIBRARY "${NCCL_BUILD_DIR}/lib/libnccl_static.a")
set(NCCL_INCLUDE_DIR "${NCCL_BUILD_DIR}/include")
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Tries to find NCCL headers and libraries.
#
# Usage of this module as follows:
#
# find_package(NCCL)
#
# Variables used by this module, they can change the default behaviour and need
# to be set before calling find_package:
#
# NCCL_ROOT - When set, this path is inspected instead of standard library
# locations as the root of the NCCL installation.
# The environment variable NCCL_ROOT overrides this variable.
#
# This module defines
# Nccl_FOUND, whether nccl has been found
# NCCL_INCLUDE_DIR, directory containing header
# NCCL_LIBRARY, directory containing nccl library
# NCCL_LIB_NAME, nccl library name
# USE_NCCL_LIB_PATH, when set, NCCL_LIBRARY path is also inspected for the
# location of the nccl library. This would disable
# switching between static and shared.
#
# This module assumes that the user has already called find_package(CUDA)
#
# This file is from https://github.com/dmlc/xgboost, with modifications to
# check the version.
if (NCCL_LIBRARY)
if(NOT USE_NCCL_LIB_PATH)
# Don't cache NCCL_LIBRARY to enable switching between static and shared.
unset(NCCL_LIBRARY CACHE)
endif(NOT USE_NCCL_LIB_PATH)
endif()
if (BUILD_WITH_SHARED_NCCL)
# libnccl.so
set(NCCL_LIB_NAME nccl)
else ()
# libnccl_static.a
set(NCCL_LIB_NAME nccl_static)
endif (BUILD_WITH_SHARED_NCCL)
find_path(NCCL_INCLUDE_DIR
NAMES nccl.h
PATHS $ENV{NCCL_ROOT}/include ${NCCL_ROOT}/include)
# make sure it has point-to-point support
file(STRINGS "${NCCL_INCLUDE_DIR}/nccl.h" NCCL_VERSION_CODE REGEX "^#define[ \t]+NCCL_VERSION_CODE[ \t]+[0-9]+.*$" LIMIT_COUNT 1)
string(REGEX REPLACE "^.*NCCL_VERSION_CODE[ \t]+([0-9]+).*$" "\\1" NCCL_VERSION "${NCCL_VERSION_CODE}")
find_library(NCCL_LIBRARY
NAMES ${NCCL_LIB_NAME}
PATHS $ENV{NCCL_ROOT}/lib/ ${NCCL_ROOT}/lib)
if ("${NCCL_VERSION}" LESS "2700")
message(FATAL_ERROR "Require nccl >= 2700, but found ${NCCL_LIBRARY}==${NCCL_VERSION}")
else()
message(STATUS "Using nccl library: ${NCCL_LIBRARY} ${NCCL_VERSION}")
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Nccl DEFAULT_MSG
NCCL_INCLUDE_DIR NCCL_LIBRARY)
mark_as_advanced(
NCCL_INCLUDE_DIR
NCCL_LIBRARY
)
""" CUDA wrappers """ """ CUDA wrappers """
from . import nccl from .. import backend as F
if F.get_preferred_backend() == "pytorch":
from . import nccl
"""API creating NCCL communicators.""" """API wrapping NCCL primitives."""
from .. import backend as F import torch
from .._ffi.function import _init_api import torch.distributed as dist
_COMM_MODES_MAP = {"remainder": 0}
def sparse_all_to_all_push(idx, value, partition):
class UniqueId(object):
"""Class for allowing python code to create and communicate NCCL Unique
IDs, needed for creating communicators.
"""
def __init__(self, id_str=None):
"""Create an object reference the current NCCL unique id."""
if id_str:
if isinstance(id_str, bytes):
id_str = id_str.decode("utf-8")
self._handle = _CAPI_DGLNCCLUniqueIdFromString(id_str)
else:
self._handle = _CAPI_DGLNCCLGetUniqueId()
def get(self):
"""Get the C-handle for this object."""
return self._handle
def __str__(self):
return _CAPI_DGLNCCLUniqueIdToString(self._handle)
def __repr__(self):
return "UniqueId[{}]".format(str(self))
def __eq__(self, other):
return str(self) == str(other)
class Communicator(object):
"""High-level wrapper for NCCL communication."""
def __init__(self, size, rank, unique_id):
"""Create a new NCCL communicator.
Parameters
----------
size : int
The number of processes in the communicator.
rank : int
The rank of the current process in the communicator.
unique_id : NCCLUniqueId
The unique id of the root process (rank=0).
Examples
--------
>>> from dgl.cuda.nccl import Communicator, UniqueId
The root process will generate a unique NCCL id and communicate it
to the other processes.
>>> uid = UniqueId()
>>> store.set('nccl_root_id', str(uid))
And all other processes create unique ids from the root process.
>>> uid = UniqueId(store.get('nccl_root_id'))
Then, all processes should create the communicator.
>>> comm = Communicator(world_size, rank, uid)
"""
assert rank < size, (
"The rank of a process must be less than the "
"size of the communicator."
)
self._handle = _CAPI_DGLNCCLCreateComm(size, rank, unique_id.get())
self._rank = rank
self._size = size
def sparse_all_to_all_push(self, idx, value, partition):
"""Perform an all-to-all-v operation, where by all processors send out """Perform an all-to-all-v operation, where by all processors send out
a set of indices and corresponding values. Indices and values, a set of indices and corresponding values. Indices and values,
corresponding to the current process, will copied into the output corresponding to the current process, will copied into the output
arrays. arrays.
Note: This function requires 'torch.distributed.get_backend() == "nccl"'.
Parameters Parameters
---------- ----------
idx : tensor idx : torch.Tensor
The 1D set of indices to send to other processors. The 1D set of indices to send to other processors.
value : tensor value : torch.Tensor
The multi-dimension set of values to send to other processors. The multi-dimension set of values to send to other processors.
The first dimension must match that of `idx`. The first dimension must match that of `idx`.
partition : NDArrayPartition partition : NDArrayPartition
...@@ -95,9 +25,9 @@ class Communicator(object): ...@@ -95,9 +25,9 @@ class Communicator(object):
Returns Returns
------- -------
tensor torch.Tensor
The 1D tensor of the received indices. The 1D tensor of the received indices.
tensor torch.Tensor
The set of received values. The set of received values.
Examples Examples
...@@ -108,7 +38,7 @@ class Communicator(object): ...@@ -108,7 +38,7 @@ class Communicator(object):
striped across processes can be generated via: striped across processes can be generated via:
>>> from dgl.partition import NDArrayPartition >>> from dgl.partition import NDArrayPartition
>>> part = NDArrayPartition(g.num_nodes(), comm.size(), mode='remainder' ) >>> part = NDArrayPartition(g.num_nodes(), world_size, mode='remainder')
With this partition, each processor can send values to be associated With this partition, each processor can send values to be associated
with vertices in the graph. So if we have an array `global_idxs` of all of with vertices in the graph. So if we have an array `global_idxs` of all of
...@@ -116,7 +46,7 @@ class Communicator(object): ...@@ -116,7 +46,7 @@ class Communicator(object):
`global_values` containing the new values associated with the neighbors, `global_values` containing the new values associated with the neighbors,
we communicate them to the owning processes via: we communicate them to the owning processes via:
>>> my_idxs, my_values = comm.sparse_all_to_all_push(global_idxs, global_values, part) >>> my_idxs, my_values = nccl.sparse_all_to_all_push(global_idxs, global_values, part)
This communication pattern is common when communicating gradient This communication pattern is common when communicating gradient
updates for node embeddings. updates for node embeddings.
...@@ -128,26 +58,54 @@ class Communicator(object): ...@@ -128,26 +58,54 @@ class Communicator(object):
indices for process 0 of '[0, 8, 10, 0, 2, 4, 8, 8]', and for indices for process 0 of '[0, 8, 10, 0, 2, 4, 8, 8]', and for
process 1 of '[3, 9, 5, 9]'. process 1 of '[3, 9, 5, 9]'.
""" """
out_idx, out_value = _CAPI_DGLNCCLSparseAllToAllPush( if not dist.is_initialized() or dist.get_world_size() == 1:
self.get(), return idx, value
F.zerocopy_to_dgl_ndarray(idx), assert (
F.zerocopy_to_dgl_ndarray(value), dist.get_backend() == "nccl"
partition.get(), ), "requires NCCL backend to communicate CUDA tensors."
)
return ( perm, send_splits = partition.generate_permutation(idx)
F.zerocopy_from_dgl_ndarray(out_idx), perm = perm.long()
F.zerocopy_from_dgl_ndarray(out_value),
# Get receive splits.
recv_splits = torch.empty_like(send_splits)
dist.all_to_all_single(recv_splits, send_splits)
# Use pinned memory to speed up D2H copy.
recv_splits = recv_splits.to("cpu", non_blocking=True)
send_splits = send_splits.to("cpu", non_blocking=True)
send_idx = idx[perm]
send_value = value[perm]
# Wait for the D2H copy to finish.
torch.cuda.current_stream().synchronize()
recv_sum = recv_splits.sum()
recv_splits = recv_splits.tolist()
send_splits = send_splits.tolist()
# Send idx.
recv_idx = torch.empty((recv_sum,), dtype=idx.dtype, device=idx.device)
dist.all_to_all_single(recv_idx, send_idx, recv_splits, send_splits)
# Send value.
recv_value = torch.empty(
(recv_sum, *value.shape[1:]), dtype=value.dtype, device=value.device
) )
dist.all_to_all_single(recv_value, send_value, recv_splits, send_splits)
return recv_idx, recv_value
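
A usage sketch of the new module-level function under `torch.distributed` (hypothetical two-GPU `torchrun` launch; the script name, tensor sizes, and `num_nodes` are illustrative):

```python
# Hypothetical launch: torchrun --nproc_per_node=2 push_example.py
import torch
import torch.distributed as dist
from dgl.cuda import nccl
from dgl.partition import NDArrayPartition

dist.init_process_group(backend="nccl")
rank = dist.get_rank()
torch.cuda.set_device(rank)  # single-node launch assumed, so rank == local rank

num_nodes = 10
part = NDArrayPartition(num_nodes, dist.get_world_size(), mode="remainder")

# Each rank pushes (index, gradient) pairs for arbitrary global node ids; after
# the call it holds only the pairs that fall into its own partition.
idx = torch.randint(0, num_nodes, (6,), device="cuda")
grad = torch.randn(6, 4, device="cuda")
my_idx, my_grad = nccl.sparse_all_to_all_push(idx, grad, part)

dist.destroy_process_group()
```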
def sparse_all_to_all_pull(self, req_idx, value, partition): def sparse_all_to_all_pull(req_idx, value, partition):
"""Perform an all-to-all-v operation, where by all processors request """Perform an all-to-all-v operation, where by all processors request
the values corresponding to their set of indices. the values corresponding to their set of indices.
Note: This function requires 'torch.distributed.get_backend() == "nccl"'.
Parameters Parameters
---------- ----------
req_idx : IdArray req_idx : torch.Tensor
The set of indices this processor is requesting. The set of indices this processor is requesting.
value : NDArray value : torch.Tensor
The multi-dimension set of values that can be requested from The multi-dimension set of values that can be requested from
this processor. this processor.
partition : NDArrayPartition partition : NDArrayPartition
...@@ -156,7 +114,7 @@ class Communicator(object): ...@@ -156,7 +114,7 @@ class Communicator(object):
Returns Returns
------- -------
tensor torch.Tensor
The set of received values, corresponding to `req_idx`. The set of received values, corresponding to `req_idx`.
Examples Examples
...@@ -167,7 +125,7 @@ class Communicator(object): ...@@ -167,7 +125,7 @@ class Communicator(object):
striped across processes can be generated via: striped across processes can be generated via:
>>> from dgl.partition import NDArrayPartition >>> from dgl.partition import NDArrayPartition
>>> part = NDArrayPartition(g.num_nodes(), comm.size(), mode='remainder' ) >>> part = NDArrayPartition(g.num_nodes(), world_size, mode='remainder')
With this partition, each processor can request values/features With this partition, each processor can request values/features
associated with vertices in the graph. So in the case where we have associated with vertices in the graph. So in the case where we have
...@@ -175,7 +133,7 @@ class Communicator(object): ...@@ -175,7 +133,7 @@ class Communicator(object):
has a tensor 'node_feat' storing the features of nodes it owns in has a tensor 'node_feat' storing the features of nodes it owns in
the partition, the features can be requested via: the partition, the features can be requested via:
>>> nbr_values = comm.sparse_all_to_all_pull(nbr_idxs, node_feat, part) >>> nbr_values = nccl.sparse_all_to_all_pull(nbr_idxs, node_feat, part)
Then the two arrays 'nbr_idxs' and 'nbr_values' form the sparse Then the two arrays 'nbr_idxs' and 'nbr_values' form the sparse
set of features, where 'nbr_idxs[i]' is the global node id, and set of features, where 'nbr_idxs[i]' is the global node id, and
...@@ -183,48 +141,49 @@ class Communicator(object): ...@@ -183,48 +141,49 @@ class Communicator(object):
communication pattern is useful for node features or node communication pattern is useful for node features or node
embeddings. embeddings.
""" """
out_value = _CAPI_DGLNCCLSparseAllToAllPull( if not dist.is_initialized() or dist.get_world_size() == 1:
self.get(), return value[req_idx.long()]
F.zerocopy_to_dgl_ndarray(req_idx), assert (
F.zerocopy_to_dgl_ndarray(value), dist.get_backend() == "nccl"
partition.get(), ), "requires NCCL backend to communicate CUDA tensors."
perm, req_splits = partition.generate_permutation(req_idx)
perm = perm.long()
# Get response splits.
resp_splits = torch.empty_like(req_splits)
dist.all_to_all_single(resp_splits, req_splits)
# Use pinned memory to speed up D2H copy.
resp_splits = resp_splits.to("cpu", non_blocking=True)
req_splits = req_splits.to("cpu", non_blocking=True)
req_idx = req_idx[perm]
# Wait for the D2H copy to finish.
torch.cuda.current_stream().synchronize()
resp_sum = resp_splits.sum()
resp_splits = resp_splits.tolist()
req_splits = req_splits.tolist()
# Gather requested indices.
resp_idx = torch.empty(
(resp_sum,), dtype=req_idx.dtype, device=req_idx.device
) )
return F.zerocopy_from_dgl_ndarray(out_value) dist.all_to_all_single(resp_idx, req_idx, resp_splits, req_splits)
def get(self):
"""Get the C-Handle for this object."""
return self._handle
def rank(self):
"""Get the rank of this process in this communicator.
Returns # Convert requested indices to local indices depending on partition.
------- if resp_sum > 0:
int resp_idx = partition.map_to_local(resp_idx)
The rank of this process.
"""
return self._rank
def size(self): # Collect the request value.
"""Get the size of this communicator. req_value = torch.empty(
(req_idx.size(0), *value.shape[1:]),
Returns dtype=value.dtype,
------- device=value.device,
int )
The number of processes in this communicator. dist.all_to_all_single(req_value, value[resp_idx], req_splits, resp_splits)
"""
return self._size
def is_supported():
"""Check if DGL was built with NCCL support.
Returns
-------
bool
True if NCCL support was built in.
"""
return _CAPI_DGLNCCLHasSupport()
# Permute the value back into the requested order.
return_value = torch.empty_like(req_value)
return_value[perm] = req_value
_init_api("dgl.cuda.nccl") return return_value
...@@ -9,7 +9,6 @@ from ...partition import NDArrayPartition ...@@ -9,7 +9,6 @@ from ...partition import NDArrayPartition
from ...utils import create_shared_mem_array, get_shared_mem_array from ...utils import create_shared_mem_array, get_shared_mem_array
_STORE = None _STORE = None
_COMM = None
class NodeEmbedding: # NodeEmbedding class NodeEmbedding: # NodeEmbedding
...@@ -78,7 +77,6 @@ class NodeEmbedding: # NodeEmbedding ...@@ -78,7 +77,6 @@ class NodeEmbedding: # NodeEmbedding
partition=None, partition=None,
): ):
global _STORE global _STORE
global _COMM
if device is None: if device is None:
device = th.device("cpu") device = th.device("cpu")
...@@ -132,25 +130,7 @@ class NodeEmbedding: # NodeEmbedding ...@@ -132,25 +130,7 @@ class NodeEmbedding: # NodeEmbedding
) )
self._tensor = emb self._tensor = emb
else: # embeddings is stored in GPU memory. else: # embeddings is stored in GPU memory.
# setup nccl communicator self._comm = True
if _COMM is None:
if rank < 0:
_COMM = nccl.Communicator(1, 0, nccl.UniqueId())
else:
# needs to be set for nccl to work
th.cuda.set_device(device)
if rank == 0:
# root process broadcasts nccl id
nccl_id = nccl.UniqueId()
self._store.set("nccl_root_id_sparse_emb", str(nccl_id))
else:
nccl_id = nccl.UniqueId(
self._store.get("nccl_root_id_sparse_emb")
)
_COMM = nccl.Communicator(
self._world_size, self._rank, nccl_id
)
self._comm = _COMM
if not self._partition: if not self._partition:
# for communication we need a partition # for communication we need a partition
...@@ -161,7 +141,7 @@ class NodeEmbedding: # NodeEmbedding ...@@ -161,7 +141,7 @@ class NodeEmbedding: # NodeEmbedding
) )
# create local tensors for the weights # create local tensors for the weights
local_size = self._partition.local_size(self._comm.rank()) local_size = self._partition.local_size(max(self._rank, 0))
# TODO(dlasalle): support 16-bit/half embeddings # TODO(dlasalle): support 16-bit/half embeddings
emb = th.empty( emb = th.empty(
...@@ -187,15 +167,15 @@ class NodeEmbedding: # NodeEmbedding ...@@ -187,15 +167,15 @@ class NodeEmbedding: # NodeEmbedding
device : th.device device : th.device
Target device to put the collected embeddings. Target device to put the collected embeddings.
""" """
if not self._comm or self._comm.size() == 1: if not self._comm:
# embeddings are stored on the CPU
emb = self._tensor[node_ids].to(device) emb = self._tensor[node_ids].to(device)
else: else:
if self.world_size > 0: # embeddings are stored on the GPU
emb = self._comm.sparse_all_to_all_pull( # the following method also covers self._world_size = 0 or 1
emb = nccl.sparse_all_to_all_pull(
node_ids, self._tensor, self._partition node_ids, self._tensor, self._partition
) )
else:
emb = self._tensor[node_ids]
emb = emb.to(device) emb = emb.to(device)
if F.is_recording(): if F.is_recording():
emb = F.attach_grad(emb) emb = F.attach_grad(emb)
...@@ -215,18 +195,6 @@ class NodeEmbedding: # NodeEmbedding ...@@ -215,18 +195,6 @@ class NodeEmbedding: # NodeEmbedding
""" """
return self._store return self._store
@property
def comm(self):
"""Return dgl.cuda.nccl.Communicator for data
sharing across processes.
Returns
-------
dgl.cuda.nccl.Communicator
Communicator used for data sharing.
"""
return self._comm
@property @property
def partition(self): def partition(self):
"""Return the partition identifying how the tensor is split across """Return the partition identifying how the tensor is split across
...@@ -361,7 +329,8 @@ class NodeEmbedding: # NodeEmbedding ...@@ -361,7 +329,8 @@ class NodeEmbedding: # NodeEmbedding
if self._partition: if self._partition:
idxs = F.copy_to( idxs = F.copy_to(
self._partition.get_local_indices( self._partition.get_local_indices(
self._comm.rank(), ctx=F.context(self._tensor) max(self._rank, 0),
ctx=F.context(self._tensor),
), ),
F.context(values), F.context(values),
) )
......
...@@ -63,7 +63,6 @@ class SparseGradOptimizer(abc.ABC): ...@@ -63,7 +63,6 @@ class SparseGradOptimizer(abc.ABC):
), "MultiGPU world_size for each embedding should be same." ), "MultiGPU world_size for each embedding should be same."
assert not self._rank is None assert not self._rank is None
assert not self._world_size is None assert not self._world_size is None
self._nccl_root_id = "SparseGradOptimizer.nccl_root_id"
def step(self): def step(self):
"""The step function. """The step function.
...@@ -74,7 +73,7 @@ class SparseGradOptimizer(abc.ABC): ...@@ -74,7 +73,7 @@ class SparseGradOptimizer(abc.ABC):
if self._first_step: if self._first_step:
for emb in self._params: for emb in self._params:
for _, data in emb._trace: for _, data in emb._trace:
if data.grad.data.device.type == "cuda": if data.grad.device.type == "cuda":
# create a communicator # create a communicator
if self._device: if self._device:
assert ( assert (
...@@ -116,27 +115,7 @@ class SparseGradOptimizer(abc.ABC): ...@@ -116,27 +115,7 @@ class SparseGradOptimizer(abc.ABC):
""" """
def _comm_setup(self): def _comm_setup(self):
# find a store to communicate the unique id through self._comm = True
if len(self._params) > 0:
store = self._params[0].store
if self._rank < 0:
self._comm = nccl.Communicator(1, 0, nccl.UniqueId())
else:
th.cuda.set_device(self._device)
if self._rank == 0:
# root process broadcasts nccl id
nccl_id = nccl.UniqueId()
uid = str(nccl_id)
store.set(self._nccl_root_id, uid)
else:
uid = store.get(self._nccl_root_id)
nccl_id = nccl.UniqueId(uid)
# needs to be set for nccl to work
self._comm = nccl.Communicator(
self._world_size, self._rank, nccl_id
)
th.distributed.barrier()
def _shared_setup(self): def _shared_setup(self):
for emb in self._params: for emb in self._params:
...@@ -162,7 +141,6 @@ class SparseGradOptimizer(abc.ABC): ...@@ -162,7 +141,6 @@ class SparseGradOptimizer(abc.ABC):
self._opt_meta[emb_name] = opt_meta self._opt_meta[emb_name] = opt_meta
def _comm_step(self): def _comm_step(self):
comm = self._comm
with th.no_grad(): with th.no_grad():
idx_in = {} idx_in = {}
grad_in = {} grad_in = {}
...@@ -203,7 +181,7 @@ class SparseGradOptimizer(abc.ABC): ...@@ -203,7 +181,7 @@ class SparseGradOptimizer(abc.ABC):
( (
idx_in[emb_name], idx_in[emb_name],
grad_in[emb_name], grad_in[emb_name],
) = comm.sparse_all_to_all_push(idx, grad, partition=partition) ) = nccl.sparse_all_to_all_push(idx, grad, partition=partition)
if emb.partition: if emb.partition:
# if the embedding is partitioned, map back to indexes # if the embedding is partitioned, map back to indexes
# into the local tensor # into the local tensor
......
...@@ -592,5 +592,44 @@ class NDArrayPartition(object): ...@@ -592,5 +592,44 @@ class NDArrayPartition(object):
) )
) )
def generate_permutation(self, idxs):
"""Produce a scheme that maps the given indices to separate partitions
and the counts of how many indices are in each partition.
Parameters
----------
idxs : torch.Tensor
A tensor with shape (`num_indices`,), representing global indices.
Returns
-------
torch.Tensor
A tensor with shape (`num_indices`,), representing the permutation
to re-order the indices by partition.
torch.Tensor
A tensor with shape (`num_partition`,), representing the number of
indices per partition.
Examples
--------
>>> import torch
>>> from dgl.partition import NDArrayPartition
>>> part = NDArrayPartition(10, 2, mode="remainder")
>>> idx = torch.tensor([0, 2, 4, 5, 8, 8, 9], device="cuda:0")
>>> perm, splits_sum = part.generate_permutation(idx)
>>> perm
tensor([0, 1, 2, 4, 5, 3, 6], device='cuda:0')
>>> splits_sum
tensor([5, 2], device='cuda:0')
"""
ret = _CAPI_DGLNDArrayPartitionGeneratePermutation(
self._partition, F.zerocopy_to_dgl_ndarray(idxs)
)
return F.zerocopy_from_dgl_ndarray(ret(0)), F.zerocopy_from_dgl_ndarray(
ret(1)
)
_init_api("dgl.partition") _init_api("dgl.partition")
/** /**
* Copyright (c) 2018 by Contributors * Copyright (c) 2018 by Contributors
* @file c_runtime_api.cc * @file c_api_common.cc
* @brief DGL C API common implementations * @brief DGL C API common implementations
*/ */
#include "c_api_common.h" #include "c_api_common.h"
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#include <memory> #include <memory>
#include <utility> #include <utility>
#include "../c_api_common.h"
#include "partition_op.h" #include "partition_op.h"
using namespace dgl::runtime; using namespace dgl::runtime;
...@@ -251,5 +252,15 @@ DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionMapToGlobal") ...@@ -251,5 +252,15 @@ DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionMapToGlobal")
*rv = part->MapToGlobal(idxs, part_id); *rv = part->MapToGlobal(idxs, part_id);
}); });
DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionGeneratePermutation")
.set_body([](DGLArgs args, DGLRetValue* rv) {
NDArrayPartitionRef part = args[0];
IdArray idxs = args[1];
std::pair<IdArray, NDArray> part_perm = part->GeneratePermutation(idxs);
*rv =
ConvertNDArrayVectorToPackedFunc({part_perm.first, part_perm.second});
});
} // namespace partition } // namespace partition
} // namespace dgl } // namespace dgl
/**
* Copyright (c) 2021-2022 by Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* @file nccl_api.h
* @brief Wrapper around NCCL routines.
*/
#ifndef DGL_RUNTIME_CUDA_NCCL_API_H_
#define DGL_RUNTIME_CUDA_NCCL_API_H_
#ifdef DGL_USE_NCCL
#include "nccl.h"
#else
// if not compiling with NCCL, this class will only support communicators of
// size 1.
#define NCCL_UNIQUE_ID_BYTES 128
typedef struct {
char internal[NCCL_UNIQUE_ID_BYTES];
} ncclUniqueId;
typedef int ncclComm_t;
#endif
#include <dgl/runtime/object.h>
#include <string>
namespace dgl {
namespace runtime {
namespace cuda {
class NCCLUniqueId : public runtime::Object {
public:
NCCLUniqueId();
static constexpr const char* _type_key = "cuda.NCCLUniqueId";
DGL_DECLARE_OBJECT_TYPE_INFO(NCCLUniqueId, Object);
ncclUniqueId Get() const;
std::string ToString() const;
void FromString(const std::string& str);
private:
ncclUniqueId id_;
};
DGL_DEFINE_OBJECT_REF(NCCLUniqueIdRef, NCCLUniqueId);
class NCCLCommunicator : public runtime::Object {
public:
NCCLCommunicator(int size, int rank, ncclUniqueId id);
~NCCLCommunicator();
// disable copying
NCCLCommunicator(const NCCLCommunicator& other) = delete;
NCCLCommunicator& operator=(const NCCLCommunicator& other);
ncclComm_t Get();
/**
* @brief Perform an all-to-all communication.
*
* @param send The continuous array of data to send.
* @param recv The continuous array of data to receive.
* @param count The size of data to send to each rank.
* @param stream The stream to operate on.
*/
template <typename IdType>
void AllToAll(
const IdType* send, IdType* recv, int64_t count, cudaStream_t stream);
/**
* @brief Perform an all-to-all variable sized communication.
*
* @tparam DType The type of value to send.
* @param send The arrays of data to send.
* @param send_prefix The prefix of each array to send.
* @param recv The arrays of data to receive.
* @param recv_prefix The prefix of each array to receive.
* @param type The type of data to send.
* @param stream The stream to operate on.
*/
template <typename DType>
void AllToAllV(
const DType* const send, const int64_t* send_prefix, DType* const recv,
const int64_t* recv_prefix, cudaStream_t stream);
/**
* @brief Perform an all-to-all with sparse data (idx and value pairs). By
* necessity, the sizes of each message are variable.
*
* @tparam IdType The type of index.
* @tparam DType The type of value.
* @param send_idx The set of indexes to send on the device.
* @param send_value The set of values to send on the device.
* @param num_feat The number of values per index.
* @param send_prefix The exclusive prefix sum of elements to send on the
* host.
* @param recv_idx The set of indexes to receive on the device.
* @param recv_value The set of values to receive on the device.
* @param recv_prefix The exclusive prefix sum of the number of elements to
* receive on the host.
* @param stream The stream to communicate on.
*/
template <typename IdType, typename DType>
void SparseAllToAll(
const IdType* send_idx, const DType* send_value, const int64_t num_feat,
const int64_t* send_prefix, IdType* recv_idx, DType* recv_value,
const int64_t* recv_prefix, cudaStream_t stream);
int size() const;
int rank() const;
static constexpr const char* _type_key = "cuda.NCCLCommunicator";
DGL_DECLARE_OBJECT_TYPE_INFO(NCCLCommunicator, Object);
private:
ncclComm_t comm_;
int size_;
int rank_;
};
DGL_DEFINE_OBJECT_REF(NCCLCommunicatorRef, NCCLCommunicator);
} // namespace cuda
} // namespace runtime
} // namespace dgl
#endif // DGL_RUNTIME_CUDA_NCCL_API_H_
...@@ -24,11 +24,13 @@ def test_get_node_partition_from_book(idtype): ...@@ -24,11 +24,13 @@ def test_get_node_partition_from_book(idtype):
assert partition.num_parts() == 3 assert partition.num_parts() == 3
assert partition.array_size() == 11 assert partition.array_size() == 11
# Test map_to_local
test_ids = F.copy_to(F.tensor([0, 2, 6, 7, 10], dtype=idtype), F.ctx()) test_ids = F.copy_to(F.tensor([0, 2, 6, 7, 10], dtype=idtype), F.ctx())
act_ids = partition.map_to_local(test_ids) act_ids = partition.map_to_local(test_ids)
exp_ids = F.copy_to(F.tensor([0, 2, 0, 1, 4], dtype=idtype), F.ctx()) exp_ids = F.copy_to(F.tensor([0, 2, 0, 1, 4], dtype=idtype), F.ctx())
assert F.array_equal(act_ids, exp_ids) assert F.array_equal(act_ids, exp_ids)
# Test map_to_global
test_ids = F.copy_to(F.tensor([0, 2], dtype=idtype), F.ctx()) test_ids = F.copy_to(F.tensor([0, 2], dtype=idtype), F.ctx())
act_ids = partition.map_to_global(test_ids, 0) act_ids = partition.map_to_global(test_ids, 0)
exp_ids = F.copy_to(F.tensor([0, 2], dtype=idtype), F.ctx()) exp_ids = F.copy_to(F.tensor([0, 2], dtype=idtype), F.ctx())
...@@ -43,3 +45,11 @@ def test_get_node_partition_from_book(idtype): ...@@ -43,3 +45,11 @@ def test_get_node_partition_from_book(idtype):
act_ids = partition.map_to_global(test_ids, 2) act_ids = partition.map_to_global(test_ids, 2)
exp_ids = F.copy_to(F.tensor([6, 7, 10], dtype=idtype), F.ctx()) exp_ids = F.copy_to(F.tensor([6, 7, 10], dtype=idtype), F.ctx())
assert F.array_equal(act_ids, exp_ids) assert F.array_equal(act_ids, exp_ids)
# Test generate_permutation
test_ids = F.copy_to(F.tensor([6, 0, 7, 2, 10], dtype=idtype), F.ctx())
perm, split_sum = partition.generate_permutation(test_ids)
exp_perm = F.copy_to(F.tensor([1, 3, 0, 2, 4], dtype=idtype), F.ctx())
exp_sum = F.copy_to(F.tensor([2, 0, 3]), F.ctx())
assert F.array_equal(perm, exp_perm)
assert F.array_equal(split_sum, exp_sum)
import unittest import unittest
import backend as F import backend as F
import torch
import torch.distributed as dist
from dgl.cuda import nccl from dgl.cuda import nccl
from dgl.partition import NDArrayPartition from dgl.partition import NDArrayPartition
def gen_test_id():
return "{:0256x}".format(78236728318467363)
@unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU."
)
def test_nccl_id():
nccl_id = nccl.UniqueId()
text = str(nccl_id)
nccl_id2 = nccl.UniqueId(id_str=text)
assert nccl_id == nccl_id2
nccl_id2 = nccl.UniqueId(gen_test_id())
assert nccl_id2 != nccl_id
nccl_id3 = nccl.UniqueId(str(nccl_id2))
assert nccl_id2 == nccl_id3
@unittest.skipIf( @unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU." F._default_context_str == "cpu", reason="NCCL only runs on GPU."
) )
def test_nccl_sparse_push_single_remainder(): def test_nccl_sparse_push_single_remainder():
nccl_id = nccl.UniqueId() torch.cuda.set_device("cuda:0")
comm = nccl.Communicator(1, 0, nccl_id) dist.init_process_group(
backend="nccl",
init_method="tcp://127.0.0.1:12345",
world_size=1,
rank=0,
)
index = F.randint([10000], F.int32, F.ctx(), 0, 10000) index = F.randint([10000], F.int32, F.ctx(), 0, 10000)
value = F.uniform([10000, 100], F.float32, F.ctx(), -1.0, 1.0) value = F.uniform([10000, 100], F.float32, F.ctx(), -1.0, 1.0)
part = NDArrayPartition(10000, 1, "remainder") part = NDArrayPartition(10000, 1, "remainder")
ri, rv = comm.sparse_all_to_all_push(index, value, part) ri, rv = nccl.sparse_all_to_all_push(index, value, part)
assert F.array_equal(ri, index) assert F.array_equal(ri, index)
assert F.array_equal(rv, value) assert F.array_equal(rv, value)
dist.destroy_process_group()
@unittest.skipIf( @unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU." F._default_context_str == "cpu", reason="NCCL only runs on GPU."
) )
def test_nccl_sparse_pull_single_remainder(): def test_nccl_sparse_pull_single_remainder():
nccl_id = nccl.UniqueId() torch.cuda.set_device("cuda:0")
comm = nccl.Communicator(1, 0, nccl_id) dist.init_process_group(
backend="nccl",
init_method="tcp://127.0.0.1:12345",
world_size=1,
rank=0,
)
req_index = F.randint([10000], F.int64, F.ctx(), 0, 100000) req_index = F.randint([10000], F.int64, F.ctx(), 0, 100000)
value = F.uniform([100000, 100], F.float32, F.ctx(), -1.0, 1.0) value = F.uniform([100000, 100], F.float32, F.ctx(), -1.0, 1.0)
part = NDArrayPartition(100000, 1, "remainder") part = NDArrayPartition(100000, 1, "remainder")
rv = comm.sparse_all_to_all_pull(req_index, value, part) rv = nccl.sparse_all_to_all_pull(req_index, value, part)
exp_rv = F.gather_row(value, req_index) exp_rv = F.gather_row(value, req_index)
assert F.array_equal(rv, exp_rv) assert F.array_equal(rv, exp_rv)
dist.destroy_process_group()
@unittest.skipIf( @unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU." F._default_context_str == "cpu", reason="NCCL only runs on GPU."
) )
def test_nccl_sparse_push_single_range(): def test_nccl_sparse_push_single_range():
nccl_id = nccl.UniqueId() torch.cuda.set_device("cuda:0")
comm = nccl.Communicator(1, 0, nccl_id) dist.init_process_group(
backend="nccl",
init_method="tcp://127.0.0.1:12345",
world_size=1,
rank=0,
)
index = F.randint([10000], F.int32, F.ctx(), 0, 10000) index = F.randint([10000], F.int32, F.ctx(), 0, 10000)
value = F.uniform([10000, 100], F.float32, F.ctx(), -1.0, 1.0) value = F.uniform([10000, 100], F.float32, F.ctx(), -1.0, 1.0)
...@@ -78,17 +76,24 @@ def test_nccl_sparse_push_single_range(): ...@@ -78,17 +76,24 @@ def test_nccl_sparse_push_single_range():
) )
part = NDArrayPartition(10000, 1, "range", part_ranges=part_ranges) part = NDArrayPartition(10000, 1, "range", part_ranges=part_ranges)
ri, rv = comm.sparse_all_to_all_push(index, value, part) ri, rv = nccl.sparse_all_to_all_push(index, value, part)
assert F.array_equal(ri, index) assert F.array_equal(ri, index)
assert F.array_equal(rv, value) assert F.array_equal(rv, value)
dist.destroy_process_group()
@unittest.skipIf( @unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU." F._default_context_str == "cpu", reason="NCCL only runs on GPU."
) )
def test_nccl_sparse_pull_single_range(): def test_nccl_sparse_pull_single_range():
nccl_id = nccl.UniqueId() torch.cuda.set_device("cuda:0")
comm = nccl.Communicator(1, 0, nccl_id) dist.init_process_group(
backend="nccl",
init_method="tcp://127.0.0.1:12345",
world_size=1,
rank=0,
)
req_index = F.randint([10000], F.int64, F.ctx(), 0, 100000) req_index = F.randint([10000], F.int64, F.ctx(), 0, 100000)
value = F.uniform([100000, 100], F.float32, F.ctx(), -1.0, 1.0) value = F.uniform([100000, 100], F.float32, F.ctx(), -1.0, 1.0)
...@@ -98,21 +103,15 @@ def test_nccl_sparse_pull_single_range(): ...@@ -98,21 +103,15 @@ def test_nccl_sparse_pull_single_range():
) )
part = NDArrayPartition(100000, 1, "range", part_ranges=part_ranges) part = NDArrayPartition(100000, 1, "range", part_ranges=part_ranges)
rv = comm.sparse_all_to_all_pull(req_index, value, part) rv = nccl.sparse_all_to_all_pull(req_index, value, part)
exp_rv = F.gather_row(value, req_index) exp_rv = F.gather_row(value, req_index)
assert F.array_equal(rv, exp_rv) assert F.array_equal(rv, exp_rv)
dist.destroy_process_group()
@unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU."
)
def test_nccl_support():
# this is just a smoke test, as we don't have any other way to know
# if NCCL support is compiled in right now.
nccl.is_supported()
if __name__ == "__main__": if __name__ == "__main__":
test_nccl_id() test_nccl_sparse_push_single_remainder()
test_nccl_sparse_push_single() test_nccl_sparse_pull_single_remainder()
test_nccl_sparse_pull_single() test_nccl_sparse_push_single_range()
test_nccl_sparse_pull_single_range()
...@@ -28,7 +28,7 @@ if [[ $arch == *"x86"* ]]; then ...@@ -28,7 +28,7 @@ if [[ $arch == *"x86"* ]]; then
fi fi
if [[ $1 != "cpu" ]]; then if [[ $1 != "cpu" ]]; then
CMAKE_VARS="-DUSE_CUDA=ON -DUSE_NCCL=ON $CMAKE_VARS" CMAKE_VARS="-DUSE_CUDA=ON $CMAKE_VARS"
fi fi
if [ -d build ]; then if [ -d build ]; then
......
Subproject commit e11238b3029795d33f958b5868d47c90c4f22628