Unverified Commit 8d5d8962 authored by Xin Yao, committed by GitHub

[Refactor] Replace third_party/nccl with PyTorch's NCCL backend (#4989)

* expose GeneratePermutation

* add sparse_all_to_all_push

* add sparse_all_to_all_pull

* add unit test

* handle world_size=1

* remove python nccl wrapper

* remove the nccl dependency

* use pinned memory to speedup D2H copy

* fix lint

* resolve comments

* fix lint

* fix ut

* resolve comments
parent b1ec112e
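At a high level, this change swaps DGL's hand-rolled NCCL wrapper (UniqueId/Communicator) for plain torch.distributed with the NCCL backend. A minimal sketch of the new calling pattern, assuming one process per GPU; num_nodes, global_idxs and global_values are placeholders rather than names from this diff:

import torch.distributed as dist
from dgl.cuda import nccl
from dgl.partition import NDArrayPartition

# Before: uid = nccl.UniqueId(); comm = nccl.Communicator(world_size, rank, uid)
# and then comm.sparse_all_to_all_push(...). Now the default process group is enough.
dist.init_process_group(backend="nccl")
part = NDArrayPartition(num_nodes, dist.get_world_size(), mode="remainder")
owned_idx, owned_val = nccl.sparse_all_to_all_push(global_idxs, global_values, part)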
@@ -22,9 +22,6 @@
[submodule "third_party/nanoflann"]
path = third_party/nanoflann
url = https://github.com/jlblancoc/nanoflann
[submodule "third_party/nccl"]
path = third_party/nccl
url = https://github.com/nvidia/nccl
[submodule "third_party/libxsmm"]
path = third_party/libxsmm
url = https://github.com/hfp/libxsmm.git
......
@@ -23,8 +23,6 @@ endif()
# and add set(OPTION VALUE) to override these build options.
# Alternatively, use cmake -DOPTION=VALUE through command-line.
dgl_option(USE_CUDA "Build with CUDA" OFF)
dgl_option(USE_NCCL "Build with NCCL support" OFF)
dgl_option(USE_SYSTEM_NCCL "Build using system's NCCL library" OFF)
dgl_option(USE_OPENMP "Build with OpenMP" ON)
dgl_option(USE_AVX "Build with AVX optimization" OFF)
dgl_option(USE_LIBXSMM "Build with LIBXSMM library optimization" ON)
@@ -171,25 +169,7 @@ list(APPEND DGL_SRC ${DGL_RPC_SRC})
if(USE_CUDA)
dgl_config_cuda(DGL_CUDA_SRC)
list(APPEND DGL_SRC ${DGL_CUDA_SRC})
if(USE_NCCL)
add_definitions(-DDGL_USE_NCCL)
if (USE_SYSTEM_NCCL)
include(cmake/util/FindNccl.cmake)
include_directories(${NCCL_INCLUDE_DIR})
else()
include(cmake/modules/NCCL.cmake)
cuda_include_directories(BEFORE ${NCCL_INCLUDE_DIR})
endif()
endif(USE_NCCL)
list(APPEND DGL_LINKER_LIBS ${NCCL_LIBRARY})
endif(USE_CUDA)
if(USE_CUDA)
cuda_add_library(dgl SHARED ${DGL_SRC})
if (USE_NCCL AND NOT USE_SYSTEM_NCCL)
add_dependencies(dgl nccl_external)
endif()
else(USE_CUDA)
add_library(dgl SHARED ${DGL_SRC})
endif(USE_CUDA)
......
include(ExternalProject)
# set path to submodule
set(NCCL_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/nccl")
# NCCL doesn't have CMAKE, so build externally
ExternalProject_Add(nccl_external
SOURCE_DIR ${PROJECT_SOURCE_DIR}/third_party/nccl
BUILD_IN_SOURCE 1
CONFIGURE_COMMAND ""
BUILD_COMMAND
env
make
"src.build"
"-j"
"BUILDDIR=${NCCL_BUILD_DIR}"
BUILD_BYPRODUCTS "${NCCL_BUILD_DIR}/lib/libnccl_static.a"
INSTALL_COMMAND ""
)
# set output variables
set(NCCL_FOUND TRUE)
set(NCCL_LIBRARY "${NCCL_BUILD_DIR}/lib/libnccl_static.a")
set(NCCL_INCLUDE_DIR "${NCCL_BUILD_DIR}/include")
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Tries to find NCCL headers and libraries.
#
# Usage of this module as follows:
#
# find_package(NCCL)
#
# Variables used by this module, they can change the default behaviour and need
# to be set before calling find_package:
#
# NCCL_ROOT - When set, this path is inspected instead of standard library
# locations as the root of the NCCL installation.
# The environment variable NCCL_ROOT overrides this variable.
#
# This module defines
# Nccl_FOUND, whether nccl has been found
# NCCL_INCLUDE_DIR, directory containing header
# NCCL_LIBRARY, directory containing nccl library
# NCCL_LIB_NAME, nccl library name
# USE_NCCL_LIB_PATH, when set, NCCL_LIBRARY path is also inspected for the
# location of the nccl library. This would disable
# switching between static and shared.
#
# This module assumes that the user has already called find_package(CUDA)
#
# This file is from https://github.com/dmlc/xgboost, with modifications to
# check the version.
if (NCCL_LIBRARY)
if(NOT USE_NCCL_LIB_PATH)
# Don't cache NCCL_LIBRARY to enable switching between static and shared.
unset(NCCL_LIBRARY CACHE)
endif(NOT USE_NCCL_LIB_PATH)
endif()
if (BUILD_WITH_SHARED_NCCL)
# libnccl.so
set(NCCL_LIB_NAME nccl)
else ()
# libnccl_static.a
set(NCCL_LIB_NAME nccl_static)
endif (BUILD_WITH_SHARED_NCCL)
find_path(NCCL_INCLUDE_DIR
NAMES nccl.h
PATHS $ENV{NCCL_ROOT}/include ${NCCL_ROOT}/include)
# make sure it has point to point support
file(STRINGS "${NCCL_INCLUDE_DIR}/nccl.h" NCCL_VERSION_CODE REGEX "^#define[ \t]+NCCL_VERSION_CODE[ \t]+[0-9]+.*$" LIMIT_COUNT 1)
string(REGEX REPLACE "^.*NCCL_VERSION_CODE[ \t]+([0-9]+).*$" "\\1" NCCL_VERSION "${NCCL_VERSION_CODE}")
find_library(NCCL_LIBRARY
NAMES ${NCCL_LIB_NAME}
PATHS $ENV{NCCL_ROOT}/lib/ ${NCCL_ROOT}/lib)
if ("${NCCL_VERSION}" LESS "2700")
message(FATAL_ERROR "Require nccl >= 2700, but found ${NCCL_LIBRARY}==${NCCL_VERSION}")
else()
message(STATUS "Using nccl library: ${NCCL_LIBRARY} ${NCCL_VERSION}")
endif()
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Nccl DEFAULT_MSG
NCCL_INCLUDE_DIR NCCL_LIBRARY)
mark_as_advanced(
NCCL_INCLUDE_DIR
NCCL_LIBRARY
)
""" CUDA wrappers """ """ CUDA wrappers """
from . import nccl from .. import backend as F
if F.get_preferred_backend() == "pytorch":
from . import nccl
"""API creating NCCL communicators.""" """API wrapping NCCL primitives."""
from .. import backend as F import torch
from .._ffi.function import _init_api import torch.distributed as dist
_COMM_MODES_MAP = {"remainder": 0}
def sparse_all_to_all_push(idx, value, partition):
"""Perform an all-to-all-v operation, where by all processors send out
a set of indices and corresponding values. Indices and values,
corresponding to the current process, will copied into the output
arrays.
class UniqueId(object): Note: This method requires 'torch.distributed.get_backend() == "nccl"'.
"""Class for allowing python code to create and communicate NCCL Unique
IDs, needed for creating communicators.
"""
def __init__(self, id_str=None): Parameters
"""Create an object reference the current NCCL unique id.""" ----------
if id_str: idx : torch.Tensor
if isinstance(id_str, bytes): The 1D set of indices to send to other processors.
id_str = id_str.decode("utf-8") value : torch.Tensor
self._handle = _CAPI_DGLNCCLUniqueIdFromString(id_str) The multi-dimension set of values to send to other processors.
else: The first dimension must match that of `idx`.
self._handle = _CAPI_DGLNCCLGetUniqueId() partition : NDArrayPartition
The object containing information for assigning indices to
def get(self): processors.
"""Get the C-handle for this object."""
return self._handle
def __str__(self):
return _CAPI_DGLNCCLUniqueIdToString(self._handle)
def __repr__(self):
return "UniqueId[{}]".format(str(self))
def __eq__(self, other):
return str(self) == str(other)
class Communicator(object):
"""High-level wrapper for NCCL communication."""
def __init__(self, size, rank, unique_id):
"""Create a new NCCL communicator.
Parameters
----------
size : int
The number of processes in the communicator.
rank : int
The rank of the current process in the communicator.
unique_id : NCCLUniqueId
The unique id of the root process (rank=0).
Examples
--------
>>> from dgl.cuda.nccl import Communicator, UniqueId
The root process will generate a unique NCCL id and communicate it
to the other processes.
>>> uid = UniqueId()
>>> store.set('nccl_root_id', str(uid))
And all other processes create unique ids from the root processes.
>>> uid = UniqueId(store.get('nccl_root_id'))
Then, all processes should create the communicator.
>>> comm = Communicator(world_size, rank, uid)
"""
assert rank < size, (
"The rank of a process must be less than the "
"size of the communicator."
)
self._handle = _CAPI_DGLNCCLCreateComm(size, rank, unique_id.get())
self._rank = rank
self._size = size
def sparse_all_to_all_push(self, idx, value, partition):
"""Perform an all-to-all-v operation, where by all processors send out
a set of indices and corresponding values. Indices and values,
corresponding to the current process, will copied into the output
arrays.
Parameters
----------
idx : tensor
The 1D set of indices to send to other processors.
value : tensor
The multi-dimension set of values to send to other processors.
The first dimension must match that of `idx`.
partition : NDArrayPartition
The object containing information for assigning indices to
processors.
Returns
-------
tensor
The 1D tensor of the received indices.
tensor
The set of received values.
Examples
--------
To perform a sparse_all_to_all_push(), a partition object must be
provided. A partition of a homogeneous graph, where the vertices are
striped across processes, can be generated via:
>>> from dgl.partition import NDArrayPartition
>>> part = NDArrayPartition(g.num_nodes(), comm.size(), mode='remainder' )
With this partition, each processor can send values to be associated
with vertices in the graph. So if we have an array `global_idxs` of all of
the neighbors updated during mini-batch processing, and an array
`global_values` containing the new values associated with the neighbors,
we communicate them to the owning processes via:
>>> my_idxs, my_values = comm.sparse_all_to_all_push(global_idxs, global_values, part)
This communication pattern is common when communicating gradient
updates for node embeddings.
Indices the current process owns do not need to be treated specially,
as internally they will be copied to the output array. If process 0
has the set of indices '[0, 3, 8, 9, 10]' and process 1 has
'[0, 2, 4, 5, 8, 8, 9]', using a remainder partition will result in
indices for process 0 of '[0, 8, 10, 0, 2, 4, 8, 8]', and for
process 1 of '[3, 9, 5, 9]'.
"""
out_idx, out_value = _CAPI_DGLNCCLSparseAllToAllPush(
self.get(),
F.zerocopy_to_dgl_ndarray(idx),
F.zerocopy_to_dgl_ndarray(value),
partition.get(),
)
return (
F.zerocopy_from_dgl_ndarray(out_idx),
F.zerocopy_from_dgl_ndarray(out_value),
)
def sparse_all_to_all_pull(self, req_idx, value, partition):
"""Perform an all-to-all-v operation, where by all processors request
the values corresponding to their set of indices.
Parameters
----------
req_idx : IdArray
The set of indices this processor is requesting.
value : NDArray
The multi-dimension set of values that can be requested from
this processor.
partition : NDArrayPartition
The object containing information for assigning indices to
processors.
Returns
-------
tensor
The set of received values, corresponding to `req_idx`.
Examples
--------
To perform a sparse_all_to_all_pull(), a partition object must be
provided. A partition of a homogeneous graph, where the vertices are
striped across processes, can be generated via:
>>> from dgl.partition import NDArrayPartition
>>> part = NDArrayPartition(g.num_nodes(), comm.size(), mode='remainder' )
With this partition, each processor can request values/features
associated with vertices in the graph. So in the case where we have
a set of neighbors 'nbr_idxs' we need features for, and each process
has a tensor 'node_feat' storing the features of nodes it owns in
the partition, the features can be requested via:
>>> nbr_values = comm.sparse_all_to_all_pull(nbr_idxs, node_feat, part)
Then the two arrays 'nbr_idxs' and 'nbr_values' form the sparse
set of features, where 'nbr_idxs[i]' is the global node id, and
'nbr_values[i]' is the feature vector for that node. This
communication pattern is useful for node features or node
embeddings.
"""
out_value = _CAPI_DGLNCCLSparseAllToAllPull(
self.get(),
F.zerocopy_to_dgl_ndarray(req_idx),
F.zerocopy_to_dgl_ndarray(value),
partition.get(),
)
return F.zerocopy_from_dgl_ndarray(out_value)
def get(self):
"""Get the C-Handle for this object."""
return self._handle
def rank(self):
"""Get the rank of this process in this communicator.
Returns
-------
int
The rank of this process.
"""
return self._rank
def size(self):
"""Get the size of this communicator.
Returns
-------
int
The number of processes in this communicator.
"""
return self._size
def is_supported():
"""Check if DGL was built with NCCL support.
Returns Returns
------- -------
bool torch.Tensor
True if NCCL support was built in. The 1D tensor of the received indices.
torch.Tensor
The set of received values.
Examples
--------
To perform a sparse_all_to_all_push(), a partition object must be
provided. A partition of a homogeneous graph, where the vertices are
striped across processes, can be generated via:
>>> from dgl.partition import NDArrayPartition
>>> part = NDArrayPartition(g.num_nodes(), world_size, mode='remainder')
With this partition, each processor can send values to be associated
with vertices in the graph. So if we have an array `global_idxs` of all of
the neighbors updated during mini-batch processing, and an array
`global_values` containing the new values associated with the neighbors,
we communicate them to the owning processes via:
>>> my_idxs, my_values = nccl.sparse_all_to_all_push(global_idxs, global_values, part)
This communication pattern is common when communicating gradient
updates for node embeddings.
Indices the current process owns do not need to be treated specially,
as internally they will be copied to the output array. If process 0
has the set of indices '[0, 3, 8, 9, 10]' and process 1 has
'[0, 2, 4, 5, 8, 8, 9]', using a remainder partition will result in
indices for process 0 of '[0, 8, 10, 0, 2, 4, 8, 8]', and for
process 1 of '[3, 9, 5, 9]'.
""" """
return _CAPI_DGLNCCLHasSupport() if not dist.is_initialized() or dist.get_world_size() == 1:
return idx, value
assert (
dist.get_backend() == "nccl"
), "requires NCCL backend to communicate CUDA tensors."
perm, send_splits = partition.generate_permutation(idx)
perm = perm.long()
# Get receive splits.
recv_splits = torch.empty_like(send_splits)
dist.all_to_all_single(recv_splits, send_splits)
# Use pinned memory to speedup D2H copy.
recv_splits = recv_splits.to("cpu", non_blocking=True)
send_splits = send_splits.to("cpu", non_blocking=True)
send_idx = idx[perm]
send_value = value[perm]
# Wait D2H copy finish.
torch.cuda.current_stream().synchronize()
recv_sum = recv_splits.sum()
recv_splits = recv_splits.tolist()
send_splits = send_splits.tolist()
# Send idx.
recv_idx = torch.empty((recv_sum,), dtype=idx.dtype, device=idx.device)
dist.all_to_all_single(recv_idx, send_idx, recv_splits, send_splits)
# Send value.
recv_value = torch.empty(
(recv_sum, *value.shape[1:]), dtype=value.dtype, device=value.device
)
dist.all_to_all_single(recv_value, send_value, recv_splits, send_splits)
return recv_idx, recv_value
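For reference, the docstring example above can be run end to end on two GPUs. The sketch below is illustrative: the TCP port and the run/spawn scaffolding are not part of this diff, only the nccl and NDArrayPartition calls are.

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from dgl.cuda import nccl
from dgl.partition import NDArrayPartition

def run(rank, world_size):
    torch.cuda.set_device(rank)
    dist.init_process_group(
        backend="nccl",
        init_method="tcp://127.0.0.1:29500",
        world_size=world_size,
        rank=rank,
    )
    idx = [torch.tensor([0, 3, 8, 9, 10]),
           torch.tensor([0, 2, 4, 5, 8, 8, 9])][rank].cuda()
    value = idx.float().unsqueeze(1)  # one feature column per index
    part = NDArrayPartition(11, world_size, mode="remainder")
    recv_idx, recv_value = nccl.sparse_all_to_all_push(idx, value, part)
    # rank 0 receives the even ids: [0, 8, 10, 0, 2, 4, 8, 8]
    # rank 1 receives the odd ids:  [3, 9, 5, 9]
    print(rank, recv_idx.tolist())
    dist.destroy_process_group()

if __name__ == "__main__":
    mp.spawn(run, args=(2,), nprocs=2)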
def sparse_all_to_all_pull(req_idx, value, partition):
"""Perform an all-to-all-v operation, where by all processors request
the values corresponding to their set of indices.
Note: This method requires 'torch.distributed.get_backend() == "nccl"'.
Parameters
----------
req_idx : torch.Tensor
The set of indices this processor is requesting.
value : torch.Tensor
The multi-dimension set of values that can be requested from
this processor.
partition : NDArrayPartition
The object containing information for assigning indices to
processors.
Returns
-------
torch.Tensor
The set of received values, corresponding to `req_idx`.
Examples
--------
To perform a sparse_all_to_all_pull(), a partition object must be
provided. A partition of a homogeneous graph, where the vertices are
striped across processes, can be generated via:
_init_api("dgl.cuda.nccl") >>> from dgl.partition import NDArrayPartition
>>> part = NDArrayPartition(g.num_nodes(), world_size, mode='remainder')
With this partition, each processor can request values/features
associated with vertices in the graph. So in the case where we have
a set of neighbors 'nbr_idxs' we need features for, and each process
has a tensor 'node_feat' storing the features of nodes it owns in
the partition, the features can be requested via:
>>> nbr_values = nccl.sparse_all_to_all_pull(nbr_idxs, node_feat, part)
Then the two arrays 'nbr_idxs' and 'nbr_values' form the sparse
set of features, where 'nbr_idxs[i]' is the global node id, and
'nbr_values[i]' is the feature vector for that node. This
communication pattern is useful for node features or node
embeddings.
"""
if not dist.is_initialized() or dist.get_world_size() == 1:
return value[req_idx.long()]
assert (
dist.get_backend() == "nccl"
), "requires NCCL backend to communicate CUDA tensors."
perm, req_splits = partition.generate_permutation(req_idx)
perm = perm.long()
# Get response splits.
resp_splits = torch.empty_like(req_splits)
dist.all_to_all_single(resp_splits, req_splits)
# Use pinned memory to speedup D2H copy.
resp_splits = resp_splits.to("cpu", non_blocking=True)
req_splits = req_splits.to("cpu", non_blocking=True)
req_idx = req_idx[perm]
# Wait D2H copy finish.
torch.cuda.current_stream().synchronize()
resp_sum = resp_splits.sum()
resp_splits = resp_splits.tolist()
req_splits = req_splits.tolist()
# Gather requested indices.
resp_idx = torch.empty(
(resp_sum,), dtype=req_idx.dtype, device=req_idx.device
)
dist.all_to_all_single(resp_idx, req_idx, resp_splits, req_splits)
# Convert requested indices to local indices depending on partition.
if resp_sum > 0:
resp_idx = partition.map_to_local(resp_idx)
# Collect the request value.
req_value = torch.empty(
(req_idx.size(0), *value.shape[1:]),
dtype=value.dtype,
device=value.device,
)
dist.all_to_all_single(req_value, value[resp_idx], req_splits, resp_splits)
# Permute the value back into the requested order.
return_value = torch.empty_like(req_value)
return_value[perm] = req_value
return return_value
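When torch.distributed is not initialized (or the world size is 1), both functions above fall back to purely local indexing, so the same call sites work unchanged in single-GPU scripts. A quick single-process sanity check (illustrative):

import torch
from dgl.cuda import nccl
from dgl.partition import NDArrayPartition

feat = torch.randn(100, 16, device="cuda")
part = NDArrayPartition(100, 1, mode="remainder")
req = torch.randint(0, 100, (32,), device="cuda")

# With no process group, pull reduces to a local gather ...
assert torch.equal(nccl.sparse_all_to_all_pull(req, feat, part), feat[req])
# ... and push simply echoes its inputs back.
idx_out, val_out = nccl.sparse_all_to_all_push(req, feat[req], part)
assert torch.equal(idx_out, req) and torch.equal(val_out, feat[req])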
...@@ -9,7 +9,6 @@ from ...partition import NDArrayPartition ...@@ -9,7 +9,6 @@ from ...partition import NDArrayPartition
from ...utils import create_shared_mem_array, get_shared_mem_array from ...utils import create_shared_mem_array, get_shared_mem_array
_STORE = None _STORE = None
_COMM = None
class NodeEmbedding: # NodeEmbedding class NodeEmbedding: # NodeEmbedding
...@@ -78,7 +77,6 @@ class NodeEmbedding: # NodeEmbedding ...@@ -78,7 +77,6 @@ class NodeEmbedding: # NodeEmbedding
partition=None, partition=None,
): ):
global _STORE global _STORE
global _COMM
if device is None: if device is None:
device = th.device("cpu") device = th.device("cpu")
...@@ -132,25 +130,7 @@ class NodeEmbedding: # NodeEmbedding ...@@ -132,25 +130,7 @@ class NodeEmbedding: # NodeEmbedding
) )
self._tensor = emb self._tensor = emb
else: # embeddings is stored in GPU memory. else: # embeddings is stored in GPU memory.
# setup nccl communicator self._comm = True
if _COMM is None:
if rank < 0:
_COMM = nccl.Communicator(1, 0, nccl.UniqueId())
else:
# needs to be set for nccl to work
th.cuda.set_device(device)
if rank == 0:
# root process broadcasts nccl id
nccl_id = nccl.UniqueId()
self._store.set("nccl_root_id_sparse_emb", str(nccl_id))
else:
nccl_id = nccl.UniqueId(
self._store.get("nccl_root_id_sparse_emb")
)
_COMM = nccl.Communicator(
self._world_size, self._rank, nccl_id
)
self._comm = _COMM
if not self._partition: if not self._partition:
# for communication we need a partition # for communication we need a partition
...@@ -161,7 +141,7 @@ class NodeEmbedding: # NodeEmbedding ...@@ -161,7 +141,7 @@ class NodeEmbedding: # NodeEmbedding
) )
# create local tensors for the weights # create local tensors for the weights
local_size = self._partition.local_size(self._comm.rank()) local_size = self._partition.local_size(max(self._rank, 0))
# TODO(dlasalle): support 16-bit/half embeddings # TODO(dlasalle): support 16-bit/half embeddings
emb = th.empty( emb = th.empty(
...@@ -187,15 +167,15 @@ class NodeEmbedding: # NodeEmbedding ...@@ -187,15 +167,15 @@ class NodeEmbedding: # NodeEmbedding
device : th.device device : th.device
Target device to put the collected embeddings. Target device to put the collected embeddings.
""" """
if not self._comm or self._comm.size() == 1: if not self._comm:
# embeddings are stored on the CPU
emb = self._tensor[node_ids].to(device) emb = self._tensor[node_ids].to(device)
else: else:
if self.world_size > 0: # embeddings are stored on the GPU
emb = self._comm.sparse_all_to_all_pull( # the following method also covers self._world_size = 0 or 1
node_ids, self._tensor, self._partition emb = nccl.sparse_all_to_all_pull(
) node_ids, self._tensor, self._partition
else: )
emb = self._tensor[node_ids]
emb = emb.to(device) emb = emb.to(device)
if F.is_recording(): if F.is_recording():
emb = F.attach_grad(emb) emb = F.attach_grad(emb)
...@@ -215,18 +195,6 @@ class NodeEmbedding: # NodeEmbedding ...@@ -215,18 +195,6 @@ class NodeEmbedding: # NodeEmbedding
""" """
return self._store return self._store
@property
def comm(self):
"""Return dgl.cuda.nccl.Communicator for data
sharing across processes.
Returns
-------
dgl.cuda.nccl.Communicator
Communicator used for data sharing.
"""
return self._comm
@property @property
def partition(self): def partition(self):
"""Return the partition identifying how the tensor is split across """Return the partition identifying how the tensor is split across
...@@ -361,7 +329,8 @@ class NodeEmbedding: # NodeEmbedding ...@@ -361,7 +329,8 @@ class NodeEmbedding: # NodeEmbedding
if self._partition: if self._partition:
idxs = F.copy_to( idxs = F.copy_to(
self._partition.get_local_indices( self._partition.get_local_indices(
self._comm.rank(), ctx=F.context(self._tensor) max(self._rank, 0),
ctx=F.context(self._tensor),
), ),
F.context(values), F.context(values),
) )
......
...@@ -63,7 +63,6 @@ class SparseGradOptimizer(abc.ABC): ...@@ -63,7 +63,6 @@ class SparseGradOptimizer(abc.ABC):
), "MultiGPU world_size for each embedding should be same." ), "MultiGPU world_size for each embedding should be same."
assert not self._rank is None assert not self._rank is None
assert not self._world_size is None assert not self._world_size is None
self._nccl_root_id = "SparseGradOptimizer.nccl_root_id"
def step(self): def step(self):
"""The step function. """The step function.
...@@ -74,7 +73,7 @@ class SparseGradOptimizer(abc.ABC): ...@@ -74,7 +73,7 @@ class SparseGradOptimizer(abc.ABC):
if self._first_step: if self._first_step:
for emb in self._params: for emb in self._params:
for _, data in emb._trace: for _, data in emb._trace:
if data.grad.data.device.type == "cuda": if data.grad.device.type == "cuda":
# create a communicator # create a communicator
if self._device: if self._device:
assert ( assert (
...@@ -116,27 +115,7 @@ class SparseGradOptimizer(abc.ABC): ...@@ -116,27 +115,7 @@ class SparseGradOptimizer(abc.ABC):
""" """
def _comm_setup(self): def _comm_setup(self):
# find a store to communicate the unique id through self._comm = True
if len(self._params) > 0:
store = self._params[0].store
if self._rank < 0:
self._comm = nccl.Communicator(1, 0, nccl.UniqueId())
else:
th.cuda.set_device(self._device)
if self._rank == 0:
# root process broadcasts nccl id
nccl_id = nccl.UniqueId()
uid = str(nccl_id)
store.set(self._nccl_root_id, uid)
else:
uid = store.get(self._nccl_root_id)
nccl_id = nccl.UniqueId(uid)
# needs to be set for nccl to work
self._comm = nccl.Communicator(
self._world_size, self._rank, nccl_id
)
th.distributed.barrier()
def _shared_setup(self): def _shared_setup(self):
for emb in self._params: for emb in self._params:
...@@ -162,7 +141,6 @@ class SparseGradOptimizer(abc.ABC): ...@@ -162,7 +141,6 @@ class SparseGradOptimizer(abc.ABC):
self._opt_meta[emb_name] = opt_meta self._opt_meta[emb_name] = opt_meta
def _comm_step(self): def _comm_step(self):
comm = self._comm
with th.no_grad(): with th.no_grad():
idx_in = {} idx_in = {}
grad_in = {} grad_in = {}
...@@ -203,7 +181,7 @@ class SparseGradOptimizer(abc.ABC): ...@@ -203,7 +181,7 @@ class SparseGradOptimizer(abc.ABC):
( (
idx_in[emb_name], idx_in[emb_name],
grad_in[emb_name], grad_in[emb_name],
) = comm.sparse_all_to_all_push(idx, grad, partition=partition) ) = nccl.sparse_all_to_all_push(idx, grad, partition=partition)
if emb.partition: if emb.partition:
# if the embedding is partitioned, map back to indexes # if the embedding is partitioned, map back to indexes
# into the local tensor # into the local tensor
......
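The optimizer side follows the same pattern: gradients recorded during backward are pushed to the rank that owns each embedding row, duplicates are summed, and the update is applied locally. A single-process sketch of that flow, with a plain SGD step standing in for the real optimizer update (local_shard, lr and the sample numbers are illustrative, not optimizer code from this diff):

import torch
from dgl.cuda import nccl
from dgl.partition import NDArrayPartition

num_rows, dim, lr = 10, 4, 0.1
local_shard = torch.zeros(num_rows, dim, device="cuda")  # this rank's embedding rows
partition = NDArrayPartition(num_rows, 1, mode="remainder")

idx = torch.tensor([1, 3, 3, 7], device="cuda")   # rows touched in the mini-batch
grad = torch.ones(4, dim, device="cuda")          # their gradient rows

own_idx, own_grad = nccl.sparse_all_to_all_push(idx, grad, partition=partition)
own_idx = partition.map_to_local(own_idx).long()  # owner's local index space
agg = torch.zeros_like(local_shard)
agg.index_add_(0, own_idx, own_grad)              # duplicate ids are summed
local_shard -= lr * agg                           # illustrative SGD step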
...@@ -592,5 +592,44 @@ class NDArrayPartition(object): ...@@ -592,5 +592,44 @@ class NDArrayPartition(object):
) )
) )
def generate_permutation(self, idxs):
"""Produce a scheme that maps the given indices to separate partitions
and the counts of how many indices are in each partition.
Parameters
----------
idxs: torch.Tensor.
A tensor with shape (`num_indices`,), representing global indices.
Returns
-------
torch.Tensor.
A tensor with shape (`num_indices`,), representing the permutation
to re-order the indices by partition.
torch.Tensor.
A tensor with shape (`num_partition`,), representing the number of
indices per partition.
Examples
--------
>>> import torch
>>> from dgl.partition import NDArrayPartition
>>> part = NDArrayPartition(10, 2, mode="remainder")
>>> idx = torch.tensor([0, 2, 4, 5, 8, 8, 9], device="cuda:0")
>>> perm, splits_sum = part.generate_permutation(idx)
>>> perm
tensor([0, 1, 2, 4, 5, 3, 6], device='cuda:0')
>>> splits_sum
tensor([5, 2], device='cuda:0')
"""
ret = _CAPI_DGLNDArrayPartitionGeneratePermutation(
self._partition, F.zerocopy_to_dgl_ndarray(idxs)
)
return F.zerocopy_from_dgl_ndarray(ret(0)), F.zerocopy_from_dgl_ndarray(
ret(1)
)
_init_api("dgl.partition") _init_api("dgl.partition")
/** /**
* Copyright (c) 2018 by Contributors * Copyright (c) 2018 by Contributors
* @file c_runtime_api.cc * @file c_api_common.cc
* @brief DGL C API common implementations * @brief DGL C API common implementations
*/ */
#include "c_api_common.h" #include "c_api_common.h"
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#include <memory> #include <memory>
#include <utility> #include <utility>
#include "../c_api_common.h"
#include "partition_op.h" #include "partition_op.h"
using namespace dgl::runtime; using namespace dgl::runtime;
...@@ -251,5 +252,15 @@ DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionMapToGlobal") ...@@ -251,5 +252,15 @@ DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionMapToGlobal")
*rv = part->MapToGlobal(idxs, part_id); *rv = part->MapToGlobal(idxs, part_id);
}); });
DGL_REGISTER_GLOBAL("partition._CAPI_DGLNDArrayPartitionGeneratePermutation")
.set_body([](DGLArgs args, DGLRetValue* rv) {
NDArrayPartitionRef part = args[0];
IdArray idxs = args[1];
std::pair<IdArray, NDArray> part_perm = part->GeneratePermutation(idxs);
*rv =
ConvertNDArrayVectorToPackedFunc({part_perm.first, part_perm.second});
});
} // namespace partition } // namespace partition
} // namespace dgl } // namespace dgl
/**
* Copyright (c) 2021-2022 by Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* @file nccl_api.h
* @brief Wrapper around NCCL routines.
*/
#ifndef DGL_RUNTIME_CUDA_NCCL_API_H_
#define DGL_RUNTIME_CUDA_NCCL_API_H_
#ifdef DGL_USE_NCCL
#include "nccl.h"
#else
// if not compiling with NCCL, this class will only support communicators of
// size 1.
#define NCCL_UNIQUE_ID_BYTES 128
typedef struct {
char internal[NCCL_UNIQUE_ID_BYTES];
} ncclUniqueId;
typedef int ncclComm_t;
#endif
#include <dgl/runtime/object.h>
#include <string>
namespace dgl {
namespace runtime {
namespace cuda {
class NCCLUniqueId : public runtime::Object {
public:
NCCLUniqueId();
static constexpr const char* _type_key = "cuda.NCCLUniqueId";
DGL_DECLARE_OBJECT_TYPE_INFO(NCCLUniqueId, Object);
ncclUniqueId Get() const;
std::string ToString() const;
void FromString(const std::string& str);
private:
ncclUniqueId id_;
};
DGL_DEFINE_OBJECT_REF(NCCLUniqueIdRef, NCCLUniqueId);
class NCCLCommunicator : public runtime::Object {
public:
NCCLCommunicator(int size, int rank, ncclUniqueId id);
~NCCLCommunicator();
// disable copying
NCCLCommunicator(const NCCLCommunicator& other) = delete;
NCCLCommunicator& operator=(const NCCLCommunicator& other) = delete;
ncclComm_t Get();
/**
* @brief Perform an all-to-all communication.
*
* @param send The contiguous array of data to send.
* @param recv The contiguous array of data to receive.
* @param count The size of data to send to each rank.
* @param stream The stream to operate on.
*/
template <typename IdType>
void AllToAll(
const IdType* send, IdType* recv, int64_t count, cudaStream_t stream);
/**
* @brief Perform an all-to-all variable sized communication.
*
* @tparam DType The type of value to send.
* @param send The arrays of data to send.
* @param send_prefix The prefix of each array to send.
* @param recv The arrays of data to receive.
* @param recv_prefix The prefix of each array to receive.
* @param type The type of data to send.
* @param stream The stream to operate on.
*/
template <typename DType>
void AllToAllV(
const DType* const send, const int64_t* send_prefix, DType* const recv,
const int64_t* recv_prefix, cudaStream_t stream);
/**
* @brief Perform an all-to-all with sparse data (idx and value pairs). By
* necessity, the sizes of each message are variable.
*
* @tparam IdType The type of index.
* @tparam DType The type of value.
* @param send_idx The set of indexes to send on the device.
* @param send_value The set of values to send on the device.
* @param num_feat The number of values per index.
* @param send_prefix The exclusive prefix sum of elements to send on the
* host.
* @param recv_idx The set of indexes to receive on the device.
* @param recv_value The set of values to receive on the device.
* @param recv_prefix The exclusive prefix sum of the number of elements to
* receive on the host.
* @param stream The stream to communicate on.
*/
template <typename IdType, typename DType>
void SparseAllToAll(
const IdType* send_idx, const DType* send_value, const int64_t num_feat,
const int64_t* send_prefix, IdType* recv_idx, DType* recv_value,
const int64_t* recv_prefix, cudaStream_t stream);
int size() const;
int rank() const;
static constexpr const char* _type_key = "cuda.NCCLCommunicator";
DGL_DECLARE_OBJECT_TYPE_INFO(NCCLCommunicator, Object);
private:
ncclComm_t comm_;
int size_;
int rank_;
};
DGL_DEFINE_OBJECT_REF(NCCLCommunicatorRef, NCCLCommunicator);
} // namespace cuda
} // namespace runtime
} // namespace dgl
#endif // DGL_RUNTIME_CUDA_NCCL_API_H_
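Note the interface difference the refactor papers over: the C++ AllToAllV/SparseAllToAll above take exclusive prefix sums of the per-rank message sizes on the host, while torch.distributed.all_to_all_single takes per-rank split lists. The two encodings are equivalent; a small sketch of the conversion (sizes are illustrative):

import torch

splits = torch.tensor([5, 2, 0, 3])                    # elements sent to each rank
prefix = torch.zeros(splits.numel() + 1, dtype=torch.int64)
prefix[1:] = torch.cumsum(splits, dim=0)
# prefix = [0, 5, 7, 7, 10]; prefix[:-1] is the exclusive prefix sum, prefix[-1] the total.
# Rank r's chunk of the send buffer is send[prefix[r] : prefix[r + 1]].
assert torch.equal(prefix[1:] - prefix[:-1], splits)   # splits recovered from prefixes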
@@ -24,11 +24,13 @@ def test_get_node_partition_from_book(idtype):
assert partition.num_parts() == 3
assert partition.array_size() == 11
# Test map_to_local
test_ids = F.copy_to(F.tensor([0, 2, 6, 7, 10], dtype=idtype), F.ctx())
act_ids = partition.map_to_local(test_ids)
exp_ids = F.copy_to(F.tensor([0, 2, 0, 1, 4], dtype=idtype), F.ctx())
assert F.array_equal(act_ids, exp_ids)
# Test map_to_global
test_ids = F.copy_to(F.tensor([0, 2], dtype=idtype), F.ctx())
act_ids = partition.map_to_global(test_ids, 0)
exp_ids = F.copy_to(F.tensor([0, 2], dtype=idtype), F.ctx())
@@ -43,3 +45,11 @@ def test_get_node_partition_from_book(idtype):
act_ids = partition.map_to_global(test_ids, 2)
exp_ids = F.copy_to(F.tensor([6, 7, 10], dtype=idtype), F.ctx())
assert F.array_equal(act_ids, exp_ids)
# Test generate_permutation
test_ids = F.copy_to(F.tensor([6, 0, 7, 2, 10], dtype=idtype), F.ctx())
perm, split_sum = partition.generate_permutation(test_ids)
exp_perm = F.copy_to(F.tensor([1, 3, 0, 2, 4], dtype=idtype), F.ctx())
exp_sum = F.copy_to(F.tensor([2, 0, 3]), F.ctx())
assert F.array_equal(perm, exp_perm)
assert F.array_equal(split_sum, exp_sum)
import unittest import unittest
import backend as F import backend as F
import torch
import torch.distributed as dist
from dgl.cuda import nccl from dgl.cuda import nccl
from dgl.partition import NDArrayPartition from dgl.partition import NDArrayPartition
def gen_test_id():
return "{:0256x}".format(78236728318467363)
@unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU."
)
def test_nccl_id():
nccl_id = nccl.UniqueId()
text = str(nccl_id)
nccl_id2 = nccl.UniqueId(id_str=text)
assert nccl_id == nccl_id2
nccl_id2 = nccl.UniqueId(gen_test_id())
assert nccl_id2 != nccl_id
nccl_id3 = nccl.UniqueId(str(nccl_id2))
assert nccl_id2 == nccl_id3
@unittest.skipIf( @unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU." F._default_context_str == "cpu", reason="NCCL only runs on GPU."
) )
def test_nccl_sparse_push_single_remainder(): def test_nccl_sparse_push_single_remainder():
nccl_id = nccl.UniqueId() torch.cuda.set_device("cuda:0")
comm = nccl.Communicator(1, 0, nccl_id) dist.init_process_group(
backend="nccl",
init_method="tcp://127.0.0.1:12345",
world_size=1,
rank=0,
)
index = F.randint([10000], F.int32, F.ctx(), 0, 10000) index = F.randint([10000], F.int32, F.ctx(), 0, 10000)
value = F.uniform([10000, 100], F.float32, F.ctx(), -1.0, 1.0) value = F.uniform([10000, 100], F.float32, F.ctx(), -1.0, 1.0)
part = NDArrayPartition(10000, 1, "remainder") part = NDArrayPartition(10000, 1, "remainder")
ri, rv = comm.sparse_all_to_all_push(index, value, part) ri, rv = nccl.sparse_all_to_all_push(index, value, part)
assert F.array_equal(ri, index) assert F.array_equal(ri, index)
assert F.array_equal(rv, value) assert F.array_equal(rv, value)
dist.destroy_process_group()
@unittest.skipIf( @unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU." F._default_context_str == "cpu", reason="NCCL only runs on GPU."
) )
def test_nccl_sparse_pull_single_remainder(): def test_nccl_sparse_pull_single_remainder():
nccl_id = nccl.UniqueId() torch.cuda.set_device("cuda:0")
comm = nccl.Communicator(1, 0, nccl_id) dist.init_process_group(
backend="nccl",
init_method="tcp://127.0.0.1:12345",
world_size=1,
rank=0,
)
req_index = F.randint([10000], F.int64, F.ctx(), 0, 100000) req_index = F.randint([10000], F.int64, F.ctx(), 0, 100000)
value = F.uniform([100000, 100], F.float32, F.ctx(), -1.0, 1.0) value = F.uniform([100000, 100], F.float32, F.ctx(), -1.0, 1.0)
part = NDArrayPartition(100000, 1, "remainder") part = NDArrayPartition(100000, 1, "remainder")
rv = comm.sparse_all_to_all_pull(req_index, value, part) rv = nccl.sparse_all_to_all_pull(req_index, value, part)
exp_rv = F.gather_row(value, req_index) exp_rv = F.gather_row(value, req_index)
assert F.array_equal(rv, exp_rv) assert F.array_equal(rv, exp_rv)
dist.destroy_process_group()
@unittest.skipIf( @unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU." F._default_context_str == "cpu", reason="NCCL only runs on GPU."
) )
def test_nccl_sparse_push_single_range(): def test_nccl_sparse_push_single_range():
nccl_id = nccl.UniqueId() torch.cuda.set_device("cuda:0")
comm = nccl.Communicator(1, 0, nccl_id) dist.init_process_group(
backend="nccl",
init_method="tcp://127.0.0.1:12345",
world_size=1,
rank=0,
)
index = F.randint([10000], F.int32, F.ctx(), 0, 10000) index = F.randint([10000], F.int32, F.ctx(), 0, 10000)
value = F.uniform([10000, 100], F.float32, F.ctx(), -1.0, 1.0) value = F.uniform([10000, 100], F.float32, F.ctx(), -1.0, 1.0)
...@@ -78,17 +76,24 @@ def test_nccl_sparse_push_single_range(): ...@@ -78,17 +76,24 @@ def test_nccl_sparse_push_single_range():
) )
part = NDArrayPartition(10000, 1, "range", part_ranges=part_ranges) part = NDArrayPartition(10000, 1, "range", part_ranges=part_ranges)
ri, rv = comm.sparse_all_to_all_push(index, value, part) ri, rv = nccl.sparse_all_to_all_push(index, value, part)
assert F.array_equal(ri, index) assert F.array_equal(ri, index)
assert F.array_equal(rv, value) assert F.array_equal(rv, value)
dist.destroy_process_group()
@unittest.skipIf( @unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU." F._default_context_str == "cpu", reason="NCCL only runs on GPU."
) )
def test_nccl_sparse_pull_single_range(): def test_nccl_sparse_pull_single_range():
nccl_id = nccl.UniqueId() torch.cuda.set_device("cuda:0")
comm = nccl.Communicator(1, 0, nccl_id) dist.init_process_group(
backend="nccl",
init_method="tcp://127.0.0.1:12345",
world_size=1,
rank=0,
)
req_index = F.randint([10000], F.int64, F.ctx(), 0, 100000) req_index = F.randint([10000], F.int64, F.ctx(), 0, 100000)
value = F.uniform([100000, 100], F.float32, F.ctx(), -1.0, 1.0) value = F.uniform([100000, 100], F.float32, F.ctx(), -1.0, 1.0)
...@@ -98,21 +103,15 @@ def test_nccl_sparse_pull_single_range(): ...@@ -98,21 +103,15 @@ def test_nccl_sparse_pull_single_range():
) )
part = NDArrayPartition(100000, 1, "range", part_ranges=part_ranges) part = NDArrayPartition(100000, 1, "range", part_ranges=part_ranges)
rv = comm.sparse_all_to_all_pull(req_index, value, part) rv = nccl.sparse_all_to_all_pull(req_index, value, part)
exp_rv = F.gather_row(value, req_index) exp_rv = F.gather_row(value, req_index)
assert F.array_equal(rv, exp_rv) assert F.array_equal(rv, exp_rv)
dist.destroy_process_group()
@unittest.skipIf(
F._default_context_str == "cpu", reason="NCCL only runs on GPU."
)
def test_nccl_support():
# this is just a smoke test, as we don't have any other way to know
# if NCCL support is compiled in right now.
nccl.is_supported()
if __name__ == "__main__": if __name__ == "__main__":
test_nccl_id() test_nccl_sparse_push_single_remainder()
test_nccl_sparse_push_single() test_nccl_sparse_pull_single_remainder()
test_nccl_sparse_pull_single() test_nccl_sparse_push_single_range()
test_nccl_sparse_pull_single_range()
...@@ -28,7 +28,7 @@ if [[ $arch == *"x86"* ]]; then ...@@ -28,7 +28,7 @@ if [[ $arch == *"x86"* ]]; then
fi fi
if [[ $1 != "cpu" ]]; then if [[ $1 != "cpu" ]]; then
CMAKE_VARS="-DUSE_CUDA=ON -DUSE_NCCL=ON $CMAKE_VARS" CMAKE_VARS="-DUSE_CUDA=ON $CMAKE_VARS"
fi fi
if [ -d build ]; then if [ -d build ]; then
......
Subproject commit e11238b3029795d33f958b5868d47c90c4f22628