Unverified Commit 8e7795e1 authored by vasunvidia's avatar vasunvidia Committed by GitHub
Browse files

Add NVLS-MC based UB kernels (#721)



Fix license, and sign off everything
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
Co-authored-by: Vasudevan Rengasamy <vrengasamy@nvidia.com>
parent 47276e1b
......@@ -19,6 +19,7 @@ endif()
message(STATUS "Found GDRCopy: ${GDRCOPY_LIBRARY}")
target_link_libraries(transformer_engine_userbuffers PUBLIC
CUDA::cudart
CUDA::cuda_driver
MPI::MPI_CXX
${GDRCOPY_LIBRARY})
target_include_directories(transformer_engine_userbuffers PRIVATE
......
/*************************************************************************
* Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See LICENSE for license information.
************************************************************************/
#include "ipcsocket.h"
#include <errno.h>
#include <stdlib.h>
#include <string.h>
// Logging/check macros stubbed from the original NCCL ipcsocket.c; the NCCL
// logging infrastructure is not available in this build.
// WARN/TRACE are pure no-ops: messages and arguments are discarded.
#define WARN(...) \
  do {            \
  } while (0)
#define TRACE(...) \
  do {             \
  } while (0)
// SYSCHECK/EQCHECK must still EVALUATE their first argument: callers rely on
// its side effects (e.g. the fcntl() calls in ncclIpcSocketInit that switch
// the socket to non-blocking mode, and the `flags = fcntl(...)` assignment).
// The previous `{}` stubs swallowed the whole expression, so those fcntl()
// calls never ran and abortFlag-guarded retry loops could block forever in
// recvmsg()/sendmsg().  Only the error *check* is dropped here.
#define SYSCHECK(statement, name) \
  do {                            \
    (void)(statement);            \
  } while (0)
#define EQCHECK(statement, value) \
  do {                            \
    (void)(statement);            \
  } while (0)

// Enable Linux abstract socket naming
#define USE_ABSTRACT_SOCKET

#define NCCL_IPC_SOCKNAME_STR "/tmp/nccl-socket-%d-%lx"
/*
 * Create a Unix domain datagram socket, uniquely named from (rank, hash), and
 * store its state in *handle.
 *
 * When abortFlag is non-NULL the socket is switched to non-blocking mode so
 * the send/recv retry loops can poll the flag instead of blocking forever.
 *
 * Returns ncclSuccess on success, ncclInternalError for a NULL handle or an
 * oversized socket name, ncclSystemError on socket()/bind() failure.
 */
ncclResult_t ncclIpcSocketInit(ncclIpcSocket *handle, int rank, uint64_t hash,
                               volatile uint32_t *abortFlag) {
  int fd = -1;
  struct sockaddr_un cliaddr;
  char temp[NCCL_IPC_SOCKNAME_LEN] = "";

  if (handle == NULL) {
    return ncclInternalError;
  }

  handle->fd = -1;
  handle->socketName[0] = '\0';
  if ((fd = socket(AF_UNIX, SOCK_DGRAM, 0)) < 0) {
    WARN("UDS: Socket creation error : %s (%d)", strerror(errno), errno);
    return ncclSystemError;
  }

  bzero(&cliaddr, sizeof(cliaddr));
  cliaddr.sun_family = AF_UNIX;

  // Create unique name for the socket.
  int len =
      snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash);
  if (len > static_cast<int>(sizeof(cliaddr.sun_path) - 1)) {
    WARN("UDS: Cannot bind provided name to socket. Name too large");
    close(fd);  // bug fix: the socket was leaked on this error path
    return ncclInternalError;
  }

#ifndef USE_ABSTRACT_SOCKET
  // Pathname sockets may collide with a stale entry from a previous run.
  unlink(temp);
#endif

  TRACE(NCCL_INIT, "UDS: Creating socket %s", temp);

  strncpy(cliaddr.sun_path, temp, len);
#ifdef USE_ABSTRACT_SOCKET
  cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
#endif
  if (bind(fd, (struct sockaddr *)&cliaddr, sizeof(cliaddr)) < 0) {
    WARN("UDS: Binding to socket %s failed : %s (%d)", temp, strerror(errno),
         errno);
    close(fd);
    return ncclSystemError;
  }

  handle->fd = fd;
  strcpy(handle->socketName, temp); // NOLINT(*)

  handle->abortFlag = abortFlag;
  // Mark socket as non-blocking so *abortFlag can interrupt the retry loops
  // in ncclIpcSocketRecvMsg/ncclIpcSocketSendMsg.
  if (handle->abortFlag) {
    int flags;
    EQCHECK(flags = fcntl(fd, F_GETFL), -1);
    SYSCHECK(fcntl(fd, F_SETFL, flags | O_NONBLOCK), "fcntl");
  }

  return ncclSuccess;
}
// Fetch the OS file descriptor backing an initialized IPC socket.
// Returns ncclInvalidArgument when handle is NULL; otherwise writes the fd
// through `fd` (when `fd` is non-NULL) and returns ncclSuccess.
ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket *handle, int *fd) {
  if (!handle) {
    WARN("ncclSocketGetFd: pass NULL socket");
    return ncclInvalidArgument;
  }
  if (fd != NULL) {
    *fd = handle->fd;
  }
  return ncclSuccess;
}
// Tear down an IPC socket created by ncclIpcSocketInit.
// A NULL handle is an internal error; a handle whose fd was never opened
// (fd <= 0) is treated as already closed and reports success.
ncclResult_t ncclIpcSocketClose(ncclIpcSocket *handle) {
  if (!handle) {
    return ncclInternalError;
  }
  if (handle->fd <= 0) {
    return ncclSuccess;
  }
#ifndef USE_ABSTRACT_SOCKET
  // Pathname sockets leave an entry on the filesystem; remove it.  Abstract
  // sockets (the default here) vanish automatically when the fd is closed.
  if (handle->socketName[0] != '\0') {
    unlink(handle->socketName);
  }
#endif
  close(handle->fd);
  return ncclSuccess;
}
/*
 * Receive one datagram on the IPC socket.
 *
 * hdr/hdrLen: optional payload buffer; when hdr is NULL a 1-byte dummy buffer
 *   is substituted so that an fd-only message can still be received.
 * recvFd: when non-NULL, an SCM_RIGHTS control message is required and the
 *   received file descriptor is stored here; its absence is an error.
 *
 * Retries on EAGAIN/EWOULDBLOCK/EINTR; between retries, returns
 * ncclInternalError if *abortFlag becomes non-zero (the socket is put in
 * non-blocking mode at init time when abortFlag was supplied).
 */
ncclResult_t ncclIpcSocketRecvMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,
                                  int *recvFd) {
  struct msghdr msg = {0, 0, 0, 0, 0, 0, 0};
  struct iovec iov[1];

  // Union to guarantee alignment requirements for control array
  union {
    struct cmsghdr cm;
    char control[CMSG_SPACE(sizeof(int))];
  } control_un;

  struct cmsghdr *cmptr;
  char dummy_buffer[1];
  int ret;

  // Control buffer receives the (optional) SCM_RIGHTS fd-passing message.
  msg.msg_control = control_un.control;
  msg.msg_controllen = sizeof(control_un.control);

  if (hdr == NULL) {
    iov[0].iov_base = reinterpret_cast<void *>(dummy_buffer);
    iov[0].iov_len = sizeof(dummy_buffer);
  } else {
    iov[0].iov_base = hdr;
    iov[0].iov_len = hdrLen;
  }

  msg.msg_iov = iov;
  msg.msg_iovlen = 1;

  // NOTE(review): recvmsg() returning 0 re-enters the loop without setting
  // errno, so a zero-length datagram would consult stale errno here —
  // confirm peers never send empty messages.
  while ((ret = recvmsg(handle->fd, &msg, 0)) <= 0) {
    if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
      WARN("UDS: Receiving data over socket failed : %d", errno);
      return ncclSystemError;
    }
    if (handle->abortFlag && *handle->abortFlag)
      return ncclInternalError;
  }

  if (recvFd != NULL) {
    // Extract the passed file descriptor from the SCM_RIGHTS control message.
    if (((cmptr = CMSG_FIRSTHDR(&msg)) != NULL) &&
        (cmptr->cmsg_len == CMSG_LEN(sizeof(int)))) {
      if ((cmptr->cmsg_level != SOL_SOCKET) ||
          (cmptr->cmsg_type != SCM_RIGHTS)) {
        WARN("UDS: Receiving data over socket failed");
        return ncclSystemError;
      }

      // memmove avoids alignment assumptions on CMSG_DATA.
      memmove(recvFd, CMSG_DATA(cmptr), sizeof(*recvFd));
    } else {
      WARN("UDS: Receiving data over socket %s failed", handle->socketName);
      return ncclSystemError;
    }

    TRACE(NCCL_INIT | NCCL_P2P, "UDS: Got recvFd %d from socket %s", *recvFd,
          handle->socketName);
  }

  return ncclSuccess;
}
// Convenience wrapper around ncclIpcSocketRecvMsg: receive only a file
// descriptor, with no header payload.
ncclResult_t ncclIpcSocketRecvFd(ncclIpcSocket *handle, int *recvFd) {
  return ncclIpcSocketRecvMsg(handle, /*hdr=*/NULL, /*hdrLen=*/0, recvFd);
}
/*
 * Send one datagram to the peer IPC socket named by (rank, hash).
 *
 * hdr/hdrLen: optional payload; when hdr is NULL a 1-byte dummy buffer is
 *   sent so the datagram is never empty.
 * sendFd: when not -1, the descriptor is attached as an SCM_RIGHTS control
 *   message so the receiving process gets its own copy of the descriptor.
 *
 * Retries on EAGAIN/EWOULDBLOCK/EINTR; between retries, returns
 * ncclInternalError if *abortFlag becomes non-zero.
 */
ncclResult_t ncclIpcSocketSendMsg(ncclIpcSocket *handle, void *hdr, int hdrLen,
                                  const int sendFd, int rank, uint64_t hash) {
  struct msghdr msg = {0, 0, 0, 0, 0, 0, 0};
  struct iovec iov[1];
  char temp[NCCL_IPC_SOCKNAME_LEN];

  // Union to guarantee alignment requirements for control array
  union {
    struct cmsghdr cm;
    char control[CMSG_SPACE(sizeof(int))];
  } control_un;

  struct cmsghdr *cmptr;
  char dummy_buffer[1];
  struct sockaddr_un cliaddr;

  // Construct client address to send this shareable handle to
  bzero(&cliaddr, sizeof(cliaddr));
  cliaddr.sun_family = AF_UNIX;

  // Destination name uses the same (rank, hash) scheme as ncclIpcSocketInit.
  int len =
      snprintf(temp, NCCL_IPC_SOCKNAME_LEN, NCCL_IPC_SOCKNAME_STR, rank, hash);
  if (len > (sizeof(cliaddr.sun_path) - 1)) {
    WARN("UDS: Cannot connect to provided name for socket. Name too large");
    return ncclInternalError;
  }
  (void)strncpy(cliaddr.sun_path, temp, len);

#ifdef USE_ABSTRACT_SOCKET
  cliaddr.sun_path[0] = '\0'; // Linux abstract socket trick
#endif

  TRACE(NCCL_INIT, "UDS: Sending hdr %p len %d to UDS socket %s", hdr, hdrLen,
        temp);

  if (sendFd != -1) {
    // Attach the descriptor as an SCM_RIGHTS ancillary message.
    TRACE(NCCL_INIT, "UDS: Sending fd %d to UDS socket %s", sendFd, temp);

    msg.msg_control = control_un.control;
    msg.msg_controllen = sizeof(control_un.control);

    cmptr = CMSG_FIRSTHDR(&msg);
    cmptr->cmsg_len = CMSG_LEN(sizeof(int));
    cmptr->cmsg_level = SOL_SOCKET;
    cmptr->cmsg_type = SCM_RIGHTS;
    // memmove avoids alignment assumptions on CMSG_DATA.
    memmove(CMSG_DATA(cmptr), &sendFd, sizeof(sendFd));
  }

  msg.msg_name = reinterpret_cast<void *>(&cliaddr);
  msg.msg_namelen = sizeof(struct sockaddr_un);

  if (hdr == NULL) {
    iov[0].iov_base = reinterpret_cast<void *>(dummy_buffer);
    iov[0].iov_len = sizeof(dummy_buffer);
  } else {
    iov[0].iov_base = hdr;
    iov[0].iov_len = hdrLen;
  }
  msg.msg_iov = iov;
  msg.msg_iovlen = 1;
  msg.msg_flags = 0;

  ssize_t sendResult;
  // Retry loop mirrors ncclIpcSocketRecvMsg; aborts via *abortFlag.
  while ((sendResult = sendmsg(handle->fd, &msg, 0)) < 0) {
    if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) {
      WARN("UDS: Sending data over socket %s failed : %s (%d)", temp,
           strerror(errno), errno);
      return ncclSystemError;
    }
    if (handle->abortFlag && *handle->abortFlag)
      return ncclInternalError;
  }

  return ncclSuccess;
}
// Convenience wrapper around ncclIpcSocketSendMsg: send only a file
// descriptor (no header payload) to the socket named by (rank, hash).
ncclResult_t ncclIpcSocketSendFd(ncclIpcSocket *handle, const int sendFd,
                                 int rank, uint64_t hash) {
  return ncclIpcSocketSendMsg(handle, /*hdr=*/NULL, /*hdrLen=*/0, sendFd,
                              rank, hash);
}
/*************************************************************************
* Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See LICENSE for license information.
************************************************************************/
#ifndef NCCL_IPCSOCKET_H
#define NCCL_IPCSOCKET_H

// #include "nccl.h"

#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <memory.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/un.h>
#include <sys/wait.h>
#include <unistd.h>

// Result codes, declared locally so this file stands alone without nccl.h.
// NOTE(review): presumably kept in sync with upstream NCCL's ncclResult_t —
// verify the numeric values when updating.
typedef enum {
  ncclSuccess = 0,
  ncclUnhandledCudaError = 1,
  ncclSystemError = 2,
  ncclInternalError = 3,
  ncclInvalidArgument = 4,
  ncclInvalidUsage = 5,
  ncclRemoteError = 6,
  ncclInProgress = 7,
  ncclNumResults = 8
} ncclResult_t;

// Maximum length (including the terminating NUL) of a generated socket name.
#define NCCL_IPC_SOCKNAME_LEN 64

// State for one Unix-domain datagram socket used to pass file descriptors
// between processes on the same node.
struct ncclIpcSocket {
  int fd;                                  // socket fd; <= 0 when not open
  char socketName[NCCL_IPC_SOCKNAME_LEN];  // bound socket name; "" when unset
  volatile uint32_t *abortFlag;            // optional; polled by retry loops
};

// Create the socket named from (rank, hash); non-blocking when abortFlag set.
ncclResult_t ncclIpcSocketInit(struct ncclIpcSocket *handle, int rank,
                               uint64_t hash, volatile uint32_t *abortFlag);
// Close the socket and, for pathname sockets, unlink its filesystem entry.
ncclResult_t ncclIpcSocketClose(struct ncclIpcSocket *handle);
// Return the underlying file descriptor via *fd.
ncclResult_t ncclIpcSocketGetFd(struct ncclIpcSocket *handle, int *fd);
// Receive a file descriptor sent by a peer via SCM_RIGHTS.
ncclResult_t ncclIpcSocketRecvFd(struct ncclIpcSocket *handle, int *fd);
// Send a file descriptor to the peer socket named by (rank, hash).
ncclResult_t ncclIpcSocketSendFd(struct ncclIpcSocket *handle, const int fd,
                                 int rank, uint64_t hash);

#endif /* NCCL_IPCSOCKET_H */
......@@ -4,6 +4,8 @@
* See LICENSE for license information.
************************************************************************/
#include "ipcsocket.cc"
#include "ipcsocket.h"
#include "userbuffers.h"
#include <assert.h>
#include <chrono>
......@@ -18,6 +20,7 @@
#include <string.h>
#include <unistd.h>
#include <x86intrin.h>
#define MULTICAST_GB_TOTAL 512
static int oob_bcast(void *comm_context, void *buf, int size, int root) {
MPI_Bcast(buf, size, MPI_BYTE, root,
......@@ -63,6 +66,22 @@ int stringCmp(const void *a, const void *b) { return strcmp((const char *)a, (co
throw std::runtime_error(std::string(__FILE__ ":") + std::to_string(__LINE__) + \
" in function " + __func__ + ": " + x); \
} while (false)
// Abort the process when an IPC-socket call fails.  The original message
// printed a literal empty '' because ncclGetErrorString() is unavailable in
// this build; print the numeric ncclResult_t instead so failures are
// diagnosable from logs.
#define NCCLCHECK(cmd)                                                        \
  do {                                                                        \
    ncclResult_t r = cmd;                                                     \
    if (r != ncclSuccess) {                                                   \
      printf("Failed, NCCL error %s:%d '%d'\n", __FILE__, __LINE__, (int)r);  \
      exit(EXIT_FAILURE);                                                     \
    }                                                                         \
  } while (0)
// Evaluate `call`; on failure (anything other than success or in-progress)
// record the result in RES and jump to `label` for cleanup.
// Fix: the macro previously ended in `while (0);` — the stray semicolon makes
// `if (x) NCCLCHECKGOTO(...); else ...` a syntax error and defeats the whole
// point of the do/while(0) idiom.
#define NCCLCHECKGOTO(call, RES, label)                                       \
  do {                                                                        \
    RES = call;                                                               \
    if (RES != ncclSuccess && RES != ncclInProgress) {                        \
      goto label;                                                             \
    }                                                                         \
  } while (0)
int pipe_rank(communicator *comm, int step) {
int mynode = comm->myrank / comm->nvsize;
......@@ -267,6 +286,73 @@ int create_communicator_grouped2(communicator **comm, int pipegpus, int pipenode
(*comm)->ibnvsize = (*comm)->nvsize;
#define NBUF 2
if ((*comm)->sm_arch >= 9 && (*comm)->ar2_nvsize > 1 &&
!getenv("UB_SKIPMC")) { // multicast init only for TP ops (____2 operations)
size_t mc_maxsize = MULTICAST_GB_TOTAL * (1ull << 30);
(*comm)->mc_offset = 0;
(*comm)->use_mc = 1;
size_t gran;
CUmulticastObjectProp mcProp = {};
mcProp.numDevices = (*comm)->ar2_nvsize;
mcProp.size = (*comm)->mc_maxsize;
mcProp.handleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
CUCHECK(cuMulticastGetGranularity(&gran, &mcProp, CU_MULTICAST_GRANULARITY_RECOMMENDED));
mc_maxsize = ((mc_maxsize + gran - 1) / gran) * gran;
mcProp.size = mc_maxsize;
(*comm)->mc_maxsize = mc_maxsize;
int fd;
volatile uint32_t abortFlag = 0;
struct ncclIpcSocket ipcSock = {0};
uint64_t opId = 0xdeadcafeb000 + (*comm)->ar2_firstgpu;
ncclResult_t ret = ncclSuccess;
NCCLCHECK(ncclIpcSocketInit(&ipcSock, (*comm)->ar2_nvrank, (uint64_t)opId, &abortFlag));
MPI_Barrier(MPI_COMM_WORLD);
if ((*comm)->ar2_nvrank == 0) {
CUCHECK(cuMulticastCreate(&(*comm)->mc_handle, &mcProp));
CUCHECK(cuMemExportToShareableHandle(&fd, (*comm)->mc_handle,
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0 /*flags*/));
for (int p = 1; p < (*comm)->ar2_nvsize; p++) {
MPI_Barrier((*comm)->comm_intra);
NCCLCHECKGOTO(ncclIpcSocketSendFd(&ipcSock, fd, p, (uint64_t)opId), ret, error);
}
} else {
for (int i = 0; i < (*comm)->ar2_nvrank; i++)
MPI_Barrier((*comm)->comm_intra);
NCCLCHECKGOTO(ncclIpcSocketRecvFd(&ipcSock, &fd), ret, error);
for (int i = 0; i < (*comm)->ar2_nvsize - (*comm)->ar2_nvrank - 1; i++)
MPI_Barrier((*comm)->comm_intra);
CUCHECK(cuMemImportFromShareableHandle(&(*comm)->mc_handle, reinterpret_cast<void *>(fd),
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
}
error:
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
close(fd);
CUCHECK(cuMulticastAddDevice((*comm)->mc_handle, (*comm)->mydev));
CUdeviceptr mc_va;
CUCHECK(cuMemAddressReserve(&mc_va, mc_maxsize, 0, 0U, 0));
CUCHECK(cuMemMap(mc_va, mc_maxsize, 0, (*comm)->mc_handle, 0));
CUmemAccessDesc accessDesc = {};
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.location.id = (*comm)->mydev;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
CUCHECK(cuMemSetAccess(mc_va, mc_maxsize, &accessDesc, 1));
(*comm)->mc_baseptr = reinterpret_cast<void *>(mc_va);
MPI_Barrier(MPI_COMM_WORLD);
if (!(*comm)->myrank)
printf("MC initialized succesfully, window size = %ld\n", mc_maxsize);
} else {
if (!(*comm)->myrank)
printf("MC NOT initialized and used\n");
(*comm)->mc_maxsize = 0;
(*comm)->mc_offset = 0;
(*comm)->use_mc = 0;
}
#define LOCALSIZE 4 * (NVTE_REG0_OFFSET(*comm) + NVTE_REG0_FLAGS + NVTE_REG0_COMMBUFFER * NBUF)
// peer pointers + op flags + comm buffer
......@@ -351,28 +437,134 @@ int register_user_buffer_collective(void **gpubuff, size_t bytes, communicator *
return -1;
int hndl = comm->free_region;
comm->peer_ptr[hndl] = reinterpret_cast<void **>(malloc(sizeof(void *) * (comm->nvsize)));
size_t aligned_size = bytes;
comm->memflags[hndl] = 0;
if (alloc) {
CUDACHECK(cudaMalloc(gpubuff, bytes));
int nranks = comm->nvsize; // total GPUs in NVLINK domain
int myrank = comm->nvrank;
void **remptrs = reinterpret_cast<void **>(malloc(nranks * sizeof(void *)));
CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.location.id = comm->mydev;
prop.requestedHandleTypes =
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; // CU_MEM_HANDLE_TYPE_FABRIC;
size_t granularity = 0;
CUCHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
// MPI_Allreduce MAX of granularity check
aligned_size = (bytes + granularity - 1) / granularity * granularity;
if (comm->use_mc) {
CUmulticastObjectProp mcProp = {};
mcProp.numDevices = nranks;
mcProp.size = aligned_size;
mcProp.handleTypes = prop.requestedHandleTypes;
CUCHECK(cuMulticastGetGranularity(&granularity, &mcProp, CU_MULTICAST_GRANULARITY_MINIMUM));
aligned_size = (aligned_size + granularity - 1) / granularity * granularity;
}
prop.location.id = comm->mydev;
comm->uchandles[hndl] = reinterpret_cast<CUmemGenericAllocationHandle *>(
malloc(nranks * sizeof(CUmemGenericAllocationHandle)));
CUCHECK(cuMemCreate(&(comm->uchandles[hndl][myrank]), aligned_size, &prop, 0));
int *peerfd = reinterpret_cast<int *>(malloc(nranks * sizeof(int)));
CUCHECK(cuMemExportToShareableHandle(&peerfd[myrank], comm->uchandles[hndl][myrank],
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0 /*flags*/));
volatile uint32_t abortFlag = 0;
struct ncclIpcSocket ipcSock = {0};
uint64_t opId = 0xdeadcafebeef;
ncclResult_t ret = ncclSuccess;
NCCLCHECK(ncclIpcSocketInit(&ipcSock, myrank, (uint64_t)opId, &abortFlag));
for (int p = 1; p < nranks; p++) {
MPI_Barrier(comm->comm_intra);
NCCLCHECKGOTO(
ncclIpcSocketSendFd(&ipcSock, peerfd[myrank], (myrank + p) % nranks, (uint64_t)opId), ret,
error);
NCCLCHECKGOTO(ncclIpcSocketRecvFd(&ipcSock, &peerfd[(myrank + nranks - p) % nranks]), ret,
error);
}
error:
NCCLCHECK(ncclIpcSocketClose(&ipcSock));
for (int p = 0; p < nranks; p++) {
if (p != myrank)
CUCHECK(cuMemImportFromShareableHandle(&comm->uchandles[hndl][p],
reinterpret_cast<void *>(peerfd[p]),
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
close(peerfd[p]);
}
CUdeviceptr ptr;
CUCHECK(cuMemAddressReserve(&ptr, aligned_size * nranks, 0, 0, 0));
comm->ucbase_ptr[hndl] = reinterpret_cast<void *>(ptr);
CUmemAccessDesc accessDesc = {};
accessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
accessDesc.location.id = comm->mydev;
for (int i = 0; i < nranks; i++) {
CUCHECK(cuMemMap(ptr + (aligned_size * i), aligned_size, 0, comm->uchandles[hndl][i], 0));
remptrs[i] = reinterpret_cast<void *>(ptr + (aligned_size * i));
if (i == comm->nvrank) {
if (hndl)
*gpubuff = remptrs[i];
else
comm->gpu_ptrs = remptrs[i];
}
comm->peer_ptr[hndl][i] = remptrs[i];
}
CUCHECK(cuMemSetAccess(ptr, aligned_size * nranks, &accessDesc, 1));
if (hndl == 0)
CUDACHECK(cudaMemset(comm->gpu_ptrs, 0, aligned_size));
CUDACHECK(
cudaMemcpy((reinterpret_cast<char *>(comm->gpu_ptrs)) + (hndl * nranks * sizeof(void *)),
remptrs, nranks * sizeof(void *), cudaMemcpyHostToDevice));
free(remptrs);
free(peerfd);
comm->memflags[hndl] = UB_MEM_UC_CONTIG | UB_MEM_ALLOCATED;
if (comm->use_mc && comm->mc_maxsize >= comm->mc_offset + aligned_size) {
CUCHECK(cuMulticastBindMem(comm->mc_handle, comm->mc_offset, comm->uchandles[hndl][myrank],
0 /*memOffset*/, aligned_size, 0));
comm->memflags[hndl] |= UB_MEM_MC_CREATED;
comm->mc_ptr[hndl] = comm->mc_baseptr + comm->mc_offset;
comm->mc_offset += aligned_size;
} else if (!comm->myrank) {
printf("UB: warning region %d size %ld MB registered without MC access\n", hndl,
aligned_size / 1024 / 1024);
}
} else {
assert(comm->nvsize <= 8);
cudaIpcMemHandle_t *memhndl =
reinterpret_cast<cudaIpcMemHandle_t *>(malloc(sizeof(cudaIpcMemHandle_t) * (comm->nvsize)));
CUDACHECK(cudaIpcGetMemHandle(&memhndl[comm->nvrank], *gpubuff));
MPI_Allgather(&memhndl[comm->nvrank], sizeof(cudaIpcMemHandle_t), MPI_BYTE, memhndl,
sizeof(cudaIpcMemHandle_t), MPI_BYTE, comm->comm_intra);
for (int i = 0; i < comm->nvsize; i++)
if (i != comm->nvrank)
CUDACHECK(cudaIpcOpenMemHandle((void **)&(comm->peer_ptr[hndl][i]), // NOLINT(*)
memhndl[i], cudaIpcMemLazyEnablePeerAccess));
comm->peer_ptr[hndl][comm->nvrank] = *gpubuff;
CUDACHECK(cudaDeviceSynchronize());
CUDACHECK(cudaMemcpy(
reinterpret_cast<char *>(comm->gpu_ptrs) + (hndl * comm->nvsize * sizeof(void *)),
comm->peer_ptr[hndl], comm->nvsize * sizeof(void *), cudaMemcpyHostToDevice));
CUDACHECK(cudaDeviceSynchronize());
free(memhndl);
}
assert(comm->nvsize <= 8);
cudaIpcMemHandle_t *memhndl =
reinterpret_cast<cudaIpcMemHandle_t *>(malloc(sizeof(cudaIpcMemHandle_t) * (comm->nvsize)));
CUDACHECK(cudaIpcGetMemHandle(&memhndl[comm->nvrank], *gpubuff));
MPI_Allgather(&memhndl[comm->nvrank], sizeof(cudaIpcMemHandle_t), MPI_BYTE, memhndl,
sizeof(cudaIpcMemHandle_t), MPI_BYTE, comm->comm_intra);
for (int i = 0; i < comm->nvsize; i++)
if (i != comm->nvrank)
CUDACHECK(cudaIpcOpenMemHandle((void **)&(comm->peer_ptr[hndl][i]), // NOLINT(*)
memhndl[i], cudaIpcMemLazyEnablePeerAccess));
comm->peer_ptr[hndl][comm->nvrank] = *gpubuff;
CUDACHECK(cudaDeviceSynchronize());
CUDACHECK(
cudaMemcpy(reinterpret_cast<char *>(comm->gpu_ptrs) + (hndl * comm->nvsize * sizeof(void *)),
comm->peer_ptr[hndl], comm->nvsize * sizeof(void *), cudaMemcpyHostToDevice));
CUDACHECK(cudaDeviceSynchronize());
free(memhndl);
comm->mem_size[hndl] = aligned_size;
comm->mem_ptr[hndl] = *gpubuff;
......
......@@ -398,7 +398,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
*reduceidptr = reduce_id;
} // fp16 reduce-scatter kernel (out of place)
#if 0
#if __CUDA_ARCH__ >= 900
// All MC kernels here
template <int RANKS>
__global__ void __launch_bounds__(MAX_THREADS)
......@@ -2641,7 +2641,11 @@ int reducescatter2_userbuff_inplace_gpu(const int maxcredit, const int handler,
callranks2_block_rs(1) callranks2_block_rs(2) callranks2_block_rs(4) callranks2_block_rs(8)
} else {
SETUP_LAUNCH_CONFIG(sms, warps * 32, stream);
if (comm->use_mc && (comm->memflags[handler] & UB_MEM_MC_CREATED)) {
callranks_rsMC(2) callranks_rsMC(4) callranks_rsMC(8)
} else {
callranks_rs(2) callranks_rs(4) callranks_rs(8)
}
}
return sms;
}
......@@ -2860,7 +2864,11 @@ void allgather2_userbuff_inplace(const int handler, const int offset, const int
warps = ar_nvsize;
SETUP_LAUNCH_CONFIG(sms, warps * 32, stream);
if (comm->use_mc && (comm->memflags[handler] & UB_MEM_MC_CREATED)) {
callranks_agMC(2) callranks_agMC(4) callranks_agMC(8)
} else {
callranks_ag(2) callranks_ag(4) callranks_ag(8)
}
}
void allgather2_userbuff_inplace_sliced(const int handler, const int offset, const int elements,
......@@ -2896,7 +2904,11 @@ void reducescatter2_userbuff_inplace(const int handler, const int offset, const
warps = ar_nvsize;
SETUP_LAUNCH_CONFIG(sms, warps * 32, stream);
if (comm->use_mc && (comm->memflags[handler] & UB_MEM_MC_CREATED)) {
callranks_rsMC(2) callranks_rsMC(4) callranks_rsMC(8)
} else {
callranks_rs(2) callranks_rs(4) callranks_rs(8)
}
}
void reducescatter2_userbuff_stridedoutput(void *output, const int handler, const int offset,
const int rowelements, const int colelements,
......@@ -2919,7 +2931,11 @@ void reducescatter2_userbuff_stridedoutput(void *output, const int handler, cons
warps = ar_nvsize;
SETUP_LAUNCH_CONFIG(sms, warps * 32, stream);
if (comm->use_mc && (comm->memflags[handler] & UB_MEM_MC_CREATED)) {
callranks_rs_oopMC(2) callranks_rs_oopMC(4) callranks_rs_oopMC(8)
} else {
callranks_rs_oop(2) callranks_rs_oop(4) callranks_rs_oop(8)
}
}
void reducescatter2_userbuff(void *output, const int handler, const int offset, const int elements,
communicator *comm, cudaStream_t stream) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment