Commit 7dc4e964 authored by wanghan

Initial commit: RCCL auto-tuning project
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_BOOTSTRAP_H_
#define NCCL_BOOTSTRAP_H_
#include "nccl.h"
#include "comm.h"
struct ncclBootstrapHandle {
uint64_t magic;
union ncclSocketAddress addr;
};
static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Bootstrap handle is too large to fit inside NCCL unique ID");
ncclResult_t bootstrapNetInit();
ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv);
ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle);
ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm);
ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks);
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag);
ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size);
ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size);
ncclResult_t bootstrapClose(void* commState);
ncclResult_t bootstrapAbort(void* commState);
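/* Illustrative call sequence (a sketch, not part of the original header; variable
 * names are placeholders): the root rank creates the bootstrap handle, every rank
 * then initializes its bootstrap state and can exchange small out-of-band messages.
 *
 *   struct ncclBootstrapHandle handle;
 *   NCCLCHECK(bootstrapNetInit());
 *   NCCLCHECK(bootstrapGetUniqueId(&handle));          // root only; distributed as ncclUniqueId
 *   NCCLCHECK(bootstrapInit(&handle, comm));           // every rank, with its ncclComm
 *   NCCLCHECK(bootstrapAllGather(comm->bootstrap, allData, size));
 *   NCCLCHECK(bootstrapClose(comm->bootstrap));
 */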
#endif
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_CHANNEL_H_
#define NCCL_CHANNEL_H_
#include "comm.h"
ncclResult_t initChannel(struct ncclComm* comm, int channelid);
ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks);
static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) {
int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
int peerNode = comm->rankToNode[peer];
int peerIndex = comm->rankToLocalRank[peer];
int nsteps = comm->maxLocalRanks;
int rankIndex = comm->rankToLocalRank[comm->rank];
int step, delta;
if (coll == ncclFuncSend) {
step = (nsteps + peerIndex - rankIndex)%nsteps;
delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes;
} else if (coll == ncclFuncRecv) {
step = (nsteps + rankIndex - peerIndex)%nsteps;
delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes;
} else {
return ncclInternalError;
}
*channelBase = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step;
return ncclSuccess;
}
static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) {
//*channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels;
*channelId = (comm->p2pChannels[base%comm->p2pnChannels]+channelInc) % comm->p2pnChannels;
return ncclSuccess;
}
static ncclResult_t ncclChannelCompute(struct ncclComm* comm, int peer, int channelInc, int coll, int*channelId) {
int base;
NCCLCHECK(ncclChannelComputeBase(comm, peer, coll, &base));
NCCLCHECK(ncclChannelComputeFromBase(comm, base, channelInc, channelId));
return ncclSuccess;
}
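/* Worked example (illustrative only): single-node comm with nNodes == 1,
 * maxLocalRanks == 8, rank 2 sending to peer 5, channelInc == 0.
 *   ncclChannelComputeBase: step = (8 + 5 - 2) % 8 = 3; nNodes == 1, so base = step = 3.
 *   ncclChannelComputeFromBase: *channelId = (p2pChannels[3 % p2pnChannels] + 0) % p2pnChannels.
 * The matching receive on rank 5 (peer 2) computes step = (8 + 5 - 2) % 8 = 3 as well,
 * so sender and receiver agree on the same channel for this pair.
 */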
#endif
/*************************************************************************
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_CHECKS_H_
#define NCCL_CHECKS_H_
#include "debug.h"
// Check CUDA RT calls
#define CUDACHECK(cmd) do { \
cudaError_t err = cmd; \
if( err != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(err)); \
return ncclUnhandledCudaError; \
} \
} while(false)
#define CUDACHECKGOTO(cmd, RES, label) do { \
cudaError_t err = cmd; \
if( err != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(err)); \
RES = ncclUnhandledCudaError; \
goto label; \
} \
} while(false)
// Report failure but clear error and continue
#define CUDACHECKIGNORE(cmd) do { \
cudaError_t err = cmd; \
if( err != cudaSuccess ) { \
INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, cudaGetErrorString(err)); \
(void) cudaGetLastError(); \
} \
} while(false)
#include <errno.h>
// Check system calls
#define SYSCHECK(call, name) do { \
int retval; \
SYSCHECKVAL(call, name, retval); \
} while (false)
#define SYSCHECKVAL(call, name, retval) do { \
SYSCHECKSYNC(call, name, retval); \
if (retval == -1) { \
WARN("Call to " name " failed : %s", strerror(errno)); \
return ncclSystemError; \
} \
} while (false)
#define SYSCHECKSYNC(call, name, retval) do { \
retval = call; \
if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
} else { \
break; \
} \
} while(true)
#define SYSCHECKGOTO(statement, RES, label) do { \
if ((statement) == -1) { \
/* Print the back trace*/ \
RES = ncclSystemError; \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while (0);
#define NEQCHECK(statement, value) do { \
if ((statement) != value) { \
/* Print the back trace*/ \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \
return ncclSystemError; \
} \
} while (0);
#define NEQCHECKGOTO(statement, value, RES, label) do { \
if ((statement) != value) { \
/* Print the back trace*/ \
RES = ncclSystemError; \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while (0);
#define EQCHECK(statement, value) do { \
if ((statement) == value) { \
/* Print the back trace*/ \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \
return ncclSystemError; \
} \
} while (0);
#define EQCHECKGOTO(statement, value, RES, label) do { \
if ((statement) == value) { \
/* Print the back trace*/ \
RES = ncclSystemError; \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while (0);
// Propagate errors up
#define NCCLCHECK(call) do { \
ncclResult_t RES = call; \
if (RES != ncclSuccess && RES != ncclInProgress) { \
/* Print the back trace*/ \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
return RES; \
} \
} while (0);
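/* Illustrative usage (a sketch; someInitFunction is a hypothetical caller):
 * NCCLCHECK propagates any failure directly to the caller, while NCCLCHECKGOTO
 * routes it to a cleanup label instead.
 *
 *   ncclResult_t someInitFunction(struct ncclComm* comm) {
 *     ncclResult_t ret = ncclSuccess;
 *     NCCLCHECK(bootstrapNetInit());                  // returns early on error
 *     NCCLCHECKGOTO(ncclCommEnsureReady(comm), ret, fail);
 *     return ncclSuccess;
 *   fail:
 *     return ret;                                     // cleanup would go here
 *   }
 */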
#define NCCLCHECKGOTO(call, RES, label) do { \
RES = call; \
if (RES != ncclSuccess && RES != ncclInProgress) { \
/* Print the back trace*/ \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
goto label; \
} \
} while (0);
#define NCCLWAIT(call, cond, abortFlagPtr) do { \
volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
ncclResult_t RES = call; \
if (RES != ncclSuccess && RES != ncclInProgress) { \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
return ncclInternalError; \
} \
if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \
} while (!(cond));
#define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \
volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
RES = call; \
if (RES != ncclSuccess && RES != ncclInProgress) { \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
goto label; \
} \
if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \
} while (!(cond));
#define NCCLCHECKTHREAD(a, args) do { \
if (((args)->ret = (a)) != ncclSuccess && (args)->ret != ncclInProgress) { \
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \
return args; \
} \
} while(0)
#define CUDACHECKTHREAD(a) do { \
if ((a) != cudaSuccess) { \
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
args->ret = ncclUnhandledCudaError; \
return args; \
} \
} while(0)
#endif
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef COLL_NET_H_
#define COLL_NET_H_
#include "nccl.h"
#include "nccl_net.h"
typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
// Translation to external API
static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; }
static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; }
static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; }
static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; }
static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; }
static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; }
/* DMA-BUF support */
static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, int size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
static ncclResult_t collNetDeregMr(struct ncclComm* comm, void* collComm, void* mhandle) { NCCLCHECK(comm->ncclCollNet->deregMr(collComm, mhandle)); return ncclSuccess; }
static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
NCCLCHECK(comm->ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(comm->ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; }
static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; }
static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; }
static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
static int collNetSupport(struct ncclComm* comm) { return comm->ncclCollNet != nullptr ? 1 : 0; }
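/* Illustrative call sequence (a sketch based only on the wrappers above; all
 * variables shown are placeholders, and handle exchange is done out of band):
 *   if (collNetSupport(comm)) {
 *     void *listenComm, *collComm, *mhandle, *request; int done, size;
 *     collNetHandle_t handle;
 *     NCCLCHECK(collNetListen(comm, dev, handle, &listenComm));
 *     NCCLCHECK(collNetConnect(comm, handles, nranks, rank, listenComm, &collComm));
 *     NCCLCHECK(collNetRegMr(comm, collComm, data, bytes, type, &mhandle));
 *     NCCLCHECK(collNetIallreduce(comm, collComm, data, data, count, dataType, redOp,
 *                                 mhandle, mhandle, &request));
 *     do { NCCLCHECK(collNetTest(comm, request, &done, &size)); } while (!done);
 *     NCCLCHECK(collNetDeregMr(comm, collComm, mhandle));
 *     NCCLCHECK(collNetCloseColl(comm, collComm));
 *     NCCLCHECK(collNetCloseListen(comm, listenComm));
 *   }
 */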
#endif
/*************************************************************************
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_COLLECTIVES_H_
#define NCCL_COLLECTIVES_H_
enum ncclDevRedOp_t {
ncclDevSum, ncclDevProd, ncclDevMax, ncclDevMin,
ncclDevPreMulSum, ncclDevSumPostDiv,
ncclNumDevRedOps
};
struct ncclDevRedOpFull {
ncclDevRedOp_t op;
bool scalarArgIsPtr;
uint64_t scalarArg;
};
#define FUNC_INDEX_P2P (ncclNumTypes+NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumDevRedOps)
#define FUNC_INDEX_ALLTOALL_PIVOT (FUNC_INDEX_P2P+1)
#define FUNC_INDEX(func, devredop, ncclType, al, pr) ((((((func)*ncclNumDevRedOps + (devredop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))
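/* Worked example (illustrative; the concrete enum values are assumptions):
 * with ncclNumDevRedOps == 6, ncclNumTypes == 10 (RCCL_BFLOAT16 builds),
 * NCCL_NUM_ALGORITHMS == 6 and NCCL_NUM_PROTOCOLS == 3,
 *   FUNC_INDEX(ncclFuncAllReduce (4), ncclDevSum (0), ncclInt8 (0),
 *              NCCL_ALGO_RING (1), NCCL_PROTO_SIMPLE (2))
 *     = ((((4*6 + 0)*10 + 0)*6 + 1)*3 + 2) = 4325
 * Under the same assumptions FUNC_INDEX_P2P = 10 + 5*6*3*10*6 = 5410, i.e. it
 * indexes past the entire per-(func, op, type, algo, proto) table.
 */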
#define NCCL_FUNC_NAME(func, algo, proto, devredop, type) \
ncclFunction_##func##_##algo##_##proto##_##devredop##_##type
#define NCCL_ONERANK_REDUCE_NAME(devredop, type) \
ncclFunction_OneRankReduce_##devredop##_##type
#define NCCL_KERN_NAME(func, algo, proto, devredop, type) \
ncclKernel_##func##_##algo##_##proto##_##devredop##_##type
#define NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type) \
ncclKernelDebug_##func##_##algo##_##proto##_##devredop##_##type
#define NCCL_IMPL_NAME(func, algo, proto) \
nccl##func##algo##proto
/* Declare all collective operations */
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
#define DECL5(func, algo, proto, devredop, type) \
extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
extern __global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead);
#else
#define DECL5(func, algo, proto, devredop, type) \
extern __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
extern __global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead);
#endif
#define SINGLE_ARG(...) __VA_ARGS__
#define CONCAT(a,b) a##b
#define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(SINGLE_ARG(t), SINGLE_ARG(f))
#define MACRO_IF_0(t, f) f
#define MACRO_IF_1(t, f) t
#define DECL4(func, algo, devredop, type, undef) \
MACRO_IF(undef, /*undefined*/, DECL5(func, algo, SIMPLE, devredop, type)) \
MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL, devredop, type)) \
MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL128, devredop, type))
#define DECL3(func, devredop, type, undef) \
DECL4(func, RING, devredop, type, undef) \
DECL4(func, TREE, devredop, type, undef) \
DECL4(func, COLLNET_DIRECT, devredop, type, undef) \
DECL4(func, COLLNET_CHAIN, devredop, type, undef) \
DECL4(func, NVLS, devredop, type, undef) \
DECL4(func, NVLS_TREE, devredop, type, undef)
#if defined(RCCL_BFLOAT16)
#define DECL2(func, devredop, undefForFloat) \
DECL3(func, devredop, int8_t, /*undef=*/0) \
DECL3(func, devredop, uint8_t, /*undef=*/0) \
DECL3(func, devredop, int32_t, /*undef=*/0) \
DECL3(func, devredop, uint32_t, /*undef=*/0) \
DECL3(func, devredop, int64_t, /*undef=*/0) \
DECL3(func, devredop, uint64_t, /*undef=*/0) \
DECL3(func, devredop, half, /*undef=*/undefForFloat) \
DECL3(func, devredop, float, /*undef=*/undefForFloat) \
DECL3(func, devredop, double, /*undef=*/undefForFloat) \
DECL3(func, devredop, rccl_bfloat16, /*undef=*/undefForFloat)
#else
#define DECL2(func, devredop, undefForFloat) \
DECL3(func, devredop, int8_t, /*undef=*/0) \
DECL3(func, devredop, uint8_t, /*undef=*/0) \
DECL3(func, devredop, int32_t, /*undef=*/0) \
DECL3(func, devredop, uint32_t, /*undef=*/0) \
DECL3(func, devredop, int64_t, /*undef=*/0) \
DECL3(func, devredop, uint64_t, /*undef=*/0) \
DECL3(func, devredop, half, /*undef=*/undefForFloat) \
DECL3(func, devredop, float, /*undef=*/undefForFloat) \
DECL3(func, devredop, double, /*undef=*/undefForFloat)
#endif
#define DECL(func) \
DECL2(func, Sum, /*undefForFloat=*/0) \
DECL2(func, Prod, /*undefForFloat=*/0) \
DECL2(func, Min, /*undefForFloat=*/0) \
DECL2(func, Max, /*undefForFloat=*/0) \
DECL2(func, PreMulSum, /*undefForFloat=*/0) \
DECL2(func, SumPostDiv, /*undefForFloat=*/1)
DECL2(Broadcast, Sum, /*undefForFloat=*/0)
DECL(Reduce)
DECL2(AllGather, Sum, /*undefForFloat=*/0)
DECL(ReduceScatter)
DECL(AllReduce)
DECL5(SendRecv, RING, SIMPLE, Sum, int8_t)
DECL5(AllToAllPivot, RING, SIMPLE, Sum, int8_t)
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t)();
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t)();
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t)();
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t)();
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t)();
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t)();
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, half)();
#if defined(RCCL_BFLOAT16)
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, rccl_bfloat16)();
#endif
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, float)();
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)();
// CHUNKSIZE must be a multiple of SLICESIZE
#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2)
#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4)
#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2)
#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4)
#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2)
#define BROADCAST_SLICESTEPS 1
#define BROADCAST_CHUNKSTEPS 1
#define REDUCE_SLICESTEPS 1
#define REDUCE_CHUNKSTEPS 1
#define NCCL_MAX_SLICE_PER_CHUNK 2 // max value of CHUNKSTEPS/SLICESTEPS; must be consistent with the values above
#define ALLTOALL_PIVOT_SLICESTEPS 2
#define ALLTOALL_PIVOT_CHUNKSTEPS 4
// We can't use the enum identifiers like ncclSum, ncclFloat, etc since this
// macro will be used in preprocessor conditionals where enums have no meaning.
#define NCCL_NVLS_SUPPORTS(/*ncclDataType_t*/ type, /*ncclDevRedOp_t*/ red) \
(((type==2 || type==3) && (red==0 || red==2 || red==3)) || \
((type==4 || type==5) && (red==0 || red==2 || red==3)) || \
((type==6 || type==9) && (red==0 || red==2 || red==3)) || \
(type==7 && red==0) || \
(type==8 && red==0))
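/* For reference (an assumption based on the usual ncclDataType_t / ncclDevRedOp_t
 * ordering, since this macro must use raw numbers in preprocessor context):
 *   type: 2=int32, 3=uint32, 4=int64, 5=uint64, 6=half, 7=float, 8=double, 9=bfloat16
 *   red : 0=ncclDevSum, 2=ncclDevMax, 3=ncclDevMin
 * i.e. NVLS reductions cover sum/min/max on 32/64-bit integers, half and bfloat16,
 * and sum only on float and double.
 */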
#endif
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_COMM_H_
#define NCCL_COMM_H_
#include "transport.h"
#include "p2p.h"
#include "collectives.h"
#include "proxy.h"
#include "strongstream.h"
#include <map>
#include <chrono>
#if defined (ENABLE_TIMELINE)
#include "timeline/timeline.h"
#endif
#ifdef HYGON_SDMA_FEATURE
#include "hsa_ext_amd.h"
#include "hsa_extra.h"
#define RCCL_SDMA_QUEUE_NUM 8
#define RCCL_SDMA_QUEUE_DEPTH 64*4096
#endif
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#define HIPRT_CB
#else
#if CUDART_VERSION < 9000
struct cudaLaunchParams {
void *func;
dim3 gridDim;
dim3 blockDim;
void **args;
size_t sharedMem;
cudaStream_t stream;
};
#endif
#endif
#define CACHE_LINE_SIZE 64
#define MEM_ALIGN 4096
#define CUDA_IPC_MIN 2097152UL
// Channels / LL tuning
#define NCCL_LL_THREAD_THRESHOLD 8
#define NCCL_LL128_THREAD_THRESHOLD 8
#define NCCL_SIMPLE_THREAD_THRESHOLD 64
struct ncclSendMem {
union {
struct {
uint64_t head;
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
void* ptrExchange;
uint64_t redOpArgExchange[2];
char pad2[CACHE_LINE_SIZE-sizeof(void*)-2*sizeof(uint64_t)];
int offsFifo[NCCL_STEPS];
};
char pad3[MEM_ALIGN];
};
};
struct ncclRecvMem {
union {
struct {
uint64_t tail;
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
int sizesFifo[NCCL_STEPS];
int offsFifo[NCCL_STEPS];
int flush; // For GDRCopy-based flush
};
char pad4[MEM_ALIGN];
};
};
enum helperThreadState {ThreadStart, ThreadStop};
#define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_LOCAL_RANKS*NCCL_MAX_OPS)
struct ncclGraphHelperResources {
ncclComm* comm;
pthread_mutex_t threadLock;
pthread_cond_t threadCond;
enum helperThreadState threadState;
void* ipcBases[NCCL_IPC_POOL_SIZE];
int ipcTail;
int ipcHead;
};
struct ncclUserRedOp {
int freeNext; // -1=allocated, otherwise index of next free entry in array
ncclDataType_t datatype;
ncclDevRedOpFull opFull;
};
struct ncclNodeRanks {
int localRanks;
int* localRankToRank;
};
struct ncclDestructor {
struct ncclDestructor* next;
void* obj;
ncclResult_t(*fn)(struct ncclDestructor* me);
};
struct ncclCommCallback {
struct ncclCommCallback* next;
ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb);
};
struct ncclSharedResources {
int refCount;
struct ncclComm* owner; /* comm which creates this shared res. */
struct ncclChannelPeer* peers[MAXCHANNELS];
struct ncclDevChannelPeer* devPeers[MAXCHANNELS];
/* P2P operation counter, one per channel */
uint64_t p2pOpCount[MAXCHANNELS];
/* Collective operation counter */
uint64_t collOpCount;
int tpNRanks;
int tpNLocalRanks;
int tpNChannels;
int tpP2pNChannels;
int tpP2pChunkSize;
uint64_t magic;
// top parent rank to localRank translation table
int* tpRankToLocalRank;
// Internal streams
struct ncclStrongStream deviceStream, hostStream;
/* proxy related shared res */
struct ncclProxyState* proxyState;
};
struct ncclChannel {
struct ncclChannelPeer** peers;
struct ncclDevChannelPeer** devPeers;
struct ncclRing ring;
int* devRingUserRanks;
struct ncclTree tree;
struct ncclTree collnetChain;
struct ncclDirect collnetDirect;
struct ncclTree binTree;
struct ncclNvls nvls;
int id; // index of this channel
uint32_t workFifoSent; // last used work index+1
/* comm split sharable resources */
struct ncclChannelPeer* collnetPeers;
struct ncclDevChannelPeer* collnetDevPeers;
struct ncclChannelPeer* nvlsPeers;
struct ncclDevChannelPeer* nvlsDevPeers;
#ifdef HYGON_SDMA_FEATURE
struct sdmaQueueInfo sdmaQueue;
#endif
/* When using the mixedHylinkShm function, this specifies the channel's transport type. */
int transportType;
};
struct ncclWorkList {
struct ncclWorkList* next;
struct ncclWork work;
};
struct ncclPointerList {
struct ncclPointerList* next;
void *ptr;
};
struct ncclKernelPlan {
// A kernel plan is also a callback that reclaims itself. Hence this must
// be the first member.
struct ncclCommCallback reclaimer;
struct ncclMemoryPool memPool_ncclProxyOp; // memory to return to comm in cleanup
struct ncclComm* comm;
struct ncclKernelPlan* next;
bool persistent; // aka captured in a graph
bool kernelSpecialized;
void *kernelFn;
int channelUbound; // only channels c < channelUbound are present
int channelCount; // number of channels present
uint64_t channelMask; // which channels are present, channelCount == popcount(channelMask)
bool hasProxyOps; // does any channel have a non-empty proxyOpQueue
int threadPerBlock;
// workHeap fields are null until uploadWorkFifo() or preparePersistentKernel()
struct ncclWork* workHead;
int collOpCount; // zero based for this plan
struct ncclIntruQueue<struct ncclPointerList, &ncclPointerList::next> ipcMemQueue;
struct Channel {
int nWork;
union {
int nWorkElem; // used for coll and reg coll
int p2pTailElem[2]; // used for p2p, indexed by ncclWorkElemP2pType-1
};
size_t collBytes;
struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> workQueue;
struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue;
} channels[MAXCHANNELS];
};
struct ncclComm {
ncclFunc_t func_coll;
#if defined (ENABLE_TIMELINE)
struct ncclInfo *info;
Timeline * timeline;
#endif
struct ncclMemoryStack memPermanent, memScoped;
// List of destructors to run when comm is destructed
struct ncclDestructor* destructorHead;
struct ncclSharedResources* sharedRes;
/* map to top parent ranks. */
int* topParentRanks;
int* topParentLocalRanks;
struct ncclChannel channels[MAXCHANNELS];
struct ncclPeerInfo* peerInfo;
struct ncclTopoSystem* topo;
#ifdef HYGON_SDMA_FEATURE
bool sdmaCopyEnabe;
bool sdmaCountEnabe;
bool validHsaAgent;
uint32_t sdmaMinCopySize;
hsa_agent_t hsaAgent;
hsa_sdma_group_queue_t sdmaGroupQueue;
#endif
ncclNet_t* ncclNet;
ncclCollNet_t* ncclCollNet;
void* bootstrap;
// Bitmasks for ncclTransportP2pSetup
uint64_t* connectSend;
uint64_t* connectRecv;
uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
uint64_t commHash;
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index
//int nvmlDev; // my nvml device index
int compCap; // compute capability of the GPU
int minCompCap, maxCompCap; // min/max compute capability in the communicator
int64_t busId; // my PCI bus ID in int format
cpu_set_t cpuAffinity; // CPU affinity of the GPU
int WarpSize;
int cudaArch; // matches __CUDA_ARCH__ of device
int node;
int nNodes;
int localRank;
int localRanks;
int maxLocalRanks;
int* rankToNode;
int* rankToLocalRank;
int* localRankToRank;
// localRanks and localRanktoRank for all nodes
struct ncclNodeRanks* nodeRanks;
bool checkPointers;
bool dmaBufSupport;
// Counter for tracking CUDA launches (P2P and collectives included)
uint64_t opCount;
// Channels for collectives
int nChannels;
int nvlsChannels;
int collNetChannels;
// Channels (per peer) for p2p
int p2pnChannels;
int p2pnChannelsPerPeer;
int p2pChannels[MAXCHANNELS];
// Channels for mixed
int mixedTransportType;
int nMixedHylinkChannels;
// Should this comm allocate LL buffers for network P2P connections?
bool allocP2pNetLLBuffers;
// Buffer sizes
int buffSizes[NCCL_NUM_PROTOCOLS];
int p2pChunkSize;
// Algorithm/Protocols thresholds
ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
/* This attribute can indicate the states of communicators and return code of
* asynchronous NCCL operations. */
ncclResult_t asyncResult;
// Flag to ask NCCL kernels to abort
volatile uint32_t *abortFlag;
volatile uint32_t *childAbortFlag;
uint32_t *abortFlagRefCount;
// Flags for enable P2P NET
uint32_t p2pNet;
uint32_t useIntraNet;
bool hasFineGrain;
// Device side of the communicator (for cudaFree's)
struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm
// Operation pool.
int workFifoDepth; // size of workFifoHeap[], power of 2
struct ncclWork* workFifoHeap;
struct ncclWork* devWorkFifoHeap;
void* workFifoHeapGdrHandle;
// Work completion notification
uint32_t* workFifoDone/*[MAXCHANNELS]*/; // in cudaHost memory
uint32_t workFifoSent; // Monotonic (mod 1<<32) index of next unused fifo slot.
uint32_t workFifoAckdMin; // Monotonic index of least unprocessed fifo slot over all channels.
// Intra-process sync
struct ncclComm* intraComm0; // leader of intra-process comms (self possible)
struct ncclComm* intraNext; // next of intra-process comms, intraComm0 is head
int intraRank;
int intraRanks;
uint32_t intraBarrierPhase;
char intraPad1[64 - sizeof(uint64_t)];
uint64_t intraBarrierCounter; // only used if this is intraComm0
char intraPad2[64 - sizeof(uint64_t)];
uint64_t intraBarrierGate; // only used if this is intraComm0
struct ncclProxyState* proxyState;
int proxyRefCountOld; /* store proxy post-atomic-sub refcount */
// Whether this communicator uses collNet
int collNetSupport;
uint8_t collNetSupportMatrix[4/*sum,prod,min,max*/][ncclNumTypes];
int intraHighestTransportType;
int* collNetHeads;
int collNetHeadsNum;
/* sharable collNet proxy progress resource. */
struct ncclCollNetSharedRes* collNetSharedRes;
// NVLink SHARP (NVLS) support
int nvlsSupport;
/* sharable NVLS resource. */
struct ncclNvlsSharedRes* nvlsResources;
size_t channelSize; // User requested work size (bytes) for channel partitions
// pools backed by comm->memPermanent
struct ncclMemoryPool memPool_ncclProxyOp;
struct ncclMemoryPool memPool_ncclKernelPlan;
struct ncclMemoryPool memPool_ncclPointerList;
// Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when
// this comm is not yet in a group.
struct ncclComm* groupNext;
// Subset of those in groupNext list. Holds 0x1 if not needing preconnect.
struct ncclComm* preconnectNext;
int persistentRefs; // number of persistent plan-lists capturing this comm
struct ncclTasks tasks;
hipStream_t sideStream; // [RCCL] Cached non-captured stream
// user-created reduction ops
int userRedOpCapacity, userRedOpFreeHead;
ncclUserRedOp *userRedOps;
// Queue of things for the main thread to do
struct ncclIntruQueueMpsc<struct ncclCommCallback, &ncclCommCallback::next> callbackQueue;
// List of kernel plans built from tasks.
struct ncclIntruQueue<struct ncclKernelPlan, &ncclKernelPlan::next> planQueue;
// First of the unlaunched kernels in `planQueue`
struct ncclKernelPlan* unlaunchedPlansHead;
hipEvent_t doneEvent;
hipStream_t lastStream;
#ifdef ENABLE_COLLTRACE
struct ncclCollTrace* collTrace;
union ncclCollTraceTail *collTraceTail;
pthread_t collTraceThread;
volatile bool collTraceExit;
#endif
ncclConfig_t config;
// initState makes it easier to reclaim resources when errors happen.
ncclResult_t initState;
// flag to indicate if ncclCommFinalize() is called
bool finalizeCalled;
// shared structures for finalization
int finalizeRankCnt;
// Whether this comm is compatible with MSCCL
bool mscclCompatible;
// Runtime tuner for algorithm and protocol selection
struct {
bool enabled; // Whether tuning is enabled
std::map<uint64_t, int>* workloadCache; // workload hash -> best config index
// Current testing configuration
int currentAlgo; // NCCL_ALGO_RING/TREE/COLLNET
int currentProto; // NCCL_PROTO_SIMPLE/LL/LL128
// Performance tracking
float bestTime;
int bestAlgo;
int bestProto;
// Search state
int searchStep; // Current search step
bool isSearching; // Whether currently searching
uint64_t currentWorkloadHash; // Current workload being tuned
} tuner;
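/* Illustrative use of the fields above (a sketch of intent, not the project's
 * actual implementation): a hash of the current workload keys workloadCache;
 * while isSearching, each searchStep tries one (currentAlgo, currentProto) pair,
 * records its time, keeps the best seen in (bestAlgo, bestProto, bestTime), and
 * finally caches the winner, e.g.:
 *   (*tuner.workloadCache)[tuner.currentWorkloadHash] = bestConfigIndex;
 * where bestConfigIndex is some encoding of (bestAlgo, bestProto).
 */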
};
enum ncclLaunchMode {
ncclLaunchModeInvalid=0,
ncclLaunchModeParallel,
ncclLaunchModeGroup
};
extern enum ncclLaunchMode ncclParamLaunchMode;
void ncclCommPushFree(struct ncclComm* comm, void* buf);
void ncclCommPushCudaFree(struct ncclComm* comm, void* buf);
void ncclCommPushCudaHostFree(struct ncclComm* comm, void* buf);
void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle);
inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool waitSome) {
ncclResult_t result = ncclSuccess;
struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, waitSome);
while (cb != nullptr) {
struct ncclCommCallback* next = cb->next;
ncclResult_t res1 = cb->fn(comm, cb); // may reclaim memory of cb
if (res1 != ncclSuccess) result = res1;
cb = next;
}
NCCLCHECK(result);
return ncclSuccess;
}
inline void ncclCommIntraBarrierIn(struct ncclComm* comm, uint32_t x) {
int phase = comm->intraBarrierPhase;
if (comm->intraRanks == 1) {
// Release everyone (just me).
comm->intraBarrierGate = (uint64_t(x)<<32) | (phase^1);
} else {
struct ncclComm* comm0 = comm->intraComm0;
uint64_t count = __atomic_add_fetch(&comm0->intraBarrierCounter, (uint64_t(x)<<32) + 1, __ATOMIC_RELEASE);
if (uint32_t(count) == uint32_t(comm->intraRanks)) {
// Reset.
__atomic_store_n(&comm0->intraBarrierCounter, 0, __ATOMIC_RELAXED);
// Release everyone.
__atomic_store_n(&comm0->intraBarrierGate, (count>>32<<32) | (phase^1), __ATOMIC_RELEASE);
}
}
}
// returns sum of x values contributed to ncclCommIntraBarrierIn(comm, x)
inline uint32_t ncclCommIntraBarrierOut(struct ncclComm* comm) {
struct ncclComm* comm0 = comm->intraComm0;
comm->intraBarrierPhase ^= 1;
uint32_t phase = comm->intraBarrierPhase;
uint64_t gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED);
if ((gate & 1) != phase) {
uint64_t t0 = clockNano();
do {
// Spin vigorously for first 5us.
if (clockNano()-t0 >= 5*1000) sched_yield();
gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED);
} while ((gate & 1) != phase);
}
if (comm->intraRanks != 1) __atomic_thread_fence(__ATOMIC_ACQUIRE);
return gate>>32;
}
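/* Illustrative pairing (a sketch): every rank of the intra-process group calls
 * In() with a contribution and then Out(), which releases once all ranks have
 * arrived and returns the sum of the contributions:
 *   ncclCommIntraBarrierIn(comm, 1);
 *   uint32_t arrived = ncclCommIntraBarrierOut(comm);   // == comm->intraRanks here
 */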
// Scrambles the bits of non-builtin values of ncclRedOp_t according to the
// communicator memory address. Used to catch bugs so that integer handles
// associated with this communicator won't collide with handles of other
// communicators. This function is its own inverse.
static inline ncclRedOp_t ncclUserRedOpMangle(ncclComm *comm, ncclRedOp_t op) {
// Preserve the built-in values.
if(int(op) < int(ncclNumOps))
return op;
uint64_t h = reinterpret_cast<uint64_t>(comm);
h ^= h >> 32;
h *= 0x9e3779b97f4a7c13u; // Knuth's 64-bit magical hash constant
h >>= 32; // h is now an excellent 32-bit hash of the comm pointer
h &= int(ncclMaxRedOp); // ncclMaxRedOp is a power of 2 minus 1
int op1 = int(h) ^ int(op);
// Since builtin values are preserved, we also have to preserve their preimage.
return op1 < int(ncclNumOps) ? op : ncclRedOp_t(op1);
}
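/* Illustrative property (follows from the code above): mangling is an involution,
 * so applying it twice with the same comm recovers the original handle:
 *   ncclRedOp_t mangled = ncclUserRedOpMangle(comm, op);
 *   assert(ncclUserRedOpMangle(comm, mangled) == op);
 * Built-in ops (op < ncclNumOps) pass through unchanged in both directions.
 */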
ncclResult_t ncclCommEnsureReady(ncclComm_t comm);
ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState);
#endif
/*************************************************************************
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_CORE_H_
#define NCCL_CORE_H_
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdint.h>
#include <algorithm> // For std::min/std::max
#include "nccl.h"
#ifdef PROFAPI
#define NCCL_API(ret, func, args...) \
__attribute__ ((visibility("default"))) \
__attribute__ ((alias(#func))) \
ret p##func (args); \
extern "C" \
__attribute__ ((visibility("default"))) \
__attribute__ ((weak)) \
ret func(args)
#else
#define NCCL_API(ret, func, args...) \
extern "C" \
__attribute__ ((visibility("default"))) \
ret func(args)
#endif // end PROFAPI
static __inline__ int ncclTypeSize(ncclDataType_t type) {
switch (type) {
case ncclInt8:
case ncclUint8:
return 1;
case ncclFloat16:
#if defined(RCCL_BFLOAT16)
case ncclBfloat16:
#endif
return 2;
case ncclInt32:
case ncclUint32:
case ncclFloat32:
return 4;
case ncclInt64:
case ncclUint64:
case ncclFloat64:
return 8;
default:
return -1;
}
}
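/* Illustrative usage (a sketch; count and datatype are placeholders): convert an
 * element count into bytes, guarding against the -1 returned for unknown types:
 *   int esize = ncclTypeSize(datatype);
 *   size_t nBytes = (esize > 0) ? count * (size_t)esize : 0;
 */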
#include "debug.h"
#include "checks.h"
#include "rocmwrap.h"
#include "alloc.h"
#include "utils.h"
#include "param.h"
#include "nvtx_stub.h"
#endif // end include guard
/*************************************************************************
* Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_CPUSET_H_
#define NCCL_CPUSET_H_
// Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t
static int hexToInt(char c) {
int v = c - '0';
if (v < 0) return -1;
if (v > 9) v = 10 + c - 'a';
if ((v < 0) || (v > 15)) return -1;
return v;
}
#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t))
static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
uint32_t cpumasks[CPU_SET_N_U32];
int m = CPU_SET_N_U32-1;
cpumasks[m] = 0;
for (int o=0; o<strlen(str); o++) {
char c = str[o];
if (c == ',') {
m--;
cpumasks[m] = 0;
} else {
int v = hexToInt(c);
if (v == -1) break;
cpumasks[m] <<= 4;
cpumasks[m] += v;
}
}
// Copy cpumasks to mask
for (int a=0; m<CPU_SET_N_U32; a++,m++) {
memcpy(((uint32_t*)mask)+a, cpumasks+m, sizeof(uint32_t));
}
return ncclSuccess;
}
static ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
int c = 0;
uint8_t* m8 = (uint8_t*)mask;
for (int o=sizeof(cpu_set_t)-1; o>=0; o--) {
if (c == 0 && m8[o] == 0) continue;
sprintf(str+c, "%02x", m8[o]);
c+=2;
if (o && o%4 == 0) {
sprintf(str+c, ",");
c++;
}
}
str[c] = '\0';
return ncclSuccess;
}
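/* Illustrative round trip (a sketch), using the example string from the comment at
 * the top of this file. Note ncclStrToCpuset only writes the words it parsed, so
 * the mask is zeroed first; leading zero bytes are dropped when converting back:
 *   cpu_set_t mask; char buf[2*sizeof(cpu_set_t) + CPU_SET_N_U32];
 *   memset(&mask, 0, sizeof(mask));
 *   NCCLCHECK(ncclStrToCpuset("0003ff,f0003fff", &mask));
 *   NCCLCHECK(ncclCpusetToStr(&mask, buf));   // buf == "03ff,f0003fff"
 */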
#endif
/*************************************************************************
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_CUDAWRAP_H_
#define NCCL_CUDAWRAP_H_
#include <cuda.h>
#include <cuda_runtime.h>
#include "checks.h"
// Is cuMem API usage enabled
extern int ncclCuMemEnable();
#if CUDART_VERSION >= 11030
#include <cudaTypedefs.h>
#else
typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion);
typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags);
#endif
#define CUPFN(symbol) pfn_##symbol
// Check CUDA PFN driver calls
#define CUCHECK(cmd) do { \
CUresult err = pfn_##cmd; \
if( err != CUDA_SUCCESS ) { \
const char *errStr; \
(void) pfn_cuGetErrorString(err, &errStr); \
WARN("Cuda failure '%s'", errStr); \
return ncclUnhandledCudaError; \
} \
} while(false)
#define CUCHECKGOTO(cmd, res, label) do { \
CUresult err = pfn_##cmd; \
if( err != CUDA_SUCCESS ) { \
const char *errStr; \
(void) pfn_cuGetErrorString(err, &errStr); \
WARN("Cuda failure '%s'", errStr); \
res = ncclUnhandledCudaError; \
goto label; \
} \
} while(false)
// Report failure but clear error and continue
#define CUCHECKIGNORE(cmd) do { \
CUresult err = pfn_##cmd; \
if( err != CUDA_SUCCESS ) { \
const char *errStr; \
(void) pfn_cuGetErrorString(err, &errStr); \
INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, errStr); \
} \
} while(false)
#define CUCHECKTHREAD(cmd, args) do { \
CUresult err = pfn_##cmd; \
if (err != CUDA_SUCCESS) { \
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, err); \
args->ret = ncclUnhandledCudaError; \
return args; \
} \
} while(0)
#define DECLARE_CUDA_PFN_EXTERN(symbol,version) extern PFN_##symbol##_v##version pfn_##symbol
#if CUDART_VERSION >= 11030
/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
DECLARE_CUDA_PFN_EXTERN(cuDeviceGet, 2000);
DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute, 2000);
DECLARE_CUDA_PFN_EXTERN(cuGetErrorString, 6000);
DECLARE_CUDA_PFN_EXTERN(cuGetErrorName, 6000);
DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020);
DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 3020);
DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice, 2000);
// cuMem API support
DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemCreate, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle, 11000);
DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020);
#if CUDA_VERSION >= 11070
DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
#endif
#if CUDA_VERSION >= 12010
/* NVSwitch Multicast support */
DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind, 12010);
#endif
#endif
/* CUDA Driver functions loaded with dlsym() */
DECLARE_CUDA_PFN_EXTERN(cuInit, 2000);
DECLARE_CUDA_PFN_EXTERN(cuDriverGetVersion, 2020);
DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress, 11030);
ncclResult_t ncclCudaLibraryInit(void);
extern int ncclCudaDriverVersionCache;
extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit()
inline ncclResult_t ncclCudaDriverVersion(int* driver) {
int version = __atomic_load_n(&ncclCudaDriverVersionCache, __ATOMIC_RELAXED);
if (version == -1) {
CUDACHECK(cudaDriverGetVersion(&version));
__atomic_store_n(&ncclCudaDriverVersionCache, version, __ATOMIC_RELAXED);
}
*driver = version;
return ncclSuccess;
}
#endif
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_DEBUG_H_
#define NCCL_DEBUG_H_
#include "nccl_net.h"
#include <stdio.h>
#include <chrono>
#include <type_traits>
#include <limits.h>
#include <string.h>
#include <pthread.h>
// Conform to pthread and NVTX standard
#define NCCL_THREAD_NAMELEN 16
extern int ncclDebugLevel;
extern uint64_t ncclDebugMask;
extern pthread_mutex_t ncclDebugLock;
extern FILE *ncclDebugFile;
extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6)));
// Let code temporarily downgrade WARN into INFO
extern thread_local int ncclDebugNoWarn;
extern char ncclLastError[];
#define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
#define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
#define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__)
#ifdef ENABLE_TRACE
#define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__)
extern std::chrono::steady_clock::time_point ncclEpoch;
#else
#define TRACE(...)
#endif
void ncclSetThreadName(pthread_t thread, const char *fmt, ...);
#endif
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_DEVICE_H_
#define NCCL_DEVICE_H_
#include "nccl.h"
#include "rccl_bfloat16.h"
#include "align.h"
#if defined(ENABLE_NPKIT)
#include "npkit/npkit_struct.h"
#endif
#if defined (ENABLE_TIMELINE)
#include "timeline/timeline.h"
#endif
#include <stdint.h>
#ifdef HYGON_SDMA_FEATURE
#include "hsa_ext_amd.h"
#include "hsa_extra.h"
#define PRINT_ERR(...)
#define PRINT_INFO(...)
#define PRINT_INFOM(...)
#define PRINT_INFOT(tid, ...)
#define PRINT_DEBUG(...)
#else
#define PRINT_ERR(...)
#define PRINT_INFO(...)
#define PRINT_INFOM(...)
#define PRINT_INFOT(tid, ...)
#define PRINT_DEBUG(...)
#endif
#if defined(ENABLE_NPKIT) && defined(HYGON_SDMA_FEATURE)
#define NPKIT_SET_GPU_EVENT(event, size, cost) \
NpKit::CollectGpuEvent(event, size, cost, NPKIT_GET_GPU_TIMESTAMP(), ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
#define NPKIT_SET_GPU_EVENT_TM(event, size, cost, tm) \
NpKit::CollectGpuEvent(event, size, cost, tm, ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
#else
#define NPKIT_SET_GPU_EVENT(event, size, cost)
#define NPKIT_SET_GPU_EVENT_TM(event, size, cost, tm)
#endif
#ifdef HYGON_SDMA_FEATURE
#define INIT_PRIMS_SDMA(prims, args) \
{ \
prims.useSdmaCopy = args->useSdma; \
prims.sdmaMinCopySize = ncclShmem.channel.sdmaQueue.minCopySize; \
prims.sdmaCountEnabe = ncclShmem.channel.sdmaQueue.copyCountEnabe; \
prims.sdmaCopyCount = 0; \
prims.allCopyCount = 0; \
}
#endif
#define NCCL_NUM_FUNCTIONS 5 // SendRecv and AllToAllPivot not included for now
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclFuncAllToAllPivot, ncclNumFuncs} ncclFunc_t;
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+2];
#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
#define NCCL_ALGO_TREE 0
#define NCCL_ALGO_RING 1
#define NCCL_ALGO_COLLNET_DIRECT 2
#define NCCL_ALGO_COLLNET_CHAIN 3
#define NCCL_ALGO_NVLS 4
#define NCCL_ALGO_NVLS_TREE 5
extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
#define NCCL_PROTO_LL 0
#define NCCL_PROTO_LL128 1
#define NCCL_PROTO_SIMPLE 2
extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS];
#define NCCL_MAX_OPS 2048
#define NCCL_STEPS 8
union ncclLLFifoLine {
/* Flags have to be *after* data, because otherwise, an incomplete receive
from the network may receive the flag but not the data.
Note this is assuming that either we receive contiguous chunks of data
(sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
struct {
uint32_t data1;
uint32_t flag1;
uint32_t data2;
uint32_t flag2;
};
uint64_t v[2];
int4 i4;
};
#define WARP_SIZE warpSize
#define MAXCHANNELS 32
#define NCCL_MAX_NTHREADS 256
#define NCCL_SIMPLE_MAX_NTHREADS NCCL_MAX_NTHREADS
#define NCCL_LL_MAX_NTHREADS NCCL_MAX_NTHREADS
#define NCCL_LL_LINES_PER_THREAD 8
#ifdef TEST_LL_CLEANUP
#define NCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup
#define NCCL_LL_FLAG_MAX 0x100
#define NCCL_LL_FLAG(a) ((uint32_t)((a) % NCCL_LL_FLAG_MAX))
#else
#define NCCL_LL_CLEAN_MASK 0x7ffffff8
#define NCCL_LL_FLAG(a) ((uint32_t)(a))
#endif
// Make sure the clean mask will last for at least NCCL_STEPS
static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");
#define NCCL_LL128_LINESIZE 64
#define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t))
#define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1)
#define NCCL_LL128_MAX_NTHREADS 256
#define NCCL_LL128_ELEMS_PER_THREAD 28
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 4
#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
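/* Worked numbers for the constants above: each 64-byte LL128 line holds
 * NCCL_LL128_LINEELEMS = 64/8 = 8 uint64_t slots, of which NCCL_LL128_DATAELEMS = 7
 * carry data and 1 is reserved for the flag, i.e. 7/8 (87.5%) of the line is payload.
 * The per-block shmem staging area is NCCL_LL128_SHMEM_SIZE = 4 * 256 = 1024
 * uint64_t elements.
 */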
#define NCCL_DIRECT_WRITE 0x01
#define NCCL_DIRECT_READ 0x02
#define NCCL_DIRECT_NIC 0x04
#define NCCL_IPC_WRITE 0x08
#define NCCL_IPC_READ 0x10
#define NCCL_NVLS_MIN_POLL 0x20
struct ncclConnInfo {
// Regular comm mechanism
char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
uint64_t *tail; // Local for recv, remote for send
uint64_t *head; // Local for send, remote for recv
int flags; // Direct communication / other flags
int shared; // Buffers are shared
void **ptrExchange; // Pointer exchange for direct communication
uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case
int *sizesFifo; // Sizes fifo from GPU to proxy
int *offsFifo; // Buffer fifo from proxy to GPU
uint64_t step; // Keep where we are
uint64_t llLastCleaning;
// GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
// allows software to explicitly initiate a flush read to HDP memory. See more
// descriptions in primitives.h.
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
uint32_t* curr_hdp_reg; // Current GPU's HDP register
};
struct ncclProxyConnector {
int tpRank;
int tpLocalRank;
int sameProcess;
struct ncclProxyConnection* connection;
};
struct ncclConnector {
int connected;
struct ncclProxyConnector proxyConn;
struct ncclTransportComm* transportComm;
void* transportResources;
struct ncclConnInfo conn;
};
struct ncclRing {
// Shortcuts for userRanks[1] and userRanks[n-1]
int prev;
int next;
// Maps an internal nccl index to user-specified rank order. This is necessary
// since we need to know how the user expects data to be ordered across
// devices. Ordered from current device.
int* userRanks;
int index; // This rank's index in the ring
};
// The root of each tree only has one node down (+1 intra-node).
#define NCCL_MAX_TREE_ARITY_TOP 2
// Nodes inside the binary tree can have up to two nodes down (+1 intra-node).
#define NCCL_MAX_TREE_ARITY 3
struct ncclTree {
int depth;
int up;
int down[NCCL_MAX_TREE_ARITY];
};
#define NCCL_MAX_DIRECT_ARITY 7
struct ncclDirect {
int depth;
int out;
int nHeads; // Number of parallel N<->1<->net operations; also the size of up/down
int headRank; // My index in 0..nHeads-1 if I am a head rank, -1 otherwise (no local NIC)
int shift; // Shuffling of send/recv for scatter/gather operations, basically localRank%nHeads
int up[NCCL_MAX_DIRECT_ARITY];
int down[NCCL_MAX_DIRECT_ARITY];
};
#define NCCL_CONN_IDX_P2P_NET 2
#define NCCL_MAX_NVLS_ARITY 8
#define NCCL_MAX_NVLS_TREE_ARITY 3
struct ncclNvls {
int out;
int nHeads; // Number of parallel N<->1<->net operations; also the size of up/down
int headRank; // My index in 0..nHeads-1 if I am a head rank, -1 otherwise (no local NIC)
int up[NCCL_MAX_NVLS_ARITY];
int down;
int treeUp;
int treeDown[NCCL_MAX_NVLS_TREE_ARITY];
int node;
int nNodes;
};
#define NCCL_MAX_CONNS 3
struct ncclChannelPeer {
struct ncclConnector send[NCCL_MAX_CONNS];
struct ncclConnector recv[NCCL_MAX_CONNS];
int refCount;
};
struct ncclDevComm;
#pragma pack(push) /* push current alignment to stack */
#pragma pack(8) /* set alignment to 8 bytes boundary */
/* ncclWork is to be a power of two, currently 8x64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of ncclWorkElem. */
#define NCCL_WORK_SIZE 256
enum ncclWorkType : uint8_t {
ncclWorkTypeUnused=0,
ncclWorkTypeColl=1,
ncclWorkTypeP2p=2,
ncclWorkTypeRegColl=3
};
enum ncclWorkP2PType : uint8_t {
ncclWorkP2pTypeUnused=0,
ncclWorkP2pTypeSend,
ncclWorkP2pTypeRecv
};
struct ncclWorkHeader {
union {
int32_t workNext; // when isLast=0: Offset from kernel argument workHead
uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back.
};
uint16_t funcIndex;
uint8_t isLast:1; // last work for this kernel
uint8_t inFifo:1; // is this work in the fifo
enum ncclWorkType type;
};
struct ncclWorkElem {
union {
uint8_t flagBits;
struct {
uint8_t isUsed:1, redOpArgIsPtr:1, regUsed:1, nWarps:5;
};
};
uint8_t direct;
uint8_t bid;
uint8_t nChannels;
struct {
uint32_t root:28;
uint32_t useSdma:2;
uint32_t connIndex:2;
};
const void * sendbuff;
void * recvbuff;
size_t count;
union {
size_t lastChunkSize;
// Pivot A2A kernel computes chunk size itself.
// Instead, it needs the number of bidirectional rings.
size_t pivotA2ANumBiRings;
};
uint64_t redOpArg;
uint64_t opCount;
};
static_assert((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElem)))/sizeof(ncclWorkElem) == 4, "Sanity check: NCCL_MAX_WORK_ELEMENTS == 4");
#define NCCL_MAX_WORK_ELEMENTS 1
struct ncclWorkElemP2p {
struct {
int32_t peer:26;
uint32_t useSdma:2;
uint32_t connIndex:2;
int32_t proto:2;
};
union {
uint16_t flagBits;
struct {
enum ncclWorkP2PType p2pType:4;
uint16_t nWarps:4;
uint16_t warpStart:4;
uint16_t ngroups:4;
};
};
uint16_t opCount;
// Important not to use any fields with greater than 4-byte alignment since
// we need sizeof(ncclWorkElemP2p)==28, but that would be padded up to 32 if
// there were 8-byte fields.
//void* buff;
uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32;
//size_t count;
uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32;
int chunkSize;
};
static_assert(((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemP2p)))/sizeof(ncclWorkElemP2p)) == 8, "Sanity check: NCCL_MAX_WORK_ELEMENTS_P2P == 8");
#define NCCL_MAX_WORK_ELEMENTS_P2P 2
struct ncclWorkElemReg {
struct ncclWorkElem elem;
void* dnInputs[NCCL_MAX_DIRECT_ARITY+1];
void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1];
void* upOutputs[NCCL_MAX_DIRECT_ARITY+1];
};
#define NCCL_MAX_WORK_ELEMENTS_REG ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemReg)))/sizeof(ncclWorkElemReg))
static_assert(NCCL_MAX_WORK_ELEMENTS_REG == 1, "Sanity check: NCCL_MAX_WORK_ELEMENTS_REG == 1");
// Number of named barriers supported by CUDA
#define NCCL_MAX_GROUPS (NCCL_MAX_NTHREADS/WARP_SIZE)
struct ncclWork {
struct ncclWorkHeader header;
union {
char pad[NCCL_WORK_SIZE - sizeof(struct ncclWorkHeader)];
struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];
struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P];
struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG];
};
};
static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "Sanity check: sizeof(struct ncclWork) == NCCL_WORK_SIZE");
static_assert(sizeof(struct ncclWork)%16 == 0, "Sanity check: sizeof(struct ncclWork)%16 == 0");
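/* Layout arithmetic behind the asserts above (sizes are the natural packed sizes
 * under the pragma pack(8) in effect here, stated as assumptions):
 *   sizeof(ncclWorkHeader) == 8, leaving 256 - 8 = 248 payload bytes per ncclWork.
 *   Coll elements: sizeof(ncclWorkElem)    == 56            -> 248/56 = 4 fit (usage capped at 1 above).
 *   P2P  elements: sizeof(ncclWorkElemP2p) == 28            -> 248/28 = 8 fit (usage capped at 2 above).
 *   Reg  elements: sizeof(ncclWorkElemReg) == 56 + 3*8*8 = 248 -> exactly 1 fits.
 */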
struct ncclDevChannelPeer {
// Stripped version of ncclChannelPeer where we only keep the ncclConnInfo
// instead of the full ncclConnector.
struct ncclConnInfo send[NCCL_MAX_CONNS];
struct ncclConnInfo recv[NCCL_MAX_CONNS];
};
#pragma pack(pop) /* restore original alignment from stack */
#ifdef ENABLE_PROFILING
#define PROFILE_NUM_ITEMS 31
#define PROFILE_NUM_LAUNCHES 1024
struct ncclProf {
uint32_t count;
uint32_t seq; // only entry from first launch is used
struct {
uint64_t line:16;
uint64_t timeStamp:48;
} elem[PROFILE_NUM_ITEMS];
};
static_assert(sizeof(struct ncclProf) == 256, "ncclProf must have size of 256");
#endif
#ifdef ENABLE_COLLTRACE
typedef enum {
ncclCollTraceNotReady = 0,
ncclCollTraceKernelLaunchType = 1,
ncclCollTraceKernelEndType = 2,
ncclCollTraceCollLaunchType = 3,
ncclCollTraceAbortType = 4,
ncclCollTraceDataType = 5,
ncclCollTraceCollElemType = (1<<4),
ncclCollTraceP2pElemType = (1<<5),
} ncclCollTraceDataType_t;
struct ncclCollTrace {
uint8_t type;
uint8_t bid;
int16_t funcIndex;
uint32_t data_0;
uint64_t timeStamp;
union {
uint64_t opCount;
uint32_t p2pOpCount[2];
};
union {
uint64_t data_1;
struct {
uint8_t nWarps;
uint8_t bid;
uint8_t nChannels;
} coll;
struct {
int16_t peer;
uint8_t ngroups:4;
uint8_t connIndex:4;
uint8_t warpStart:4;
uint8_t nWarps:4;
} p2p[2];
};
};
static_assert(sizeof(struct ncclCollTrace) == 8*sizeof(int), "ncclCollTrace must have a pow2 size");
union ncclCollTraceTail{
uint32_t tail;
char padding[4096];
};
#define COLLTRACE_NUM_ITEMS 8192
#endif
#ifdef HYGON_SDMA_FEATURE
struct sdmaQueueInfo {
hsa_sdma_info_t *sdmaInfo;
uint32_t *pkgIndex;
uint32_t minCopySize;
uint32_t copyCountEnabe;
uint32_t sdmaDepth;
uint32_t *ptrSdmaCopyCount;
uint32_t *ptrAllCopyCount;
};
#endif
struct alignas(16) ncclDevChannel {
struct ncclDevChannelPeer** peers;
struct ncclRing ring;
struct ncclTree tree;
struct ncclTree collnetChain;
struct ncclDirect collnetDirect;
struct ncclTree binTree;
struct ncclNvls nvls;
uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed
#ifdef HYGON_SDMA_FEATURE
union {
struct sdmaQueueInfo sdmaQueue;
uint32_t rvsd[12];
};
#endif
};
struct ncclDevComm {
int rank;
int nRanks;
int buffSizes[NCCL_NUM_PROTOCOLS];
// Operation list for aggregation
int workFifoDepth;
struct ncclWork* workFifoHeap; // may be cudaHost or GDR memory
// Flag to ask NCCL kernels to abort
volatile uint32_t* abortFlag;
// Channels, device side
struct ncclDevChannel* channels/*[MAXCHANNELS]*/;
#if defined(ENABLE_NPKIT)
NpKitEventCollectContext* npKitEventCollectContexts;
#endif
#ifdef ENABLE_COLLTRACE
struct ncclCollTrace* collTrace;
union ncclCollTraceTail *collTraceTail;
pthread_t collTraceThread;
#endif
#ifdef ENABLE_PROFILING
struct ncclProf* devProf;
#endif
#if defined (ENABLE_TIMELINE)
TimelineGpuEventContext* gpuEventContext;
#endif
#if defined (ENABLE_NPKIT) || defined (ENABLE_TIMELINE)
uint64_t* cpuTimestamp;
#endif
#ifdef HYGON_SDMA_FEATURE
uint32_t sdmaPkgIndex[8];
uint32_t sdmaCopyCount[MAXCHANNELS];
uint32_t allCopyCount[MAXCHANNELS];
#endif
};
struct alignas(16) ncclDevCommAndChannels {
struct ncclDevComm comm;
struct ncclDevChannel channels[MAXCHANNELS];
};
#ifdef __CUDA_ARCH__
#define NCCL_CUDA_ARCH __CUDA_ARCH__
#else
#define NCCL_CUDA_ARCH 0
#endif
template<typename T>
__host__ __device__ constexpr T min_constexpr(T a) { return a; }
template<typename T, typename ...Ts>
__host__ __device__ constexpr T min_constexpr(T a, T b, Ts ...c) {
return min_constexpr<T>((a < b ? a : b), c...);
}
template<typename T>
__host__ __device__ constexpr T max_constexpr(T a) { return a; }
template<typename T, typename ...Ts>
__host__ __device__ constexpr T max_constexpr(T a, T b, Ts ...c) {
return max_constexpr<T>((a > b ? a : b), c...);
}
// Calculate the unroll factor given:
// * bytePerPack: number of bytes accessed per instruction
// * insns: max permissible unroll value
// * bytes: desired number of in-flight bytes per iteration ( = unroll*bytePerPack)
__host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int bytes) {
return min_constexpr(insns, (bytes + bytePerPack-1)/bytePerPack);
}
// Note that all unroll value logic should depend on a given cudaArch argument
// and not __CUDA_ARCH__ since these need to be host-side executable where the
// arch value is strictly runtime only. By defaulting to NCCL_CUDA_ARCH, device
// side code can elide passing the arch for brevity.
__host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) {
// Our collective unroll should move to the same bytes&insns model as NVLS.
return cudaArch >= 800 ? 8 : 4;
}
__host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; }
__host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; }
__host__ __device__ constexpr int ncclNvlsUnroll(int bytePerPack, int cudaArch = NCCL_CUDA_ARCH) {
return ncclCalcUnroll(bytePerPack, ncclNvlsUnrollInsns(cudaArch), ncclNvlsUnrollBytes(cudaArch));
}
// The amount of dynamic shmem per warp
__host__ __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH) {
return (max_constexpr<int>(
/*LL */0,
/*LL128 */(NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE)*sizeof(uint64_t),
/*SIMPLE*/(ncclCollUnroll(cudaArch)*WARP_SIZE + 1)*16,
// NVLS needs an extra 16B to read unaligned data.
/*NVLS */WARP_SIZE*(cudaArch >= 900 ? ncclNvlsUnrollBytes(cudaArch) : 0) + 16
) + 15) & -16; // pad to 16 bytes
}
// The amount of dynamic shmem per block
__host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) {
return cudaArch < 700 ? 0 : ncclShmemScratchWarpSize(cudaArch)*(NCCL_MAX_NTHREADS/WARP_SIZE);
}
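// Added notes (illustration only): the formula above implies pre-Volta kernels
// request no dynamic shared memory, which can be checked at compile time;
// launch code is expected to pass ncclShmemDynamicSize(arch) as the dynamic
// shared memory size when launching the collective kernel (the launch call
// itself is not part of this header).
static_assert(ncclShmemDynamicSize(/*cudaArch=*/600) == 0,
  "illustrative check: no dynamic shmem below arch 700");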
#endif
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_ENQUEUE_H_
#define NCCL_ENQUEUE_H_
#include "comm.h"
#include "group.h"
#include "collectives.h"
#include "utils.h"
#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64)
#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize);
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
ncclResult_t ncclLaunchPrepare(struct ncclComm* comm);
ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan);
ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
ncclResult_t ncclLaunchFinish(struct ncclComm* comm);
#endif // End include guard
/*************************************************************************
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_GDRWRAP_H_
#define NCCL_GDRWRAP_H_
#include "nccl.h"
#include <stdint.h> // for standard [u]intX_t types
#include <stdio.h>
#include <stdlib.h>
// These can be used if the GDR library isn't thread safe
#include <pthread.h>
extern pthread_mutex_t gdrLock;
#define GDRLOCK() pthread_mutex_lock(&gdrLock)
#define GDRUNLOCK() pthread_mutex_unlock(&gdrLock)
#define GDRLOCKCALL(cmd, ret) do { \
GDRLOCK(); \
ret = cmd; \
GDRUNLOCK(); \
} while(false)
#define GDRCHECK(cmd) do { \
int e; \
/* GDRLOCKCALL(cmd, e); */ \
e = cmd; \
if( e != 0 ) { \
WARN("GDRCOPY failure %d", e); \
return ncclSystemError; \
} \
} while(false)
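// Usage sketch (added; gdrDoSomething is a hypothetical call): GDRCHECK must
// be used inside a function returning ncclResult_t, since it returns
// ncclSystemError when the wrapped call fails:
//   static ncclResult_t example(gdr_t g) {
//     GDRCHECK(gdrDoSomething(g));
//     return ncclSuccess;
//   }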
// This is required as the GDR memory is mapped WC
#if !defined(__NVCC__)
#if defined(__PPC__)
static inline void wc_store_fence(void) { asm volatile("sync") ; }
#elif defined(__x86_64__)
#include <immintrin.h>
static inline void wc_store_fence(void) { _mm_sfence(); }
#elif defined(__aarch64__)
#ifdef __cplusplus
#include <atomic>
static inline void wc_store_fence(void) { std::atomic_thread_fence(std::memory_order_release); }
#else
#include <stdatomic.h>
static inline void wc_store_fence(void) { atomic_thread_fence(memory_order_release); }
#endif
#endif
#endif
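// Usage sketch (added; gdrMapPtr is a hypothetical pointer into the WC
// mapping): CPU stores to the write-combined GDR mapping should be followed by
// a store fence before the device is signalled that the data is ready:
//   *(volatile uint32_t*)gdrMapPtr = value;
//   wc_store_fence();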
//#define GDR_DIRECT 1
#ifdef GDR_DIRECT
// Call the GDR API library code directly rather than via
// dlopen() wrappers
#include <gdrapi.h>
static ncclResult_t wrap_gdr_symbols(void) { return ncclSuccess; }
static gdr_t wrap_gdr_open(void) { gdr_t g = gdr_open(); return g; }
static ncclResult_t wrap_gdr_close(gdr_t g) { GDRCHECK(gdr_close(g)); return ncclSuccess; }
static ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle) {
GDRCHECK(gdr_pin_buffer(g, addr, size, p2p_token, va_space, handle));
return ncclSuccess;
}
static ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) {
GDRCHECK(gdr_unpin_buffer(g, handle));
return ncclSuccess;
}
static ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) {
GDRCHECK(gdr_get_info(g, handle, info));
return ncclSuccess;
}
static ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) {
GDRCHECK(gdr_map(g, handle, va, size));
return ncclSuccess;
}
static ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) {
GDRCHECK(gdr_unmap(g, handle, va, size));
return ncclSuccess;
}
static ncclResult_t wrap_gdr_runtime_get_version(int *major, int *minor) {
gdr_runtime_get_version(major, minor);
return ncclSuccess;
}
static ncclResult_t wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor) {
gdr_driver_get_version(g, major, minor);
return ncclSuccess;
}
static ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size) {
GDRCHECK(gdr_copy_to_mapping(handle, map_d_ptr, h_ptr, size));
return ncclSuccess;
}
static ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size) {
GDRCHECK(gdr_copy_from_mapping(handle, h_ptr, map_d_ptr, size));
return ncclSuccess;
}
#else
// Dynamically handle the dependency on the GDR API library
/* Extracted from gdrapi.h (v2.1 Nov 2020) */
#define GPU_PAGE_SHIFT 16
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
#define GPU_PAGE_OFFSET (GPU_PAGE_SIZE-1)
#define GPU_PAGE_MASK (~GPU_PAGE_OFFSET)
struct gdr;
typedef struct gdr *gdr_t;
typedef struct gdr_mh_s {
unsigned long h;
} gdr_mh_t;
struct gdr_info {
uint64_t va;
uint64_t mapped_size;
uint32_t page_size;
uint64_t tm_cycles;
uint32_t cycles_per_ms;
unsigned mapped:1;
unsigned wc_mapping:1;
};
typedef struct gdr_info gdr_info_t;
/* End of gdrapi.h */
ncclResult_t wrap_gdr_symbols(void);
gdr_t wrap_gdr_open(void);
ncclResult_t wrap_gdr_close(gdr_t g);
ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle);
ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle);
ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info);
ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size);
ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size);
ncclResult_t wrap_gdr_runtime_get_version(int *major, int *minor);
ncclResult_t wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor);
ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size);
ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size);
#endif // GDR_DIRECT
// Global GDR driver handle
extern gdr_t ncclGdrCopy;
#include "alloc.h"
typedef struct gdr_mem_desc {
void *gdrDevMem;
void *gdrMap;
size_t gdrOffset;
size_t gdrMapSize;
gdr_mh_t gdrMh;
} gdr_mem_desc_t;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
static gdr_t ncclGdrInit() {
INFO(NCCL_INIT, "Enabled GDRCopy equivalent memory allocation");
return (gdr_t)0x12345678L;
}
template <typename T>
static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle, hipStream_t stream) {
gdr_info_t info;
size_t mapSize;
gdr_mh_t mh;
char *devMem;
void *gdrMap;
mapSize = sizeof(T)*nelem;
// GDRCOPY pinned buffer has to be at least GPU_PAGE_SIZE
ALIGN_SIZE(mapSize, GPU_PAGE_SIZE);
// GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too
NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, stream, true));
gdr_mem_desc_t* md;
NCCLCHECK(ncclCalloc(&md, 1));
md->gdrDevMem = devMem;
md->gdrMap = NULL;
md->gdrMapSize = mapSize;
md->gdrOffset = 0;
md->gdrMh.h = 0;
*gdrHandle = md;
*ptr = (T *)(devMem);
if (devPtr) *devPtr = (T *)(devMem);
TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p",
md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr);
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) {
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
memcpy(dst, src, nelem*sizeof(T));
return ncclSuccess;
}
static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
CUDACHECK(hipFree(md->gdrDevMem));
free(md);
return ncclSuccess;
}
#else
static gdr_t ncclGdrInit() {
int libMajor, libMinor, drvMajor, drvMinor;
gdr_t handle = NULL;
// Dynamically load the GDRAPI library symbols
if (wrap_gdr_symbols() == ncclSuccess) {
handle = wrap_gdr_open();
if (handle != NULL) {
ncclResult_t res;
// Query the version of libgdrapi
NCCLCHECKGOTO(wrap_gdr_runtime_get_version(&libMajor, &libMinor), res, error);
// Query the version of gdrdrv driver
NCCLCHECKGOTO(wrap_gdr_driver_get_version(handle, &drvMajor, &drvMinor), res, error);
// Only support GDRAPI 2.1 and later
if (libMajor < 2 || (libMajor == 2 && libMinor < 1) || drvMajor < 2 || (drvMajor == 2 && drvMinor < 1)) {
goto error;
}
else
INFO(NCCL_INIT, "GDRCOPY enabled library %d.%d driver %d.%d", libMajor, libMinor, drvMajor, drvMinor);
}
}
return handle;
error:
if (handle != NULL) (void) wrap_gdr_close(handle);
return NULL;
}
template <typename T>
static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) {
gdr_info_t info;
size_t mapSize;
gdr_mh_t mh;
char *devMem;
void *gdrMap;
mapSize = sizeof(T)*nelem;
// GDRCOPY pinned buffer has to be at least GPU_PAGE_SIZE
ALIGN_SIZE(mapSize, GPU_PAGE_SIZE);
// GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too
NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1));
uint64_t alignedAddr = (((uint64_t) devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK;
size_t align = alignedAddr - (uint64_t)devMem;
//TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zi size %zi", alignedAddr, devMem, align, mapSize);
NCCLCHECK(wrap_gdr_pin_buffer(ncclGdrCopy, alignedAddr, mapSize, 0, 0, &mh));
NCCLCHECK(wrap_gdr_map(ncclGdrCopy, mh, &gdrMap, mapSize));
//TRACE(NCCL_INIT, "GDRCOPY : mapped %p (0x%lx) at %p", devMem, alignedAddr, gdrMap);
NCCLCHECK(wrap_gdr_get_info(ncclGdrCopy, mh, &info));
// Will offset ever be non zero ?
ssize_t off = info.va - alignedAddr;
gdr_mem_desc_t* md;
NCCLCHECK(ncclCalloc(&md, 1));
md->gdrDevMem = devMem;
md->gdrMap = gdrMap;
md->gdrMapSize = mapSize;
md->gdrOffset = off+align;
md->gdrMh = mh;
*gdrHandle = md;
*ptr = (T *)((char *)gdrMap+off);
if (devPtr) *devPtr = (T *)(devMem+off+align);
TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p",
md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr);
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) {
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*sizeof(T)));
return ncclSuccess;
}
static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize));
NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh));
NCCLCHECK(ncclCudaFree(md->gdrDevMem));
free(md);
return ncclSuccess;
}
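// Lifecycle sketch (added; local names are hypothetical), using the helpers
// defined in this GDRCopy branch: allocate a GDR-mapped buffer, write into it
// through the host mapping, then tear it down.
//   uint32_t *hostView, *devView; void* handle;
//   NCCLCHECK(ncclGdrCudaCalloc(&hostView, &devView, count, &handle));
//   NCCLCHECK(ncclGdrCudaCopy(handle, hostView, srcHost, count));
//   ...
//   NCCLCHECK(ncclGdrCudaFree(handle));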
#endif
#endif // End include guard
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef RCCL_GIT_VERSION_H_
#define RCCL_GIT_VERSION_H_
extern const char *rcclGitHash;
#endif
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_GRAPH_H_
#define NCCL_GRAPH_H_
#include "nccl.h"
#include "devcomm.h"
#include <limits.h>
#include <stdlib.h>
#include <ctype.h>
#include <stdio.h>
#include <sched.h>
ncclResult_t ncclTopoCudaPath(int cudaDev, char** path);
struct ncclTopoSystem;
// Build the topology
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system);
ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system);
ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system);
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm);
void ncclTopoFree(struct ncclTopoSystem* system);
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks);
int ncclTopoPathAllNVLink(struct ncclTopoSystem* system);
// Query topology
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
#define MAX_XGMI_INTER_GPUS 4
ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int type, int* dev);
ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter=MAX_XGMI_INTER_GPUS, int nInter=0, int *inter=nullptr);
ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush);
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net);
int ncclPxnDisable(struct ncclComm* comm);
ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank);
// Find CPU affinity
ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity);
#define NCCL_TOPO_CPU_ARCH_X86 1
#define NCCL_TOPO_CPU_ARCH_POWER 2
#define NCCL_TOPO_CPU_ARCH_ARM 3
#define NCCL_TOPO_CPU_VENDOR_INTEL 1
#define NCCL_TOPO_CPU_VENDOR_AMD 2
#define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3
#define NCCL_TOPO_CPU_TYPE_BDW 1
#define NCCL_TOPO_CPU_TYPE_SKL 2
#define NCCL_TOPO_CPU_TYPE_ZEN 3
#define NCCL_TOPO_CPU_TYPE_ROME 4
#define NCCL_TOPO_CPU_TYPE_YONGFENG 1
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count);
ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id);
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex);
#define NCCL_TOPO_MAX_NODES 256
// Init search. Needs to be done before calling ncclTopoCompute
ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
#define NCCL_TOPO_PATTERN_BALANCED_TREE 1 // Spread NIC traffic between two GPUs (Tree parent + one child on first GPU, second child on second GPU)
#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU)
#define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU
#define NCCL_TOPO_PATTERN_RING 4 // Ring
#define NCCL_TOPO_PATTERN_NVLS 5 // NVLS+SHARP and NVLS+Tree
struct ncclTopoGraph {
// Input / output
int id; // ring : 0, tree : 1, collnet : 2
int pattern;
int crossNic;
int collNet;
int minChannels;
int maxChannels;
// Output
int nChannels;
float bwIntra;
float bwInter;
float latencyInter;
int typeIntra;
int typeInter;
int sameChannels;
int nHops;
int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES];
int inter[MAXCHANNELS*2];
int nIntraChannels;
int intraNets[MAXCHANNELS*NCCL_TOPO_MAX_NODES*2];
char treeBase[NCCL_TOPO_MAX_NODES][NCCL_TOPO_MAX_NODES*4];
};
ncclResult_t ncclTopoCompute(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs);
struct ncclTopoRanks {
int ringRecv[MAXCHANNELS];
int ringSend[MAXCHANNELS];
int ringPrev[MAXCHANNELS];
int ringNext[MAXCHANNELS];
int treeToParent[MAXCHANNELS];
int treeToChild0[MAXCHANNELS];
int treeToChild1[MAXCHANNELS];
int nvlsHeads[MAXCHANNELS];
};
ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks);
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns,
struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, int nc);
ncclResult_t ncclTreeBasePostset(struct ncclComm* comm, struct ncclTopoGraph* treeGraph);
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs);
#include "info.h"
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time);
#endif
/*************************************************************************
* Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_GROUP_H_
#define NCCL_GROUP_H_
#include "nccl.h"
#include "comm.h"
ncclResult_t ncclGroupErrCheck(ncclResult_t ret);
void ncclGroupCommJoin(struct ncclComm* comm);
void ncclGroupCommPreconnect(struct ncclComm* comm);
ncclResult_t ncclGroupCommLeave(struct ncclComm* comm);
void ncclGroupJobAbort();
typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);
typedef enum ncclGroupJobState {
ncclGroupJobRunning = 0,
ncclGroupJobDone = 1,
ncclGroupJobJoined = 2,
} ncclGroupJobState_t;
struct ncclAsyncJob {
struct ncclAsyncJob* next;
pthread_t thread;
ncclResult_t result;
ncclResult_t(*func)(struct ncclAsyncJob*);
void(*undo)(struct ncclAsyncJob*);
void(*destructor)(void*);
ncclGroupJobState_t state;
volatile uint32_t *abortFlag; /* point to comm abortFlag */
volatile uint32_t *childAbortFlag; /* point to child abortFlag */
ncclComm_t comm;
};
ncclResult_t ncclAsyncLaunch(
struct ncclAsyncJob* job,
ncclResult_t(*func)(struct ncclAsyncJob*),
void(*undo)(struct ncclAsyncJob*),
void(*destructor)(void*), ncclComm_t comm
);
struct ncclGroupJob {
struct ncclAsyncJob base;
struct ncclComm **groupCommHeadPtr;
struct ncclComm **groupCommPreconnectHeadPtr;
ncclResult_t *groupErrorPtr;
volatile bool *abortFlagPtr;
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsPtr;
bool doneFlag;
};
ncclResult_t ncclGroupStartInternal();
ncclResult_t ncclGroupEndInternal();
ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job);
////////////////////////////////////////////////////////////////////////////////
extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting
extern __thread ncclResult_t ncclGroupError;
extern __thread struct ncclComm* ncclGroupCommHead;
extern __thread struct ncclComm* ncclGroupCommPreconnectHead;
extern __thread int ncclGroupBlocking;
extern __thread struct ncclGroupJob *ncclGroupJobMainPtr;
extern __thread struct ncclGroupJob ncclGroupJobMain;
static inline void groupResetJobState() {
ncclGroupBlocking = -1;
ncclGroupJobMainPtr = NULL;
memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob));
return;
}
static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) {
ncclResult_t ret = ncclSuccess;
if (job) {
ret = ncclAsyncJobComplete(&job->base);
groupResetJobState();
}
return ret;
}
inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
if (ncclGroupDepth > 0) {
if (ret != ncclSuccess && ret != ncclInProgress) ncclGroupError = ret;
}
return ret;
}
// Add comm to this thread's group
inline void ncclGroupCommJoin(struct ncclComm* comm) {
if (comm->groupNext == reinterpret_cast<struct ncclComm*>(0x1)) {
// Insert comm into ncclGroupCommHead adjacent to sibling comms. This preserves
// the user's program order yet ensures siblings occur consecutively. This
// is required by doLaunches() in "group.cc".
struct ncclComm** pp = &ncclGroupCommHead;
while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0)
pp = &(*pp)->groupNext;
comm->groupNext = *pp;
*pp = comm;
// Each comm gets a new memory stack scope upon joining. Each task batched for
// this comm is allocated there.
ncclMemoryStackPush(&comm->memScoped);
}
ncclGroupBlocking = comm->config.blocking;
}
// Add comm to this thread's group needing preconnect
inline void ncclGroupCommPreconnect(struct ncclComm* comm) {
if (comm->preconnectNext == reinterpret_cast<struct ncclComm*>(0x1)) {
comm->preconnectNext = ncclGroupCommPreconnectHead;
ncclGroupCommPreconnectHead = comm;
}
}
// Comm has left group
inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm) {
comm->groupNext = reinterpret_cast<struct ncclComm*>(0x1);
ncclMemoryStackPop(&comm->memScoped);
return ncclSuccess;
}
#endif
/* Copyright (c) 2019 - 2021 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef HIP_SRC_HIP_PROF_RCCL_H
#define HIP_SRC_HIP_PROF_RCCL_H
#include "hipprof/hip_prof_rccl_str.h"
#include "hipprof/hip_profile_common.h"
typedef prof_error_t (*PFN_rccl_prof_api_enter)(uint32_t cid, void *api_entry);
typedef prof_error_t (*PFN_rccl_prof_api_exit)(uint32_t cid, void *api_entry);
extern PFN_rccl_prof_api_enter pfn_rccl_prof_api_enter;
extern PFN_rccl_prof_api_exit pfn_rccl_prof_api_exit;
void init_rccl_prof_fns();
#endif // HIP_SRC_HIP_PROF_RCCL_H
/* Copyright (c) 2019 - 2021 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef HIP_SRC_HIP_PROF_RCCL_API_H
#define HIP_SRC_HIP_PROF_RCCL_API_H
#include <atomic>
#include <cassert>
#include <iostream>
#include <shared_mutex>
#include <utility>
#include <memory> // for std::unique_ptr / std::make_unique
#include "info.h"
#include "debug.h"
#include <dlfcn.h>
#include <sys/utsname.h>
#include <fstream>
#include <sys/shm.h>
#include "hipprof/hip_prof_rccl.h"
#include "hipprof/hip_prof_rccl_param.h"
#define RCCL_CB_SPAWNER_OBJECT(operation_id, info_ptr) rccl_cb_spawner_object<RCCL_API_ID_##operation_id> __api_tracer(info_ptr);
template <rccl_api_id_t operation_id> class rccl_cb_spawner_object {
public:
rccl_cb_spawner_object(ncclInfo* info_ptr): stat(STATUS_ERROR), entry(nullptr), correlation_id(0), info(info_ptr) {
if (ncclParamHipProf() == 1) {
static_assert(operation_id >= RCCL_API_ID_FIRST && operation_id <= RCCL_API_ID_LAST, "invalid RCCL_API operation id");
init_rccl_prof_fns();
entry = std::make_unique<hip_prof_rccl_entry>();
entry->kind = RCCL_KIND_ID_API;
entry->ret_stat = 0;
entry->cid = operation_id;
entry->sendbuff = info->sendbuff;
entry->recvbuff = info->recvbuff;
entry->count = info->count;
entry->datatype = info->datatype;
entry->op = info->op;
entry->rid = info->root;
if (pfn_rccl_prof_api_enter != nullptr) {
stat = pfn_rccl_prof_api_enter(operation_id, entry.get());
if (stat != STATUS_SUCCESS) {
INFO(NCCL_INIT, "stat: %d, Failed to add rccl_prof_api_enter.", stat);
entry.reset();
} else {
correlation_id = entry->correlation_id;
}
}
}
}
activity_correlation_id_t getCorrelationId() const {
return correlation_id;
}
~rccl_cb_spawner_object() {
if (stat == STATUS_SUCCESS && entry) {
entry->nBytes = info->nBytes;
if (pfn_rccl_prof_api_exit != nullptr) {
stat = pfn_rccl_prof_api_exit(operation_id, entry.get());
if (stat != STATUS_SUCCESS) {
INFO(NCCL_INIT, "Failed to add rccl_prof_api_exit.");
}
}
correlation_id = 0;
}
}
private:
prof_error_t stat;
std::unique_ptr<hip_prof_rccl_entry> entry;
activity_correlation_id_t correlation_id;
ncclInfo* info;
};
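// Usage sketch (added; the body shown is illustrative, not the actual RCCL
// entry point): an API implementation instantiates the spawner at the top of
// its body so the enter callback fires on construction and the exit callback
// fires from the destructor when the function returns:
//   ncclResult_t ncclAllReduce(/* ... */) {
//     struct ncclInfo info = { /* ... */ };
//     RCCL_CB_SPAWNER_OBJECT(ncclAllReduce, &info);
//     /* ... perform the collective ... */
//   }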
#endif // HIP_SRC_HIP_PROF_RCCL_API_H
#ifndef HIP_SRC_HIP_PROF_RCCL_PARAM_H
#define HIP_SRC_HIP_PROF_RCCL_PARAM_H
#include <stdint.h>
void ncclLoadHipProfParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache);
int64_t ncclParamHipProf();
#endif // HIP_SRC_HIP_PROF_RCCL_PARAM_H
// Generated file. DO NOT EDIT.
//
// This file is automatically generated by the rccl_gen.py script.
// If changes are required, run the script and commit the updated file.
#ifndef _HIP_PROF_RCCL_STR_H
#define _HIP_PROF_RCCL_STR_H
#define HIP_PROF_RCCL_VER 1
// RCCL API callbacks ID enumeration
enum rccl_api_id_t {
RCCL_API_ID_NONE = 0,
RCCL_API_ID_FIRST = 1,
RCCL_API_ID_mscclLoadAlgo = 1,
RCCL_API_ID_mscclRunAlgo = 2,
RCCL_API_ID_mscclUnloadAlgo = 3,
RCCL_API_ID_ncclAllGather = 4,
RCCL_API_ID_ncclAllReduce = 5,
RCCL_API_ID_ncclAllToAll = 6,
RCCL_API_ID_ncclAllToAllv = 7,
RCCL_API_ID_ncclBcast = 8,
RCCL_API_ID_ncclBroadcast = 9,
RCCL_API_ID_ncclGather = 10,
RCCL_API_ID_ncclRecv = 11,
RCCL_API_ID_ncclReduce = 12,
RCCL_API_ID_ncclReduceScatter = 13,
RCCL_API_ID_ncclScatter = 14,
RCCL_API_ID_ncclSend = 15,
RCCL_API_ID_LAST = 15,
};
// Return the RCCL API string for a given callback ID
static inline const char* rccl_api_name(const uint32_t id) {
switch(id) {
case RCCL_API_ID_mscclLoadAlgo: return "mscclLoadAlgo";
case RCCL_API_ID_mscclRunAlgo: return "mscclRunAlgo";
case RCCL_API_ID_mscclUnloadAlgo: return "mscclUnloadAlgo";
case RCCL_API_ID_ncclAllGather: return "ncclAllGather";
case RCCL_API_ID_ncclAllReduce: return "ncclAllReduce";
case RCCL_API_ID_ncclAllToAll: return "ncclAllToAll";
case RCCL_API_ID_ncclAllToAllv: return "ncclAllToAllv";
case RCCL_API_ID_ncclBcast: return "ncclBcast";
case RCCL_API_ID_ncclBroadcast: return "ncclBroadcast";
case RCCL_API_ID_ncclGather: return "ncclGather";
case RCCL_API_ID_ncclRecv: return "ncclRecv";
case RCCL_API_ID_ncclReduce: return "ncclReduce";
case RCCL_API_ID_ncclReduceScatter: return "ncclReduceScatter";
case RCCL_API_ID_ncclScatter: return "ncclScatter";
case RCCL_API_ID_ncclSend: return "ncclSend";
};
return "unknown";
}
#include <string.h>
// Return the RCCL API callback ID for a given name
static inline uint32_t rcclApiIdByName(const char* name) {
if (strcmp("mscclLoadAlgo", name) == 0) return RCCL_API_ID_mscclLoadAlgo;
if (strcmp("mscclRunAlgo", name) == 0) return RCCL_API_ID_mscclRunAlgo;
if (strcmp("mscclUnloadAlgo", name) == 0) return RCCL_API_ID_mscclUnloadAlgo;
if (strcmp("ncclAllGather", name) == 0) return RCCL_API_ID_ncclAllGather;
if (strcmp("ncclAllReduce", name) == 0) return RCCL_API_ID_ncclAllReduce;
if (strcmp("ncclAllToAll", name) == 0) return RCCL_API_ID_ncclAllToAll;
if (strcmp("ncclAllToAllv", name) == 0) return RCCL_API_ID_ncclAllToAllv;
if (strcmp("ncclBcast", name) == 0) return RCCL_API_ID_ncclBcast;
if (strcmp("ncclBroadcast", name) == 0) return RCCL_API_ID_ncclBroadcast;
if (strcmp("ncclGather", name) == 0) return RCCL_API_ID_ncclGather;
if (strcmp("ncclRecv", name) == 0) return RCCL_API_ID_ncclRecv;
if (strcmp("ncclReduce", name) == 0) return RCCL_API_ID_ncclReduce;
if (strcmp("ncclReduceScatter", name) == 0) return RCCL_API_ID_ncclReduceScatter;
if (strcmp("ncclScatter", name) == 0) return RCCL_API_ID_ncclScatter;
if (strcmp("ncclSend", name) == 0) return RCCL_API_ID_ncclSend;
return RCCL_API_ID_NONE;
}
#endif // _HIP_PROF_RCCL_STR_H