Commit 7dc4e964 authored by wanghan

Initial commit: RCCL auto-tuning project
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
#include "msccl/msccl_lifecycle.h"
#include "hipprof/hip_prof_rccl_api.h"
NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
// Just pass the size of one message and not the total bytes sent/received.
constexpr nvtxPayloadSchemaEntry_t AllGatherSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"}
};
size_t msgsize = sendcount * ncclTypeSize(datatype);
NVTX3_FUNC_WITH_PARAMS(AllGather, AllGatherSchema, msgsize)
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
sendcount, datatype, 0, 0, ncclSum, mscclFuncAllGather, comm, stream);
}
struct ncclInfo info = { ncclFuncAllGather, "AllGather",
sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */
ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS };
RCCL_CB_SPAWNER_OBJECT(ncclAllGather, &info);
return ncclEnqueueCheck(&info);
}
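/* Usage sketch (illustrative only; buffer names and sizes are assumptions, not
 * part of this file): every rank contributes sendcount elements and receives
 * nRanks*sendcount elements, ordered by rank:
 *
 *   float *sendbuf, *recvbuf;                        // device pointers
 *   size_t count = 1 << 20;                          // elements per rank
 *   hipMalloc(&sendbuf, count * sizeof(float));
 *   hipMalloc(&recvbuf, nRanks * count * sizeof(float));
 *   NCCLCHECK(ncclAllGather(sendbuf, recvbuf, count, ncclFloat, comm, stream));
 *   hipStreamSynchronize(stream);
 */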
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "nccl.h"
#include "msccl/msccl_lifecycle.h"
#include "hipprof/hip_prof_rccl_api.h"
NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
struct NvtxParamsAllReduce {
size_t bytes;
ncclRedOp_t op;
};
// Just pass the size of one message and not the total bytes sent/received.
static constexpr nvtxPayloadSchemaEntry_t AllReduceSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsAllReduce, op)}
};
NvtxParamsAllReduce payload{count * ncclTypeSize(datatype), op};
NVTX3_FUNC_WITH_PARAMS(AllReduce, AllReduceSchema, payload)
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
count, datatype, 0, 0, op, mscclFuncAllReduce, comm, stream);
}
struct ncclInfo info = { ncclFuncAllReduce, "AllReduce",
sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */
ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS };
RCCL_CB_SPAWNER_OBJECT(ncclAllReduce, &info);
return ncclEnqueueCheck(&info);
}
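/* Usage sketch (illustrative assumptions only): reduces count elements across
 * all ranks with op and leaves the result on every rank. Passing the same
 * pointer for sendbuff and recvbuff performs the reduction in place:
 *
 *   NCCLCHECK(ncclAllReduce(buf, buf, count, ncclFloat, ncclSum, comm, stream));
 */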
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
#include "graph/topo.h"
#include "msccl/msccl_lifecycle.h"
NCCL_API(ncclResult_t, ncclAllToAll, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclComm_t comm, hipStream_t stream) {
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
count, datatype, 0, 0, ncclSum, mscclFuncAllToAll, comm, stream);
}
size_t rankOffset = count * ncclTypeSize(datatype);
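// rankOffset & (~rankOffset + 1) isolates the lowest set bit, i.e. the largest
// power of two that divides the per-rank byte offset.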
size_t rankAlign = rankOffset & ((~rankOffset) + 1);
// Determine Pivot A2A support now that we know number of channels
if (comm->topo->pivotA2AEnabled && comm->nChannels >= comm->topo->pivotA2ANumBiRings * 2 &&
rankOffset >= 744 * 1024 && rankAlign != 4) {
struct ncclInfo info = { ncclFuncAllToAllPivot, "AllToAllPivot",
sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
ALLTOALL_PIVOT_CHUNKSTEPS, ALLTOALL_PIVOT_SLICESTEPS };
return ncclEnqueueCheck(&info);
} else {
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
if (count == 0) return ncclSuccess;
NCCLCHECK(ncclGroupStart());
for (int r=0; r<nRanks; r++) {
NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, datatype, r, comm, stream));
NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, count, datatype, r, comm, stream));
}
NCCLCHECK(ncclGroupEnd());
return ncclSuccess;
}
}
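/* Semantics note: rank i's j-th count-element block of sendbuff goes to rank j,
 * and rank j's i-th block arrives in recvbuff block j, so both buffers must
 * hold nRanks*count elements. A minimal call (names are illustrative
 * assumptions):
 *
 *   NCCLCHECK(ncclAllToAll(sendbuf, recvbuf, count, ncclFloat, comm, stream));
 */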
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
#include "msccl/msccl_lifecycle.h"
NCCL_API(ncclResult_t, ncclAllToAllv, const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclAllToAllv(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, sendcounts, sdispls, recvbuff, recvcounts, rdispls,
0, datatype, 0, 0, ncclSum, mscclFuncAllToAllv, comm, stream);
}
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
NCCLCHECK(ncclGroupStart());
for (int r=0; r<nRanks; r++) {
if (sendcounts[r]) NCCLCHECK(ncclSend(
((char*)sendbuff) + sdispls[r]*ncclTypeSize(datatype),
sendcounts[r],
datatype,
r,
comm,
stream));
if (recvcounts[r]) NCCLCHECK(ncclRecv(
((char*)recvbuff) + rdispls[r]*ncclTypeSize(datatype),
recvcounts[r],
datatype,
r,
comm,
stream));
}
NCCLCHECK(ncclGroupEnd());
return ncclSuccess;
}
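/* Usage sketch (illustrative; the array contents are assumptions): counts and
 * displacements are given in elements, not bytes; the code above multiplies
 * displacements by ncclTypeSize(datatype). For 2 ranks sending 1 and 2
 * elements respectively:
 *
 *   size_t sendcounts[2] = {1, 2}, sdispls[2] = {0, 1};
 *   size_t recvcounts[2] = {1, 2}, rdispls[2] = {0, 1};
 *   NCCLCHECK(ncclAllToAllv(sendbuf, sendcounts, sdispls,
 *                           recvbuf, recvcounts, rdispls,
 *                           ncclFloat, comm, stream));
 */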
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
#include "msccl/msccl_lifecycle.h"
#include "hipprof/hip_prof_rccl_api.h"
NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
struct NvtxParamsBroadcast {
size_t bytes;
int root;
};
constexpr nvtxPayloadSchemaEntry_t BroadcastSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsBroadcast, root)}
};
NvtxParamsBroadcast payload{count * ncclTypeSize(datatype), root};
NVTX3_FUNC_WITH_PARAMS(Broadcast, BroadcastSchema, payload)
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
count, datatype, root, 0, ncclSum, mscclFuncBroadcast, comm, stream);
}
struct ncclInfo info = { ncclFuncBroadcast, "Broadcast",
sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */
BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS };
RCCL_CB_SPAWNER_OBJECT(ncclBroadcast, &info);
return ncclEnqueueCheck(&info);
}
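/* Usage note (per the NCCL API contract): sendbuff is only read on the root
 * rank; every rank, including root, receives count elements into recvbuff.
 * ncclBcast below is the deprecated in-place form that passes buff for both.
 */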
/* Deprecated original "in place" function, similar to MPI */
NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, cudaStream_t stream) {
return ncclBroadcast(buff, buff, count, datatype, root, comm, stream);
}
#
# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
include ../../../makefiles/common.mk
include ../../../makefiles/version.mk
BUILDDIR ?= $(abspath ../../../build)
OBJDIR := $(BUILDDIR)/obj/collectives/device
LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu sendrecv.cu onerank_reduce.cu
LIBSRCFILES += functions.cu
DEPFILES := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES))
DEPENDFILES:= $(DEPFILES:%.d=%.dep)
STATICLIB := $(OBJDIR)/colldevice.a
DEVOBJ := $(OBJDIR)/devlink.o
RULESFILE := $(OBJDIR)/Makefile.rules
NVCUFLAGS += -I. -I.. -I$(BUILDDIR)/include -I../../include --compiler-options "-fPIC -fvisibility=hidden"
all: $(STATICLIB)
# Dummy rule so that the extra dependency (%.dep) files are preserved by make
all_deps: $(DEPENDFILES)
# Auto-generating the rules per op/reduction/datatype/algorithm
$(RULESFILE) : gen_rules.sh
@printf "Generating %-35s > %s\n" rules $@
@mkdir -p $(OBJDIR)
@CUDA_MAJOR=${CUDA_MAJOR} CUDA_MINOR=${CUDA_MINOR} ./gen_rules.sh $(OBJDIR) > $@
-include $(RULESFILE)
LIBOBJ := $(GENOBJS) $(OBJDIR)/functions.o $(OBJDIR)/onerank_reduce.o
-include $(DEPFILES)
$(STATICLIB): $(LIBOBJ) $(DEVOBJ)
@printf "Archiving %-35s > %s\n" objects $@
ar cr $@ $^
# We do not want make to build *.d when running make clean.
# So we only provide targets for .dep which will produce .dep and .d,
# with only .d being included, and .dep keeping track of what needs to
# be regenerated.
$(OBJDIR)/%.dep : %.cu
@mkdir -p $(OBJDIR)
@$(NVCC) $(NVCUFLAGS) -M $< -o $@.tmp
@sed "0,/^.*:/s//$(subst /,\/,$@):/" $@.tmp > $@
@sed -e 's/.*://' -e 's/\\$$//' < $@.tmp | fmt -1 | \
sed -e 's/^ *//' -e 's/$$/:/' >> $@
@rm -f $@.tmp
@cp $@ $(@:.dep=.d)
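# Example of the transform above (illustrative): given "foo.o: foo.cu bar.h"
# from $(NVCC) -M, the first sed rewrites the target to "$(OBJDIR)/foo.dep:",
# and the second appends one empty rule per prerequisite ("foo.cu:", "bar.h:")
# so make does not fail when a header is deleted or renamed.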
# Compiled kernels and collectives with relocatable device code ...
$(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
$(NVCC) $(NVCUFLAGS) -dc $< -o $@
$(OBJDIR)/onerank_reduce.o : onerank_reduce.cu $(OBJDIR)/onerank_reduce.dep
@printf "Compiling %-35s > %s\n" $< $@
mkdir -p `dirname $@`
$(NVCC) $(NVCUFLAGS) -dc $< -o $@
# ... and create the device-side linked object with all those.
$(DEVOBJ) : $(LIBOBJ)
$(NVCC) $(NVCUFLAGS) -dlink $^ -o $@
clean:
rm -f $(LIBOBJ) $(DEVOBJ) $(DEPFILES) $(DEPENDFILES) $(RULESFILE) $(STATICLIB)
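# Example invocation (illustrative; the path is an assumption, and the default
# BUILDDIR above is used when none is given):
#   make -j16 BUILDDIR=/tmp/rccl-build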
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "all_gather.h"
#include "common.h"
#include "collectives.h"
IMPL_COLL_C(AllGather);
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "devcomm.h"
#include "collectives.h"
#include "primitives.h"
namespace {
template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runRing(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
#endif
const int tid = threadIdx.x;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem.channel.ring;
const int *ringRanks = ring->userRanks;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLGATHER_CHUNKSTEPS : 1));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
const int nranks = ncclShmem.comm.nRanks;
const ssize_t loopSize = nChannels*int(chunkSize);
const ssize_t size = args->count;
#if defined(ENABLE_NPKIT)
int npKitCtxIdx = bid;
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (tid == 0) {
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_ENTRY, nranks*size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined (ENABLE_TIMELINE)
int elems = 0, totalElems = 0;
uint64_t clkStamp = 0ULL;
struct ncclDevComm* comm = &ncclShmem.comm;
uint64_t entryStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_ALLGATHER_ENTRY, 0, entryStamp, comm->cpuTimestamp);
#endif
T *inputBuf = (T*)args->sendbuff;
T *outputBuf = (T*)args->recvbuff;
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg, 0, args->connIndex, args->connIndex);
#ifdef HYGON_SDMA_FEATURE
prims.ringIx = ring->index;
INIT_PRIMS_SDMA(prims, args);
#endif
#if defined(ENABLE_NPKIT)
if (tid == 0) {
prims.npKitCtxIdx = npKitCtxIdx;
}
#endif
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t realChunkSize;
if (Proto::Id == NCCL_PROTO_SIMPLE) {
realChunkSize = min(chunkSize, divUp(size-gridOffset,nChannels));
realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
}
else if (Proto::Id == NCCL_PROTO_LL)
realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
else if (Proto::Id == NCCL_PROTO_LL128)
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128);
realChunkSize = int(realChunkSize);
ssize_t chunkOffset = gridOffset + int(bid*realChunkSize);
/////////////// begin AllGather steps ///////////////
ssize_t offset;
int nelem = min(realChunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ringRanks[0];
offset = chunkOffset + rankDest * size;
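// Layout sketch (assuming nranks=4): recvbuff holds nranks contiguous blocks
// of `size` elements, one per rank. Step 0 pushes this rank's own block
// (ringRanks[0]*size) to the next GPU; each later step forwards the block
// just received from the previous GPU.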
PRINT_DEBUG("kallgather ringIx:%d bid:%d size:%d gridOffset:%d nChannels:%d nranks:%d chunkSize:%d loopSize:%d "
"realChunkSize:%d offset:%d size-offset:%d nelem:%d sdma:%d sizeT:%d wptr:%d dep:%d\n",
ring->index, bid, size*sizeof(T), gridOffset*sizeof(T), nChannels, nranks, chunkSize*sizeof(T), loopSize*sizeof(T),
realChunkSize*sizeof(T), offset*sizeof(T), (size-offset)*sizeof(T), nelem*sizeof(T),
args->useSdma, sizeof(T), *ncclShmem.channel.sdmaQueue->wptr, ncclShmem.channel.sdmaQueue->dep_signal);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_SEND_ENTRY, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
if (inputBuf + chunkOffset == outputBuf + offset) { // In place
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_SEND_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.directSend(chunkOffset, offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_SEND_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
} else {
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_COPY_SEND_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.directCopySend(chunkOffset, offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_COPY_SEND_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_SEND_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_ENTRY)
if (tid == 0 && nranks > 2) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_ENTRY, nelem*(nranks-2)*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
rankDest = ringRanks[nranks-j];
offset = chunkOffset + rankDest * size;
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_RECV_COPY_SEND_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.directRecvCopySend(offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_RECV_COPY_SEND_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_EXIT)
if (tid == 0 && nranks > 2) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_EXIT, nelem*(nranks-2)*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
// Make final copy from buffer to dest.
rankDest = ringRanks[1];
offset = chunkOffset + rankDest * size;
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_ENTRY, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
// Final wait/copy.
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_RECV_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.directRecv(offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_RECV_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_GATHER_RING_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_GATHER_RING_EXIT, nranks*size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined (ENABLE_TIMELINE)
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_ALLGATHER_EXIT, totalElems*sizeof(T), __builtin_amdgcn_s_memrealtime() - entryStamp, comm->cpuTimestamp);
#endif
}
}
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
using Proto = ProtoSimple<ALLGATHER_CHUNKSTEPS/ALLGATHER_SLICESTEPS, ALLGATHER_SLICESTEPS>;
runRing<T, RedOp, Proto>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL128>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllGather, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nChannels = args->nChannels;
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
const ssize_t chunkSize = int(args->lastChunkSize);
const ssize_t size = args->count;
const ssize_t loopSize = nChannels*chunkSize;
const int nThreadsGather = 128;
const int nThreadsBcast = 384 + WARP_SIZE;
const int tidEndGather = nThreadsGather;
const int tidEndBcast = tidEndGather + nThreadsBcast;
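// Thread partition (from the fixed counts above): tids [0, 128) gather from
// the NVLS up links into recvbuff; tids [128, 128+384+WARP_SIZE) broadcast the
// local contribution down the NVLS tree; any remaining threads idle.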
using Proto = ProtoSimple<1, 1>;
if (tid < tidEndGather) {
// Gather
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.gather(offset, nvls->nHeads*size, nelem, size, -1, 0);
}
} else if (tid < tidEndBcast) {
// Bcast through NVLS
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsBcast, NULL, &nvls->down, args->sendbuff, NULL,
args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.send(offset, nelem);
}
}
}
};
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
/* This file is now generated by CMake. */
// #include "all_reduce.h"
// #include "common.h"
// #include "collectives.h"
// IMPL_COLL_R(AllReduce);
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "devcomm.h"
#include "collectives.h"
#include "primitives.h"
#if defined(ENABLE_NPKIT)
#include "npkit/npkit.h"
#endif
namespace {
template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runRing(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
#endif
const int tid = threadIdx.x;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem.channel.ring;
int ringIx = ring->index;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLREDUCE_CHUNKSTEPS : 1));
const int nranks = ncclShmem.comm.nRanks;
const ssize_t loopSize = nChannels*nranks*chunkSize;
const ssize_t size = args->count;
#if defined(ENABLE_NPKIT)
int npKitCtxIdx = bid;
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (tid == 0) {
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_ENTRY, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined (ENABLE_TIMELINE)
int elems = 0, totalElems = 0;
uint64_t clkStamp = 0ULL;
struct ncclDevComm* comm = &ncclShmem.comm;
uint64_t entryStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_ALLREDUCE_ENTRY, 0, entryStamp, comm->cpuTimestamp);
#endif
int minChunkSize;
if (Proto::Id == NCCL_PROTO_LL)
minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T));
if (Proto::Id == NCCL_PROTO_LL128) {
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2;
}
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, 0, args->connIndex, args->connIndex);
#ifdef HYGON_SDMA_FEATURE
prims.ringIx = ring->index;
INIT_PRIMS_SDMA(prims, args);
#endif
#if defined(ENABLE_NPKIT)
if (tid == 0) {
prims.npKitCtxIdx = npKitCtxIdx;
}
#endif
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t realChunkSize;
if (Proto::Id == NCCL_PROTO_SIMPLE) {
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*nranks));
realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
}
else
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*nranks*minChunkSize)*minChunkSize);
realChunkSize = int(realChunkSize);
auto calcOffset = [&]__device__(int chunk)->ssize_t {
if (Proto::Id == NCCL_PROTO_SIMPLE)
return gridOffset + bid*nranks*realChunkSize + chunk*realChunkSize;
else
return gridOffset + (chunk*nChannels + bid)*realChunkSize;
};
auto modRanks = [&]__device__(int r)->int {
return r - (r >= nranks ? nranks : 0);
};
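// Worked example (assuming nranks=4, ringIx=1): the reduce-scatter phase walks
// chunks modRanks(1+3)=0, then 3, then 2, and finishes on chunk 1, whose fully
// reduced result is then circulated back around in the allgather phase below.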
ssize_t offset;
int nelem;
int chunk;
// step 0: push data to next GPU
chunk = modRanks(ringIx + nranks-1);
offset = calcOffset(chunk);
nelem = min(realChunkSize, size-offset);
PRINT_DEBUG("kallreduce ringIx:%d bid:%d size:%d gridOffset:%d nChannels:%d nranks:%d chunkSize:%d loopSize:%d "
"realChunkSize:%d chunk:%d offset:%d size-offset:%d nelem:%d sdma:%d sizeT:%d\n",
ringIx, bid, size*sizeof(T), gridOffset*sizeof(T), nChannels, nranks, chunkSize*sizeof(T), loopSize*sizeof(T),
realChunkSize*sizeof(T), chunk, offset*sizeof(T), (size-offset)*sizeof(T), nelem*sizeof(T),
args->useSdma, sizeof(T));
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_SEND_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.send(offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_SEND_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
// k-2 steps: reduce and copy to next GPU
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY)
if (tid == 0 && nranks > 2) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY, nelem*(nranks-2)*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
for (int j=2; j<nranks; ++j) {
chunk = modRanks(ringIx + nranks-j);
offset = calcOffset(chunk);
nelem = min(realChunkSize, size-offset);
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_RECV_REDUCE_SEND_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.recvReduceSend(offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_RECV_REDUCE_SEND_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT)
if (tid == 0 && nranks > 2) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT, nelem*(nranks-2)*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
// step k-1: reduce this buffer and data, which will produce the final
// result that we store in this data and push to the next GPU
chunk = ringIx + 0;
offset = calcOffset(chunk);
nelem = min(realChunkSize, size-offset);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.directRecvReduceCopySend(offset, offset, nelem, /*postOp=*/true);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_RECV_REDUCE_COPY_SEND_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY)
if (tid == 0 && nranks > 2) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY, nelem*(nranks-2)*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
// k-2 steps: copy to next GPU
for (int j=1; j<nranks-1; ++j) {
chunk = modRanks(ringIx + nranks-j);
offset = calcOffset(chunk);
nelem = min(realChunkSize, size-offset);
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_RECV_COPY_SEND_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.directRecvCopySend(offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_RECV_COPY_SEND_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT)
if (tid == 0 && nranks > 2) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT, nelem*(nranks-2)*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
// Make final copy from buffer to dest.
chunk = modRanks(ringIx + 1);
offset = calcOffset(chunk);
nelem = min(realChunkSize, size-offset);
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_RECV_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.directRecv(offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_RECV_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT, nelem*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
#if defined (ENABLE_TIMELINE)
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_ALLREDUCE_EXIT, totalElems*sizeof(T), __builtin_amdgcn_s_memrealtime() - entryStamp, comm->cpuTimestamp);
// Timeline::CollectGpuEvent(comm->gpuEventContext, TIMELINE_EVENT_COLL_EXIT, args->op.opCount, comm->cpuTimestamp);
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_RING_EXIT, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#ifdef HYGON_SDMA_FEATURE
if (tid == 0 && prims.useSdmaCopy && prims.sdmaCountEnabe) {
*ncclShmem.channel.sdmaQueue.ptrSdmaCopyCount += prims.sdmaCopyCount;
*ncclShmem.channel.sdmaQueue.ptrAllCopyCount += prims.allCopyCount;
PRINT_DEBUG("allreduce ringIx:%d bid:%d sdmaCopyCount:%d allCopyCount:%d sumSdma:%d sumAll:%d\n",
ringIx, (int)blockIdx.x, prims.sdmaCopyCount, prims.allCopyCount,
*ncclShmem.channel.sdmaQueue.ptrSdmaCopyCount,
*ncclShmem.channel.sdmaQueue.ptrAllCopyCount);
}
#endif
}
template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runTreeUpDown(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runTreeUpDown(ncclWorkElem *args) {
#endif
const int tid = threadIdx.x;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclTree *tree = &ncclShmem.channel.tree;
ssize_t chunkSize = int(
Proto::Id == NCCL_PROTO_SIMPLE ? args->lastChunkSize
/* LL & LL128 */ : Proto::calcBytePerStep()/sizeof(T));
const ssize_t minChunkSize = int(
Proto::Id == NCCL_PROTO_SIMPLE ? nthreads*8*(sizeof(uint64_t)/sizeof(T))
/* LL & LL128 */ : nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
const ssize_t loopSize = int(nChannels*chunkSize);
const ssize_t size = args->count;
#if defined(ENABLE_NPKIT)
int npKitCtxIdx = bid;
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (tid == 0) {
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined (ENABLE_TIMELINE)
int elems = 0;
size_t totalElems = 0;
uint64_t clkStamp = 0ULL;
struct ncclDevComm* comm = &ncclShmem.comm;
uint64_t entryStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_ALLREDUCE_ENTRY, 0, entryStamp, comm->cpuTimestamp);
#endif
if (loopSize > size)
chunkSize = divUp((int)size, int(nChannels*minChunkSize))*int(minChunkSize);
{ // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local)
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/0, Proto, 0> prims
(tid, nthreads, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg);
#ifdef HYGON_SDMA_FEATURE
prims.ringIx = tree->up;
INIT_PRIMS_SDMA(prims, args);
#endif
#if defined(ENABLE_NPKIT)
if (tid == 0) {
prims.npKitCtxIdx = npKitCtxIdx;
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
if (tree->up == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_RECV_REDUCE_COPY_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_RECV_REDUCE_COPY_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
}
}
else if (tree->down[0] == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_SEND_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.send(offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_SEND_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
}
}
else {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_RECV_REDUCE_SEND_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.recvReduceSend(offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_RECV_REDUCE_SEND_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
}
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
{ // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local)
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/0, Proto, 0> prims
(tid, nthreads, &tree->up, tree->down, args->sendbuff, args->recvbuff, args->redOpArg);
#ifdef HYGON_SDMA_FEATURE
prims.ringIx = tree->up;
INIT_PRIMS_SDMA(prims, args);
#endif
#if defined(ENABLE_NPKIT)
if (tid == 0) {
prims.npKitCtxIdx = npKitCtxIdx;
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
if (tree->up == -1) {
#ifdef HYGON_SDMA_FEATURE
// The output buffer may be cached memory; flush the cache or set HSA_DISABLE_CACHE=1.
prims.useSdmaCopy = 0;
#endif
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_SEND_FROM_OUTPUT_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.directSendFromOutput(offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_SEND_FROM_OUTPUT_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
}
}
else if (tree->down[0] == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_RECV_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.directRecv(offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_RECV_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
}
}
else {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_RECV_COPY_SEND_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.directRecvCopySend(offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_RECV_COPY_SEND_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
}
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
#if defined (ENABLE_TIMELINE)
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_ALLREDUCE_EXIT, totalElems*sizeof(T), __builtin_amdgcn_s_memrealtime() - entryStamp, comm->cpuTimestamp);
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runTreeSplit(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runTreeSplit(ncclWorkElem *args) {
#endif
const int tid = threadIdx.x;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclTree *tree = &ncclShmem.channel.tree;
ssize_t chunkSize = int(
Proto::Id != NCCL_PROTO_LL ? args->lastChunkSize
: Proto::calcBytePerStep()/sizeof(T));
const ssize_t minChunkSize = int(
Proto::Id == NCCL_PROTO_SIMPLE ? nthreads*8*(sizeof(uint64_t)/sizeof(T)) :
Proto::Id == NCCL_PROTO_LL ? nthreads*(Proto::calcBytePerGrain()/sizeof(T))
/* LL128 */ : nthreads*(Proto::calcBytePerGrain()/sizeof(T))/8);
const ssize_t loopSize = int(nChannels*chunkSize);
const ssize_t size = args->count;
int nthreadsSplit;
if (Proto::Id == NCCL_PROTO_SIMPLE) {
nthreadsSplit = nthreads/2;
if (nthreadsSplit >= 256) nthreadsSplit += 64;
} else { // LL & LL128
// Receiving from up to 3 sources is more compute intensive than sending
// to 3 dests. Use 70% for reduce and 30% for bcast.
nthreadsSplit = (nthreads*7/(10*WARP_SIZE))*WARP_SIZE;
}
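// Example arithmetic (illustrative): with nthreads=256 and WARP_SIZE=64,
// (256*7/(10*64))*64 = 128 threads reduce and 128 broadcast; the integer
// division rounds the 70% target down to a whole number of warps.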
#if defined(ENABLE_NPKIT)
bool isNpKitThread = false;
int npKitCtxIdx = 0;
if (threadIdx.x == 0) {
isNpKitThread = true;
npKitCtxIdx = bid * 2;
} else if (tree->up != -1 && threadIdx.x == nthreadsSplit) {
isNpKitThread = true;
npKitCtxIdx = bid * 2 + 1;
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (isNpKitThread) {
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined (ENABLE_TIMELINE)
int elems = 0;
size_t totalElems = 0;
uint64_t clkStamp = 0ULL;
struct ncclDevComm* comm = &ncclShmem.comm;
uint64_t entryStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_ALLREDUCE_ENTRY, 0, entryStamp, comm->cpuTimestamp);
#endif
if (loopSize > size)
chunkSize = divUp((int)size, nChannels*int(minChunkSize))*int(minChunkSize);
if (tree->up == -1) {
// Reduce and broadcast. Max number of recv is 2, max number of send is 2
Primitives<T, RedOp, FanSymmetric<NCCL_MAX_DEV_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nthreads, tree->down, tree->down, args->sendbuff, args->recvbuff, args->redOpArg);
#ifdef HYGON_SDMA_FEATURE
prims.ringIx = tree->up;
INIT_PRIMS_SDMA(prims, args);
#endif
#if defined(ENABLE_NPKIT)
if (isNpKitThread) {
prims.npKitCtxIdx = npKitCtxIdx;
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.directRecvReduceCopySend(offset, offset, nelem, /*postOp=*/true);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_RECV_REDUCE_COPY_SEND_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
else if (tid < nthreadsSplit) {
/* Reduce up. Max number of recv is 3, max number of send is 1 (binary tree + local).
* Why Direct=1?
* Answer: Because despite not performing any direct operations, the ctor
* must assume Direct so that it can exchange direct pointers with remote ctors
* that are Direct; otherwise it hangs. A cleaner solution would be to separate
* into DirectRecv and DirectSend capabilities: this ctor would have both=0,
* but the ctor above for tree roots would be DirectRecv=0 DirectSend=1.
*/
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DEV_ARITY, 1>, /*Direct=*/0, Proto, 0>
prims(tid, nthreadsSplit, tree->down, &tree->up, args->sendbuff, args->recvbuff, args->redOpArg, 0*Proto::MaxGroupWidth);
#ifdef HYGON_SDMA_FEATURE
prims.ringIx = tree->up;
INIT_PRIMS_SDMA(prims, args);
#endif
#if defined(ENABLE_NPKIT)
if (isNpKitThread) {
prims.npKitCtxIdx = npKitCtxIdx;
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
if (tree->down[0] == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_SEND_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.send(offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_SEND_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
}
}
else {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_RECV_REDUCE_SEND_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.recvReduceSend(offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_RECV_REDUCE_SEND_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
}
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
else {
// Broadcast down. Max number of recv is 1, max number of send is 3 (binary tree + local)
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DEV_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &tree->up, tree->down, args->sendbuff, args->recvbuff,
args->redOpArg, 1*Proto::MaxGroupWidth);
#ifdef HYGON_SDMA_FEATURE
prims.ringIx = tree->up;
INIT_PRIMS_SDMA(prims, args);
#endif
#if defined(ENABLE_NPKIT)
if (isNpKitThread) {
prims.npKitCtxIdx = npKitCtxIdx;
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
if (tree->down[0] == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_RECV_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.directRecv(offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_RECV_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
}
}
else {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_RECV_COPY_SEND_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.directRecvCopySend(offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_DIRECT_RECV_COPY_SEND_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
}
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT, size*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
#if defined (ENABLE_TIMELINE)
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_ALLREDUCE_EXIT, totalElems*sizeof(T), __builtin_amdgcn_s_memrealtime() - entryStamp, comm->cpuTimestamp);
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
}
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
using Proto = ProtoSimple<ALLREDUCE_CHUNKSTEPS/ALLREDUCE_SLICESTEPS, ALLREDUCE_SLICESTEPS>;
runRing<T, RedOp, Proto>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runTreeUpDown<T, RedOp, ProtoSimple<1, 1>>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
static constexpr int COLLNET_COPY_THREADS = 64;
const int tid = threadIdx.x;
const int bid = args->bid;
const int nChannels = args->nChannels;
struct ncclDirect* direct = &ncclShmem.channel.collnetDirect;
const ssize_t chunkSize = int(args->lastChunkSize);
const ssize_t size = args->count;
const ssize_t loopSize = nChannels*direct->nHeads*chunkSize;
const int hasUp = (direct->up[0] >= 0) ? 1 : 0;
const int hasDn = (direct->down[0] >= 0) ? 1 : 0;
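// Thread-block layout (a reading aid, matching the tidStart* math and branch
// conditions below): [0, tidStartBcast) gather, [tidStartBcast, tidStartScatter)
// broadcast, [tidStartScatter, tidStartReduce) scatter, remainder reduce.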
const int nThreadsScatter = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 2*COLLNET_COPY_THREADS : 0);
const int nThreadsGather = ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 1*COLLNET_COPY_THREADS : 0);
const int nThreadsBcast = WARP_SIZE + ((hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 0 : 1*COLLNET_COPY_THREADS);
const int nThreadsReduce = args->nWarps*WARP_SIZE - nThreadsScatter - nThreadsGather - nThreadsBcast;
const int tidStartBcast = nThreadsGather;
const int tidStartScatter = tidStartBcast + nThreadsBcast;
const int tidStartReduce = tidStartScatter + nThreadsScatter;
using Proto = ProtoSimple<1, 1>;
if (tid >= tidStartScatter && tid < tidStartReduce && hasUp) {
// Scatter
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid-tidStartScatter, nThreadsScatter, NULL, direct->up, args->sendbuff, args->recvbuff,
args->redOpArg, 2*Proto::MaxGroupWidth, 1, 1, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
int nelem = min(direct->nHeads*chunkSize, size-offset);
if (args->regUsed) {
prims.directScatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
} else {
prims.scatter(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
}
}
} else if (tid >= tidStartReduce && direct->out != -1) {
if (hasDn) {
// Reduce, send to network
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 1>, /*Direct=*/0, Proto, 0>
prims(tid-tidStartReduce, nThreadsReduce, direct->down, &direct->out, args->sendbuff, args->recvbuff,
args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
if (args->regUsed) {
prims.directRecvReduceSend(offset, nelem);
} else {
prims.recvReduceSend(offset, nelem);
}
}
} else {
// Directly send to network
Primitives<T, RedOp, FanAsymmetric<0, 1>, /*Direct=*/0, Proto, 0>
prims(tid-tidStartReduce, nThreadsReduce, nullptr, &direct->out, args->sendbuff, args->recvbuff,
args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.send(offset, nelem);
}
}
} else if (tid < tidStartBcast && hasUp) {
// Gather
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_DIRECT_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsGather, direct->up, NULL, args->sendbuff, args->recvbuff,
args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*direct->nHeads*chunkSize;
int nelem = min(direct->nHeads*chunkSize, size-offset);
prims.directGather(offset, nelem, chunkSize, chunkSize, direct->headRank, direct->shift);
}
} else if (tid >= tidStartBcast && tid < tidStartScatter && direct->out != -1) {
if (hasDn) {
// Recv from network, broadcast
Primitives<T, RedOp, FanAsymmetric<1, NCCL_MAX_DIRECT_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid-tidStartBcast, nThreadsBcast, &direct->out, direct->down, args->sendbuff, args->recvbuff,
args->redOpArg, 1*Proto::MaxGroupWidth, 0, 0, args);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvCopyDirectSend(offset, nelem, /*postOp=*/true);
}
} else {
// Recv from network (no post thread needed)
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
prims(tid-tidStartBcast, nThreadsBcast, &direct->out, nullptr, args->sendbuff, args->recvbuff,
args->redOpArg, 1*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*direct->nHeads+direct->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recv(offset, nelem, /*postOp=*/true);
}
}
}
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
#if NCCL_NVLS_ENABLED
const int tid = threadIdx.x;
const int bid = args->bid;
const int nChannels = args->nChannels;
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
const ssize_t chunkSize = int(args->lastChunkSize);
const ssize_t size = args->count;
const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
const int nranks = ncclShmem.comm.nRanks;
const bool hasOut = nvls->out != -1;
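// Warp budget: the reduce/bcast warp counts are fixed by whether this rank also
// talks to the network (hasOut); the remaining warps are split roughly evenly
// between scatter and gather, with scatter taking the odd warp.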
const int reduceWarps = hasOut ? 3 : nranks <= 6 ? 7 : 5;
const int bcastWarps = hasOut ? 2 : 0;
const int scatterWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps + 1)/2;
const int gatherWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps)/2;
const int nThreadsScatter = scatterWarps*WARP_SIZE;
const int nThreadsGather = gatherWarps*WARP_SIZE;
const int nThreadsReduce = reduceWarps*WARP_SIZE;
const int nThreadsBcast = (bcastWarps)*WARP_SIZE;
const int tidEndScatter = nThreadsScatter;
const int tidEndGather = tidEndScatter + nThreadsGather;
const int tidEndReduce = tidEndGather + nThreadsReduce;
const int tidEndBcast = tidEndReduce + nThreadsBcast;
if (tid < tidEndScatter) {
// Scatter
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
int nelem = min(nvls->nHeads*chunkSize, size-offset);
prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndGather) {
// Gather
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
int nelem = min(nvls->nHeads*chunkSize, size-offset);
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndReduce && nvls->headRank != -1) {
if (!hasOut) {
// Reduce, broadcast through NVLS
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->down, NULL, NULL,
args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
}
} else {
// Reduce, send to network
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsReduce, &nvls->down, &nvls->out, NULL, NULL,
args->redOpArg, 2*Proto::MaxGroupWidth, 0, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
}
}
} else if (tid < tidEndBcast && nvls->headRank != -1) {
// Recv from network, broadcast
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndReduce, nThreadsBcast, &nvls->out, &nvls->down, NULL, NULL,
args->redOpArg, 3*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
}
}
#endif // NCCL_NVLS_ENABLED
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_NVLS_TREE, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
#if NCCL_NVLS_ENABLED
const int tid = threadIdx.x;
const int bid = args->bid;
const int nChannels = args->nChannels;
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
const int treeUp = nvls->treeUp;
const int* treeDown = nvls->treeDown;
const ssize_t chunkSize = int(args->lastChunkSize);
const ssize_t size = args->count;
const ssize_t loopSize = nChannels*nvls->nHeads*chunkSize;
const int nranks = ncclShmem.comm.nRanks;
const bool hasUp = treeUp != -1;
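// Warp budget: as in the NVLS case, the reduce/bcast warp counts depend on
// whether this head also forwards up the inter-node tree (hasUp); leftover
// warps split roughly evenly between scatter and gather.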
const int reduceWarps = hasUp ? 5 : nranks <= 6 ? 7 : 5;
const int bcastWarps = hasUp ? 4 : 0;
const int scatterWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps + 1)/2;
const int gatherWarps = ((NCCL_MAX_NTHREADS/WARP_SIZE) - reduceWarps - bcastWarps)/2;
const int nThreadsScatter = scatterWarps*WARP_SIZE;
const int nThreadsGather = gatherWarps*WARP_SIZE;
const int nThreadsReduce = reduceWarps*WARP_SIZE;
const int nThreadsBcast = (bcastWarps)*WARP_SIZE;
const int tidEndScatter = nThreadsScatter;
const int tidEndGather = tidEndScatter + nThreadsGather;
const int tidEndReduce = tidEndGather + nThreadsReduce;
const int tidEndBcast = tidEndReduce + nThreadsBcast;
if (tid < tidEndScatter) {
// Scatter
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
args->redOpArg, 0*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
int nelem = min(nvls->nHeads*chunkSize, size-offset);
prims.scatter(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndGather) {
// Gather
using Proto = ProtoSimple<1, 1, COLL_UNROLL>;
Primitives<T, RedOp, FanAsymmetric<NCCL_MAX_NVLS_ARITY, 0>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndScatter, nThreadsGather, nvls->up, NULL, NULL, args->recvbuff,
args->redOpArg, 1*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*nvls->nHeads*chunkSize;
int nelem = min(nvls->nHeads*chunkSize, size-offset);
prims.gather(offset, nelem, chunkSize, chunkSize, -1, 0);
}
} else if (tid < tidEndReduce && nvls->headRank != -1) {
if (!hasUp) {
// Reduce and Broadcast
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 1>;
Primitives<T, RedOp, FanSymmetric<3>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsReduce, treeDown, treeDown, NULL, NULL,
args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
}
} else {
// Reduce, send to network
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 1, 0>;
Primitives<T, RedOp, FanAsymmetric<3, 1>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndGather, nThreadsReduce, treeDown, &treeUp, NULL, NULL,
args->redOpArg, 2*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
}
}
} else if (tid < tidEndBcast && nvls->headRank != -1) {
// Recv from network, broadcast
using Proto = ProtoSimple<1, 1, COLL_UNROLL, 0, 1>;
Primitives<T, RedOp, FanAsymmetric<1, 3>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndReduce, nThreadsBcast, &treeUp, treeDown, NULL, NULL,
args->redOpArg, 3*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + (bid*nvls->nHeads+nvls->headRank)*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recvSend(nelem);
}
}
#endif // NCCL_NVLS_ENABLED
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_COLLNET_CHAIN, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclTree *tree = &ncclShmem.channel.collnetChain;
ssize_t chunkSize = int(args->lastChunkSize);
const ssize_t loopSize = int(nChannels*chunkSize);
const int nranks = ncclShmem.comm.nRanks;
const ssize_t size = args->count;
int nthreadsSplit = nthreads/2;
if (nthreadsSplit >= 256) nthreadsSplit += 64;
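// Bias the split toward the reduce-up half once the block is large enough,
// presumably because recvReduceSend is the heavier of the two pipelines.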
int group, connIndex, send, recv, groupTid, groupNthreads;
using Proto = ProtoSimple<1, 1>;
if (tid < nthreadsSplit) {
// Reduce up the chain
group = 0;
connIndex = 1;
recv = tree->down[0];
send = tree->up;
groupTid = tid;
groupNthreads = nthreadsSplit;
} else {
// Broadcast down the chain
group = 1;
connIndex = 0;
recv = tree->up;
send = tree->down[0];
groupTid = tid - nthreadsSplit;
groupNthreads = nthreads-nthreadsSplit;
}
Primitives<T, RedOp, FanSymmetric<1>, /*Direct=*/1, Proto, 0>
prims(groupTid, groupNthreads, &recv, &send, args->sendbuff, args->recvbuff,
args->redOpArg, group*Proto::MaxGroupWidth, connIndex, connIndex);
if (tid < nthreadsSplit) {
if (recv == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.send(offset, nelem);
}
} else {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.recvReduceSend(offset, nelem);
}
}
}
else {
if (recv == nranks) {
// I'm the first in the broadcast chain, so I need to apply the division (postOp)
if (send == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.recv(offset, nelem, /*postOp*/true);
}
} else {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.recvCopyDirectSend(offset, nelem, /*postOp*/true);
}
}
} else {
if (send == -1) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.directRecv(offset, nelem);
}
} else {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*int(chunkSize);
int nelem = min(chunkSize, size-offset);
prims.directRecvCopySend(offset, nelem);
}
}
}
}
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runTreeSplit<T, RedOp, ProtoLL>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL128>(args);
//LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, RedOp, T, args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllReduce, T, RedOp, NCCL_ALGO_TREE, NCCL_PROTO_LL128> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runTreeSplit<T, RedOp, ProtoLL128>(args);
//LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, RedOp, T, args);
}
};
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "alltoall_pivot.h"
#include "common.h"
#include "collectives.h"
IMPL_COLL_F(AllToAllPivot);
/*************************************************************************
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "devcomm.h"
#include "collectives.h"
#include "primitives.h"
namespace {
template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runRing(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
#endif
const int tid = threadIdx.x;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nranks = ncclShmem.comm.nRanks;
const ncclRing *ring = &ncclShmem.channel.ring;
const int num_bi_rings = args->pivotA2ANumBiRings;
const int num_uni_rings = num_bi_rings * 2;
const int num_chunks = args->nChannels / 2;
const int chunk_id = (bid % num_bi_rings) + (bid / num_uni_rings * num_bi_rings);
const int elem_size = min(256, args->count & (~(args->count) + 1));
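// count & (~count + 1) == count & -count isolates the lowest set bit of count,
// i.e. the largest power-of-two granularity that divides it, capped at 256.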
const ssize_t num_elems = args->count / elem_size;
const int num_padding_chunks = num_elems % num_chunks;
const ssize_t chunk_offset = elem_size * (num_elems / num_chunks * chunk_id + (chunk_id < num_padding_chunks ? chunk_id : num_padding_chunks));
const ssize_t chunk_size = elem_size * (num_elems / num_chunks + (chunk_id < num_padding_chunks ? 1 : 0));
const int pivot_direction = (bid % num_uni_rings) / num_bi_rings;
const ssize_t prims_size = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? ALLTOALL_PIVOT_CHUNKSTEPS : 1));
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0> prims
(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, /*redOpArg(ignored)=*/0);
#ifdef HYGON_SDMA_FEATURE
prims.ringIx = ring->index;
INIT_PRIMS_SDMA(prims, args);
#endif
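// Pivot all-to-all: each chunk travels at most nranks/2 hops around the ring;
// when the same peer is reachable in either direction (src_rank == dst_rank
// below), the two ring directions each carry half of that chunk, selected by
// pivot_direction.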
for (int num_hops = 0; num_hops <= nranks / 2; num_hops++) {
const int src_rank = ring->userRanks[(nranks - num_hops) % nranks];
const int dst_rank = ring->userRanks[num_hops];
const ssize_t send_offset =
dst_rank * num_elems * elem_size + chunk_offset +
(src_rank == dst_rank ? pivot_direction * chunk_size / 2 : 0);
const ssize_t recv_offset =
src_rank * num_elems * elem_size + chunk_offset +
(src_rank == dst_rank ? pivot_direction * chunk_size / 2 : 0);
const ssize_t send_recv_size =
src_rank == dst_rank ?
(pivot_direction == 0 ? chunk_size / 2 : chunk_size - chunk_size / 2) : chunk_size;
if (num_hops == 0 && args->sendbuff != args->recvbuff) {
const T* sendbuff = (const T*)args->sendbuff + send_offset;
T* recvbuff = (T *)args->recvbuff + recv_offset;
reduceCopy<COLL_UNROLL, RedOp, T, 0,1, 1, 0, 1, 1, 0>(
tid, nthreads, 0, nullptr, false, 1, (void **)&sendbuff, 1, (void **)&recvbuff, send_recv_size);
} else {
for (ssize_t prims_offset = 0; prims_offset < send_recv_size; prims_offset += prims_size) {
const int prims_nelem = min(prims_size, send_recv_size - prims_offset);
// step 0: send
prims.send(send_offset + prims_offset, prims_nelem);
// num_hops - 1 steps: recv and copy to next gpu
for (int i = 0; i < num_hops - 1; i++) {
prims.recvSend(prims_nelem);
}
// final step: recv
prims.directRecv(recv_offset + prims_offset, prims_nelem);
}
}
}
}
}
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncAllToAllPivot, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
using Proto = ProtoSimple<ALLTOALL_PIVOT_CHUNKSTEPS/ALLTOALL_PIVOT_SLICESTEPS, ALLTOALL_PIVOT_SLICESTEPS>;
runRing<T, RedOp, Proto>(args);
}
};
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "broadcast.h"
#include "common.h"
#include "collectives.h"
IMPL_COLL_C(Broadcast);
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "devcomm.h"
#include "collectives.h"
#include "primitives.h"
namespace {
template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runRing(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
#endif
const int tid = threadIdx.x;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem.channel.ring;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? BROADCAST_CHUNKSTEPS : 1));
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->count;
const int rank = ring->userRanks[0];
const int nextRank = ring->userRanks[1];
const int root = args->root;
#if defined(ENABLE_NPKIT)
int npKitCtxIdx = bid;
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (tid == 0) {
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined (ENABLE_TIMELINE)
int elems = 0, totalElems = 0;
uint64_t clkStamp = 0ULL;
struct ncclDevComm* comm = &ncclShmem.comm;
uint64_t entryStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_BROADCAST_ENTRY, 0, entryStamp, comm->cpuTimestamp);
#endif
T *inputBuf = (T*)args->sendbuff;
T *outputBuf = (T*)args->recvbuff;
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
prims(tid, nthreads, &ring->prev, &ring->next, inputBuf, outputBuf, args->redOpArg, 0, args->connIndex, args->connIndex);
#ifdef HYGON_SDMA_FEATURE
prims.ringIx = ring->index;
INIT_PRIMS_SDMA(prims, args);
#endif
#if defined(ENABLE_NPKIT)
if (tid == 0) {
prims.npKitCtxIdx = npKitCtxIdx;
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_BROADCAST_RING_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_BROADCAST_RING_ENTRY, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
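// Per-protocol chunk sizing: SIMPLE spreads the remainder evenly across channels
// and rounds up to 8 bytes per thread; LL uses the precomputed lastChunkSize on
// the final partial loop; LL128 rounds to a multiple of the LL128 minimum chunk.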
ssize_t realChunkSize;
if (Proto::Id == NCCL_PROTO_SIMPLE) {
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels));
realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
}
else if (Proto::Id == NCCL_PROTO_LL)
realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
else if (Proto::Id == NCCL_PROTO_LL128)
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128);
realChunkSize = int(realChunkSize);
ssize_t offset = gridOffset + int(bid*realChunkSize);
int nelem = min(realChunkSize, size-offset);
if (rank == root) {
if (inputBuf == outputBuf) {
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_SEND_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.send(offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_SEND_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
} else {
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_COPY_SEND_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_BROADCAST_RING_COPY_SEND_ENTRY)
if (threadIdx.x == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_BROADCAST_RING_COPY_SEND_ENTRY, max(0, nelem)*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + blockIdx.x);
prims.npKitDataProcessTotalTime = 0;
}
#endif
prims.copySend(offset, offset, nelem);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_BROADCAST_RING_COPY_SEND_EXIT)
if (threadIdx.x == 0)
NpKit::CollectGpuEvent(NPKIT_EVENT_BROADCAST_RING_COPY_SEND_EXIT, max(0, nelem)*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + blockIdx.x);
#endif
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_COPY_SEND_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
}
} else if (nextRank == root) {
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_RECV_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.recv(offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_RECV_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
} else {
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_RECV_COPY_SEND_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.recvCopySend(offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_RECV_COPY_SEND_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
}
}
#if defined (ENABLE_TIMELINE)
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_BROADCAST_EXIT, totalElems*sizeof(T), __builtin_amdgcn_s_memrealtime() - entryStamp, comm->cpuTimestamp);
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_BROADCAST_RING_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_BROADCAST_RING_EXIT, size*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
}
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
using Proto = ProtoSimple<BROADCAST_CHUNKSTEPS/BROADCAST_SLICESTEPS, BROADCAST_SLICESTEPS>;
runRing<T, RedOp, Proto>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncBroadcast, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL128>(args);
}
};
/*************************************************************************
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_DEVICE_COMMON_H_
#define NCCL_DEVICE_COMMON_H_
#include "collectives.h"
#include "devcomm.h"
#if defined (ENABLE_TIMELINE)
#include "timeline/timeline.h"
#endif
#if defined(__gfx908__)
#define COLL_UNROLL 2
#else
#define COLL_UNROLL 4
#endif
#define NCCL_MAX_DEV_ARITY (NCCL_MAX_TREE_ARITY-1) // Using balanced tree instead of split tree
#define __syncwarp()
#define __synclds() \
asm volatile("s_waitcnt lgkmcnt(0) \n s_barrier");
#ifdef __GFX9__
#define STORE(DST, SRC) \
{ __threadfence(); __atomic_store_n((DST), (SRC), __ATOMIC_RELAXED); }
#else
#define STORE(DST, SRC) \
{ __atomic_store_n((DST), (SRC), __ATOMIC_SEQ_CST); }
#endif
#ifdef ENABLE_LL128
#define NCCL_FUNC5(func, algo, devredop, type, nullify) \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL, devredop, type)), \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL128, devredop, type)), \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, SIMPLE, devredop, type))
#else
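// Without LL128 support the LL128 table slot falls back to the LL
// implementation, keeping the function-index layout identical either way.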
#define NCCL_FUNC5(func, algo, devredop, type, nullify) \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL, devredop, type)), \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL, devredop, type)), \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, SIMPLE, devredop, type))
#endif
#define NCCL_FUNC4(func, devredop, type, nullify) \
NCCL_FUNC5(func, TREE, devredop, type, nullify), \
NCCL_FUNC5(func, RING, devredop, type, nullify), \
NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, nullify), \
NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify), \
NCCL_FUNC5(func, NVLS, devredop, type, nullify), \
NCCL_FUNC5(func, NVLS_TREE, devredop, type, nullify)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(func, devredop, nullForFloat) \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, uint8_t, 0), \
NCCL_FUNC4(func, devredop, int32_t, 0), \
NCCL_FUNC4(func, devredop, uint32_t, 0), \
NCCL_FUNC4(func, devredop, int64_t, 0), \
NCCL_FUNC4(func, devredop, uint64_t, 0), \
NCCL_FUNC4(func, devredop, half, nullForFloat), \
NCCL_FUNC4(func, devredop, float, nullForFloat), \
NCCL_FUNC4(func, devredop, double, nullForFloat), \
NCCL_FUNC4(func, devredop, rccl_bfloat16, nullForFloat)
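// Copy-only collectives ignore the element type, so every datatype slot maps to
// the int8_t instantiation.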
#define NCCL_FUNCS3B(func, devredop) \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0)
// Must be consistent with ncclRedOp_t
#define NCCL_FUNCS2A(func) \
NCCL_FUNCS3A(func, Sum, /*nullForFloat=*/0), \
NCCL_FUNCS3A(func, Prod, /*nullForFloat=*/0), \
NCCL_FUNCS3A(func, Max, /*nullForFloat=*/0), \
NCCL_FUNCS3A(func, Min, /*nullForFloat=*/0), \
NCCL_FUNCS3A(func, PreMulSum, /*nullForFloat=*/0), \
NCCL_FUNCS3A(func, SumPostDiv, /*nullForFloat=*/1)
#define NCCL_FUNCS2B(func) \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum)
// Must be consistent with the ncclFuncSet enum
using ncclKernelFunc_t = void (*)();
static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{
// Don't try to initialize the host shadow copy of this device-side global
// variable. There is no host pointer to a device-side function, which
// confuses clang. This will be fixed in the next clang release.
#if defined(__HIP_DEVICE_COMPILE__)
#if defined(BUILD_ALLREDUCE_ONLY)
NCCL_FUNC4(AllReduce, Sum, float, 0),
#else
NCCL_FUNCS2B(Broadcast),
NCCL_FUNCS2A(Reduce),
NCCL_FUNCS2B(AllGather),
NCCL_FUNCS2A(ReduceScatter),
NCCL_FUNCS2A(AllReduce),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, half),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, float),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, double),
#if defined(RCCL_BFLOAT16)
NCCL_ONERANK_REDUCE_NAME(PreMulSum, rccl_bfloat16),
#endif
NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
NCCL_FUNC_NAME(AllToAllPivot, RING, SIMPLE, Sum, int8_t),
#endif
#endif
};
static_assert(FUNC_INDEX_P2P == 5410, "Wrong P2P function index");
static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 5411, "Wrong AllToAllPivot function index");
#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
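// Compile-time binary search over [f, l): each instantiation splits the index
// range in half, expanding into a balanced branch tree that dispatches
// funcIndex to ncclFuncs[funcIndex] without an indirect call.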
template<unsigned short f, unsigned short l, bool u>
struct Caller {
static __forceinline__ __device__ __host__
void call(unsigned short funcIndex) noexcept
{
constexpr unsigned short m = f + (l - f) / 2;
return (funcIndex < m) ? Caller<f, m, u>::call(funcIndex) : Caller<m, l, u>::call(funcIndex);
}
};
template<unsigned short f, bool u>
struct Caller<f, f + 1, u>{
static __forceinline__ __device__ __host__
void call(unsigned short funcIndex) noexcept { ncclFuncs[f](); }
};
template<bool USING_LL128>
__forceinline__
__device__
void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept {
#if defined(BUILD_ALLREDUCE_ONLY)
if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE))
ncclFunction_AllReduce_RING_SIMPLE_Sum_float();
else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_RING, NCCL_PROTO_LL))
ncclFunction_AllReduce_RING_LL_Sum_float();
else if (USING_LL128 && funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_RING, NCCL_PROTO_LL128))
ncclFunction_AllReduce_RING_LL128_Sum_float();
else if (!USING_LL128 && funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_RING, NCCL_PROTO_LL128))
ncclFunction_AllReduce_RING_LL_Sum_float();
else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_TREE, NCCL_PROTO_SIMPLE))
ncclFunction_AllReduce_TREE_SIMPLE_Sum_float();
else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_TREE, NCCL_PROTO_LL))
ncclFunction_AllReduce_TREE_LL_Sum_float();
else if (USING_LL128 && funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_TREE, NCCL_PROTO_LL128))
ncclFunction_AllReduce_TREE_LL128_Sum_float();
else if (!USING_LL128 && funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_TREE, NCCL_PROTO_LL128))
ncclFunction_AllReduce_TREE_LL_Sum_float();
else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_SIMPLE))
ncclFunction_AllReduce_COLLNET_DIRECT_SIMPLE_Sum_float();
else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_LL))
ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_float();
else if (USING_LL128 && funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_LL128))
ncclFunction_AllReduce_COLLNET_DIRECT_LL128_Sum_float();
else if (!USING_LL128 && funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET_DIRECT, NCCL_PROTO_LL128))
ncclFunction_AllReduce_COLLNET_DIRECT_LL_Sum_float();
else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET_CHAIN, NCCL_PROTO_SIMPLE))
ncclFunction_AllReduce_COLLNET_CHAIN_SIMPLE_Sum_float();
else if (funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET_CHAIN, NCCL_PROTO_LL))
ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_float();
else if (USING_LL128 && funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET_CHAIN, NCCL_PROTO_LL128))
ncclFunction_AllReduce_COLLNET_CHAIN_LL128_Sum_float();
else if (!USING_LL128 && funcIndex == FUNC_INDEX(ncclFuncAllReduce, ncclSum, ncclFloat32, NCCL_ALGO_COLLNET_CHAIN, NCCL_PROTO_LL128))
ncclFunction_AllReduce_COLLNET_CHAIN_LL_Sum_float();
else
assert(false && "Unsupported function index"); // a bare assert("...") is always true and never fires
#else
if (funcIndex < 1080) {
if (funcIndex % 18 == 0) ncclFunction_Broadcast_TREE_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 18 == 1) ncclFunction_Broadcast_TREE_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 18 == 1) ncclFunction_Broadcast_TREE_LL_Sum_int8_t();
else if (funcIndex % 18 == 2) ncclFunction_Broadcast_TREE_SIMPLE_Sum_int8_t();
else if (funcIndex % 18 == 3) ncclFunction_Broadcast_RING_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 18 == 4) ncclFunction_Broadcast_RING_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 18 == 4) ncclFunction_Broadcast_RING_LL_Sum_int8_t();
else if (funcIndex % 18 == 5) ncclFunction_Broadcast_RING_SIMPLE_Sum_int8_t();
else if (funcIndex % 18 == 6) ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 18 == 7) ncclFunction_Broadcast_COLLNET_DIRECT_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 18 == 7) ncclFunction_Broadcast_COLLNET_DIRECT_LL_Sum_int8_t();
else if (funcIndex % 18 == 8) ncclFunction_Broadcast_COLLNET_DIRECT_SIMPLE_Sum_int8_t();
else if (funcIndex % 18 == 9) ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 18 == 10) ncclFunction_Broadcast_COLLNET_CHAIN_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 18 == 10) ncclFunction_Broadcast_COLLNET_CHAIN_LL_Sum_int8_t();
else ncclFunction_Broadcast_COLLNET_CHAIN_SIMPLE_Sum_int8_t();
}
else if (funcIndex < 2160) Caller<1080, 2160, USING_LL128>::call(funcIndex);
else if (funcIndex < 3240) {
if (funcIndex % 18 == 0) ncclFunction_AllGather_TREE_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 18 == 1) ncclFunction_AllGather_TREE_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 18 == 1) ncclFunction_AllGather_TREE_LL_Sum_int8_t();
else if (funcIndex % 18 == 2) ncclFunction_AllGather_TREE_SIMPLE_Sum_int8_t();
else if (funcIndex % 18 == 3) ncclFunction_AllGather_RING_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 18 == 4) ncclFunction_AllGather_RING_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 18 == 4) ncclFunction_AllGather_RING_LL_Sum_int8_t();
else if (funcIndex % 18 == 5) ncclFunction_AllGather_RING_SIMPLE_Sum_int8_t();
else if (funcIndex % 18 == 6) ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 18 == 7) ncclFunction_AllGather_COLLNET_DIRECT_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 18 == 7) ncclFunction_AllGather_COLLNET_DIRECT_LL_Sum_int8_t();
else if (funcIndex % 18 == 8) ncclFunction_AllGather_COLLNET_DIRECT_SIMPLE_Sum_int8_t();
else if (funcIndex % 18 == 9) ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_t();
else if (USING_LL128 && funcIndex % 18 == 10) ncclFunction_AllGather_COLLNET_CHAIN_LL128_Sum_int8_t();
else if (!USING_LL128 && funcIndex % 18 == 10) ncclFunction_AllGather_COLLNET_CHAIN_LL_Sum_int8_t();
else ncclFunction_AllGather_COLLNET_CHAIN_SIMPLE_Sum_int8_t();
}
else if (funcIndex < 5400) Caller<3240, 5400, USING_LL128>::call(funcIndex);
else {
switch (funcIndex - 5400) {
case 0:
ncclFunction_OneRankReduce_PreMulSum_int8_t();
break;
case 1:
ncclFunction_OneRankReduce_PreMulSum_uint8_t();
break;
case 2:
ncclFunction_OneRankReduce_PreMulSum_int32_t();
break;
case 3:
ncclFunction_OneRankReduce_PreMulSum_uint32_t();
break;
case 4:
ncclFunction_OneRankReduce_PreMulSum_int64_t();
break;
case 5:
ncclFunction_OneRankReduce_PreMulSum_uint64_t();
break;
case 6:
ncclFunction_OneRankReduce_PreMulSum_half();
break;
case 7:
ncclFunction_OneRankReduce_PreMulSum_float();
break;
case 8:
ncclFunction_OneRankReduce_PreMulSum_double();
break;
case 9:
ncclFunction_OneRankReduce_PreMulSum_rccl_bfloat16();
break;
case 10:
ncclFunction_SendRecv_RING_SIMPLE_Sum_int8_t();
break;
case 11:
ncclFunction_AllToAllPivot_RING_SIMPLE_Sum_int8_t();
break;
default:
break;
}
}
#endif
}
#endif
template <ncclFunc_t FUNCTION, int ALGO, int PROTO, class REDOP, typename T, int UNROLL>
class ncclFunction {
public:
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {}
#else
__device__ void run(struct ncclWorkElem* args) {}
#endif
};
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__)
#define __trace_hwreg()
#else
#define __trace_hwreg() \
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (collTrace->data_0));
#endif
#ifdef ENABLE_COLLTRACE
#define INC_COLL_TRACE \
uint32_t pos = atomicAdd(&ncclShmem.collTraceTail->tail, 1)%COLLTRACE_NUM_ITEMS; \
struct ncclCollTrace* collTrace = ncclShmem.collTrace+pos; \
collTrace->timeStamp = wall_clock64(); \
collTrace->bid = blockIdx.x;
// TODO: switch to atomicInc after llvm crash is fixed
// uint32_t pos = atomicInc(&ncclShmem.collTraceTail->tail, COLLTRACE_NUM_ITEMS)
#define traceKernelLaunch(launch_type) { \
INC_COLL_TRACE \
collTrace->funcIndex = ncclShmem.work.header.funcIndex; \
__trace_hwreg()\
if (ncclShmem.work.header.type == ncclWorkTypeP2p) { \
struct ncclWorkElemP2p *p2pElems = ncclShmem.work.p2pElems; \
collTrace->p2p[0].connIndex = 0; \
collTrace->p2pOpCount[0] = p2pElems[0].opCount; \
collTrace->p2p[0].ngroups = p2pElems[0].ngroups; \
collTrace->p2p[0].nWarps = p2pElems[0].nWarps; \
collTrace->p2p[0].warpStart = p2pElems[0].warpStart; \
collTrace->p2p[0].peer = p2pElems[0].p2pType == ncclWorkP2pTypeRecv ? (uint16_t)(p2pElems[0].peer) : -1; \
collTrace->p2p[1].connIndex = 0; \
collTrace->p2pOpCount[1] = p2pElems[1].opCount; \
collTrace->p2p[1].ngroups = p2pElems[1].ngroups; \
collTrace->p2p[1].nWarps = p2pElems[1].nWarps; \
collTrace->p2p[1].warpStart = p2pElems[1].warpStart; \
collTrace->p2p[1].peer = p2pElems[1].p2pType == ncclWorkP2pTypeSend ? (uint16_t)(p2pElems[1].peer) : -1; \
collTrace->type = (launch_type) | ncclCollTraceP2pElemType; \
} else if (ncclShmem.work.header.type == ncclWorkTypeColl) { \
struct ncclWorkElem *elems = ncclShmem.work.elems; \
collTrace->opCount = elems[0].opCount; \
collTrace->coll.nWarps = elems[0].nWarps; \
collTrace->coll.bid = elems[0].bid; \
collTrace->coll.nChannels = elems[0].nChannels; \
collTrace->type = (launch_type) | ncclCollTraceCollElemType; \
} \
}
#define traceKernelEnd(end_type) { \
INC_COLL_TRACE \
if (ncclShmem.work.header.type == ncclWorkTypeP2p) { \
struct ncclWorkElemP2p *p2pElems = ncclShmem.work.p2pElems; \
collTrace->p2pOpCount[0] = p2pElems[0].opCount; \
collTrace->p2pOpCount[1] = p2pElems[1].opCount; \
} else if (ncclShmem.work.header.type == ncclWorkTypeColl) { \
struct ncclWorkElem *elems = ncclShmem.work.elems; \
collTrace->opCount = elems[0].opCount; \
} \
collTrace->type = end_type; \
}
#define traceData(data2, data4, data8_0, data8_1) { \
INC_COLL_TRACE \
collTrace->funcIndex = data2; \
collTrace->data_0 = data4; \
collTrace->opCount = data8_0; \
collTrace->data_1 = data8_1; \
collTrace->type = ncclCollTraceDataType; \
}
#else
#define traceKernelLaunch(launch_type)
#define traceKernelEnd(end_type)
#define traceData(data2, data4, data8_0, data8_1)
#endif
struct ncclShmemGroup {
ncclConnInfo *recvConns[NCCL_MAX_NVLS_ARITY];
ncclConnInfo *sendConns[NCCL_MAX_NVLS_ARITY];
void* srcs[NCCL_MAX_NVLS_ARITY+1];
void* dsts[NCCL_MAX_NVLS_ARITY+1];
uint64_t barrier;
uint64_t barrier_next[NCCL_MAX_GROUPS];
};
#define LDS_NUM_EVENTS 64
struct ncclShmemData {
struct ncclShmemGroup groups[NCCL_MAX_GROUPS];
uint64_t redOpArgs[NCCL_MAX_NVLS_ARITY+1];
int channelId;
int aborted;
alignas(16) struct ncclDevComm comm;
alignas(16) struct ncclDevChannel channel;
alignas(16) struct ncclWork work;
#ifdef ENABLE_COLLTRACE
struct ncclCollTrace* collTrace;
union ncclCollTraceTail* collTraceTail;
#endif
#ifdef ENABLE_PROFILING
struct ncclProf prof;
#endif
#if defined(ENABLE_NPKIT)
NpKitEvent event_buffer[LDS_NUM_EVENTS];
uint64_t event_buffer_head;
#endif
};
static_assert(offsetof(struct ncclShmemData, work)%16 == 0, "ncclShmem.work needs to be 16B aligned");
extern __shared__ ncclShmemData ncclShmem;
#if __CUDA_ARCH__ >= 700
extern __shared__ ulong2 ncclShmemPerWarp[/*ncclShmemDynamicSize()/sizeof(ulong2)*/];
#else
extern __shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)];
#endif
__device__ inline void* ncclScratchForWarp(int warp) {
return (char*)ncclShmemPerWarp + warp*ncclShmemScratchWarpSize();
}
#ifdef ENABLE_PROFILING
#define __insert_timestamp(line_num) do { \
if (ncclShmem.prof.count < PROFILE_NUM_ITEMS) { \
ncclShmem.prof.elem[ncclShmem.prof.count].line = line_num; \
ncclShmem.prof.elem[ncclShmem.prof.count].timeStamp = wall_clock64(); \
ncclShmem.prof.count++; \
} \
} while(0);
#else
#define __insert_timestamp(line_num)
#endif
// Copy 16-byte aligned data. You must call with at least `(bytes+15)/16` threads.
inline __device__ void copyToShmem16(int tid, void* dst, void const* src, int bytes) {
int offset = 16*tid;
if (offset < bytes) {
ulong2 *src2, *dst2;
src2 = (ulong2*)((char const*)src + offset);
dst2 = (ulong2*)((char*)dst + offset);
dst2->x = src2->x;
dst2->y = src2->y;
}
}
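// One warp can therefore move up to 16*WARP_SIZE bytes per call, which is what
// the static_asserts on ncclDevComm/ncclDevChannel/ncclWork below rely on.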
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
struct RunWorkElement {
__device__ void run(ncclWorkElem*) {
// Put NOT IMPLEMENTED behavior here.
}
};
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto>
struct RunWork {
// This __forceinline__ is necessary. The compiler was inserting a function call
// here from the LL ncclKernel.
__device__ __forceinline__ void run(ncclWork *w) {
int wid = threadIdx.x / WARP_SIZE;
ncclWorkElem* we = w->header.type == ncclWorkTypeRegColl ? &w->regElems[0].elem : &w->elems[0];
int stride = w->header.type == ncclWorkTypeRegColl ? sizeof(ncclWorkElemReg) : sizeof(ncclWorkElem);
#pragma unroll 1
while ((char*)we + stride <= (char*)(w+1) && we->isUsed) {
if (wid < we->nWarps) {
RunWorkElement<Fn, T, RedOp, Algo, Proto>().run(we);
}
we = (ncclWorkElem*)((char*)we + stride);
}
}
};
static __forceinline__ __device__ void ncclRedopPtrDeref(struct ncclWorkElem* we) {
if (we->isUsed && we->redOpArgIsPtr) {
/* redOpArg is a pointer to the scalar value, so we'll dereference it
* here so that redOpArg holds the bits of the scalar going forward.
* The tricky thing is we don't know its type T since that's encoded in
* the funcIndex. Because it would be difficult to get sizeof(T) from
* funcIndex, we'll cheat and just dereference the largest possible size
* given the alignment of the pointer. We might be reading in more bytes
* than we need but that's harmless.
*/
if (we->redOpArg%2 != 0)
we->redOpArg = *reinterpret_cast<uint8_t*>(we->redOpArg);
else if (we->redOpArg%4 != 0)
we->redOpArg = *reinterpret_cast<uint16_t*>(we->redOpArg);
else if (we->redOpArg%8 != 0)
we->redOpArg = *reinterpret_cast<uint32_t*>(we->redOpArg);
else
we->redOpArg = *reinterpret_cast<uint64_t*>(we->redOpArg);
}
}
template<ncclFunc_t Fn, typename T, typename RedOp, int Algo, int Proto, int FnIndex, bool COLLTRACE>
__forceinline__ __device__ void ncclKernel(
struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead
) {
#if defined (ENABLE_TIMELINE)
uint8_t event_type = workHead->header.funcIndex == FUNC_INDEX_P2P ? TIMELINE_EVENT_P2P_ENTRY : TIMELINE_EVENT_COLL_ENTRY;
Timeline::CollectGpuEvent(comm->gpuEventContext, event_type, workHead->header.funcIndex, comm->cpuTimestamp);
#endif
const int tid = threadIdx.x;
int x = tid;
switch (tid/WARP_SIZE) {
case 0:
if (channelMask & (1ull<<x)) {
int y = __popcll(channelMask & ((1ull<<x)-1));
if (blockIdx.x == y) ncclShmem.channelId = x;
}
if (32 < MAXCHANNELS) {
x = 32 + tid;
if (channelMask & (1ull<<x)) {
int y = __popcll(channelMask & ((1ull<<x)-1));
if (blockIdx.x == y) ncclShmem.channelId = x;
}
}
break;
case 1:
if (tid < WARP_SIZE + NCCL_MAX_GROUPS)
ncclShmem.groups[tid-WARP_SIZE].barrier = 0;
break;
case 2:
if (tid < 2*WARP_SIZE + NCCL_MAX_GROUPS*NCCL_MAX_GROUPS)
ncclShmem.groups[(tid-2*WARP_SIZE)/NCCL_MAX_GROUPS].barrier_next[(tid-2*WARP_SIZE)%NCCL_MAX_GROUPS] = 0;
break;
case 3:
/* set abort flag to 0 */
if (tid == 3*WARP_SIZE) ncclShmem.aborted = 0;
break;
default:
break;
}
__synclds(); // publish ncclShmem.channelId
// To map blockId to channelId, we need the n'th set bit of channelMask which
// is the inverse of counting the number of set bits among the first n.
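// Example: with channelMask = 0b1011, blocks 0, 1, 2 run channels 0, 1, 3.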
int channelId = ncclShmem.channelId;
if (true) {
void *dst, *src;
int bytes;
// Use first 3 warps to load comm, channel, and work into shmem
switch (tid/WARP_SIZE) {
case 0:
dst = &ncclShmem.comm;
src = comm;
bytes = sizeof(ncclDevComm);
static_assert(sizeof(ncclDevComm) <= 16*WARP_SIZE, "ncclDevComm cannot be loaded by a single warp in one insn.");
break;
case 1:
// Get address of channel without incurring indirect load from ncclDevComm::channels
dst = &ncclShmem.channel;
src = &((ncclDevCommAndChannels*)comm)->channels[channelId];
bytes = sizeof(ncclDevChannel);
static_assert(sizeof(ncclDevChannel) <= 16*WARP_SIZE, "ncclDevChannel cannot be loaded by a single warp in one insn.");
break;
case 2:
dst = &ncclShmem.work;
src = workHead + blockIdx.x;
bytes = sizeof(ncclWork);
static_assert(sizeof(ncclWork) <= 16*WARP_SIZE, "ncclWork cannot be loaded by a single warp in one insn.");
break;
default:
bytes = 0;
break;
}
copyToShmem16(tid%WARP_SIZE, dst, src, bytes);
}
#ifdef ENABLE_COLLTRACE
if (tid == 0) {
ncclShmem.collTrace = comm->collTrace + COLLTRACE_NUM_ITEMS*ncclShmem.channelId;
ncclShmem.collTraceTail = comm->collTraceTail + ncclShmem.channelId;
}
#endif
__synclds(); // publish shmem
#ifdef ENABLE_PROFILING
if (tid == 0) {
ncclShmem.prof.count = 0;
ncclShmem.prof.seq = ncclShmem.comm.devProf[blockIdx.x].seq;
}
#endif
if (tid == 0) __insert_timestamp(__LINE__);
if (COLLTRACE && tid == 0) traceKernelLaunch(ncclCollTraceKernelLaunchType);
while (true) {
// Notify host that all fifo reads are complete.
if (tid == 0 && ncclShmem.work.header.isLast && ncclShmem.work.header.inFifo) {
*ncclShmem.channel.workFifoDone = ncclShmem.work.header.doneAcks;
}
__syncwarp();
if (ncclShmem.work.header.type == ncclWorkTypeColl) {
if (tid < NCCL_MAX_WORK_ELEMENTS) ncclRedopPtrDeref(&ncclShmem.work.elems[tid]);
} else if (ncclShmem.work.header.type == ncclWorkTypeRegColl) {
if (tid < NCCL_MAX_WORK_ELEMENTS_REG) ncclRedopPtrDeref(&ncclShmem.work.regElems[tid].elem);
}
__synclds();
if (tid == 0) __insert_timestamp(__LINE__);
if (ncclShmem.work.header.funcIndex == FnIndex) {
RunWork<Fn, T, RedOp, Algo, Proto>().run(&ncclShmem.work);
} else {
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
ncclFuncs[ncclShmem.work.header.funcIndex]();
#else
#ifdef ENABLE_LL128
NCCL_CALL_FUNCTIONS<1>(ncclShmem.work.header.funcIndex);
#else
NCCL_CALL_FUNCTIONS<0>(ncclShmem.work.header.funcIndex);
#endif
#endif
}
int workIxNext = ncclShmem.work.header.workNext;
__synclds();
if (ncclShmem.work.header.isLast) break;
copyToShmem16(tid, &ncclShmem.work, workHead + workIxNext, sizeof(ncclWork));
{ // Check whether the last operation was aborted and make sure all threads exit
int aborted = tid == 0 ? *comm->abortFlag : 0;
if (__any(aborted)) { // publish ncclShmem.work
traceKernelEnd(ncclCollTraceAbortType);
break;
}
}
if (COLLTRACE && tid == 0) traceKernelLaunch(ncclCollTraceCollLaunchType);
}
if (COLLTRACE && tid == 0) traceKernelEnd(ncclCollTraceKernelEndType);
#if defined (ENABLE_TIMELINE)
event_type = workHead->header.funcIndex == FUNC_INDEX_P2P ? TIMELINE_EVENT_P2P_EXIT : TIMELINE_EVENT_COLL_EXIT;
Timeline::CollectGpuEvent(comm->gpuEventContext, event_type, workHead->header.funcIndex, comm->cpuTimestamp);
#endif
#ifdef ENABLE_PROFILING
if (ncclShmem.comm.devProf->seq < PROFILE_NUM_LAUNCHES) {
__synclds();
copyToShmem16(tid, ncclShmem.comm.devProf+MAXCHANNELS*ncclShmem.prof.seq+blockIdx.x, &ncclShmem.prof, sizeof(struct ncclProf));
if (tid == 0) ncclShmem.comm.devProf[blockIdx.x].seq++;
}
#endif
}
#ifdef ENABLE_COLLTRACE
#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \
__launch_bounds__(NCCL_MAX_NTHREADS, 1) \
__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, false>(comm, channelMask, workHead); \
} \
\
__launch_bounds__(NCCL_MAX_NTHREADS, 1) \
__global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, true>(comm, channelMask, workHead); \
}
#else
#define IMPL_COLL_KERN(func, algo, proto, devredop, type, fIndex) \
__launch_bounds__(NCCL_MAX_NTHREADS, 1) \
__global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead) { \
ncclKernel<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto, fIndex, false>(comm, channelMask, workHead); \
}
#endif
// Examples: AllReduce, RING, LL, Sum, uint8
/* Functions for aggregation case */
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
#define IMPL_COLL_FUNC(func, algo, proto, devredop, type) \
__device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \
RunWork<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem.work); \
}
#else
#define IMPL_COLL_FUNC(func, algo, proto, devredop, type) \
__device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \
RunWork<ncclFunc##func, type, Func##devredop<type>, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem.work); \
}
#endif
// Only generate inline kernels for LL
#ifdef ENABLE_LL128
#define IMPL_COLL4(func, algo, devredop, type) \
IMPL_COLL_FUNC(func, algo, LL, devredop, type) \
IMPL_COLL_FUNC(func, algo, LL128, devredop, type) \
IMPL_COLL_FUNC(func, algo, SIMPLE, devredop, type)
#else
#define IMPL_COLL4(func, algo, devredop, type) \
IMPL_COLL_FUNC(func, algo, LL, devredop, type) \
IMPL_COLL_FUNC(func, algo, SIMPLE, devredop, type)
#endif
#define IMPL_COLL3(func, devredop, type) \
IMPL_COLL4(func, TREE, devredop, type) \
IMPL_COLL4(func, RING, devredop, type) \
IMPL_COLL4(func, COLLNET_DIRECT, devredop, type) \
IMPL_COLL4(func, COLLNET_CHAIN, devredop, type) \
IMPL_COLL4(func, NVLS, devredop, type) \
IMPL_COLL4(func, NVLS_TREE, devredop, type)
#define IMPL_COLL2(func, devredop) \
IMPL_COLL3(func, devredop, int8_t) \
IMPL_COLL3(func, devredop, uint8_t) \
IMPL_COLL3(func, devredop, int32_t) \
IMPL_COLL3(func, devredop, uint32_t) \
IMPL_COLL3(func, devredop, int64_t) \
IMPL_COLL3(func, devredop, uint64_t) \
IMPL_COLL3(func, devredop, half) \
IMPL_COLL3(func, devredop, float) \
IMPL_COLL3(func, devredop, double) \
IMPL_COLL3(func, devredop, rccl_bfloat16)
#define IMPL_COLL2A(func, devredop) \
IMPL_COLL3(func, devredop, int8_t) \
IMPL_COLL3(func, devredop, uint8_t) \
IMPL_COLL3(func, devredop, int32_t) \
IMPL_COLL3(func, devredop, uint32_t) \
IMPL_COLL3(func, devredop, int64_t) \
IMPL_COLL3(func, devredop, uint64_t)
// Reduction define all functions
#define IMPL_COLL_R(func) \
IMPL_COLL2(func, Sum) \
IMPL_COLL2(func, Prod) \
IMPL_COLL2(func, Min) \
IMPL_COLL2(func, Max) \
IMPL_COLL2(func, PreMulSum) \
IMPL_COLL2A(func, SumPostDiv)
// Copy primitives only define one function for copy
#ifdef ENABLE_KERNELNAME
#define IMPL_COLL_C(func) \
IMPL_COLL3(func, Sum, int8_t);\
IMPL_COLL_KERN(func, RING, SIMPLE, Sum, int8_t, FUNC_INDEX_P2P);
#else
#define IMPL_COLL_C(func) \
IMPL_COLL3(func, Sum, int8_t);
#endif
// Point-to-point primitives only have one function/kernel.
#define IMPL_COLL_P(func) \
IMPL_COLL_FUNC(func, RING, SIMPLE, Sum, int8_t); \
IMPL_COLL_KERN(func, RING, SIMPLE, Sum, int8_t, FUNC_INDEX_P2P);
// AllToAll Pivot primitive only has one function.
#ifdef ENABLE_KERNELNAME
#define IMPL_COLL_F(func) \
IMPL_COLL_FUNC(func, RING, SIMPLE, Sum, int8_t);\
IMPL_COLL_KERN(func, RING, SIMPLE, Sum, int8_t, FUNC_INDEX_P2P);
#else
#define IMPL_COLL_F(func) \
IMPL_COLL_FUNC(func, RING, SIMPLE, Sum, int8_t);
#endif
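// NVLS kernels are only compiled for Hopper-class targets (__CUDA_ARCH__ >= 900)
// and for type/op pairs the multimem hardware path supports.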
#define NCCL_NVLS_ENABLED (__CUDA_ARCH__ >= 900 && NCCL_NVLS_SUPPORTS(NCCL_TYPE, NCCL_OP))
#endif
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_COMMON_KERNEL_H_
#define NCCL_COMMON_KERNEL_H_
#include "devcomm.h"
#include "op128.h"
#include "reduce_kernel.h"
#include <cstdio>
#include <cstdint>
#include <hip/hip_runtime.h>
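// On AMD GPUs a wavefront executes in lockstep, so __syncwarp() can safely be
// compiled out as a no-op here.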
#define __syncwarp()
#define SDMA_SPEC_DST 0x55
// Define min for ssize_t
inline __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; }
inline __device__ int loadInt(int* ptr) {
  // Atomically read the current value; adding 0 leaves it unchanged. The add
  // must be 32-bit wide: a 64-bit atomic through an int* would read past the
  // 4-byte object.
  return (int)atomicAdd((unsigned int *)ptr, 0u);
}
template<typename RedFn, typename T, int Unroll, int BytePerPack,
int MultimemSrcs, int MinSrcs, int MaxSrcs,
int MultimemDsts, int MinDsts, int MaxDsts, int PreOpSrcs,
typename IntBytes>
__device__ __forceinline__ void reduceCopyPacks(
int nThreads, int &thread,
uint64_t redArg, uint64_t *preOpArgs, bool postOp,
int nSrcs, void **srcPtrs, int nDsts, void **dstPtrs,
IntBytes &nBytesBehind, IntBytes &nBytesAhead
) {
static_assert(std::is_signed<IntBytes>::value, "IntBytes must be a signed integral type.");
//if (BytePerPack == 0) __trap();
// A hunk is the amount of contiguous data a warp consumes per loop iteration
// assuming all threads partake.
constexpr int BytePerHunk = Unroll*WARP_SIZE*BytePerPack;
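  // E.g. with Unroll=4, WARP_SIZE=64 (AMD wavefront) and BytePerPack=16,
  // a hunk is 4*64*16 = 4096 contiguous bytes per warp per iteration.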
int nWarps = nThreads/WARP_SIZE;
int warp = thread/WARP_SIZE;
int lane = thread%WARP_SIZE;
// This thread's initial position.
IntBytes threadBytesBehind = nBytesBehind + (warp*BytePerHunk + lane*BytePerPack);
IntBytes threadBytesAhead = nBytesAhead - (warp*BytePerHunk + lane*BytePerPack);
// Number of hunks to be consumed over all warps.
IntBytes nHunksAhead = nBytesAhead/(BytePerHunk + !BytePerHunk);
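  // The `x + !x` idiom keeps divisors nonzero so template instantiations with
  // BytePerHunk==0 or BytePerPack==0 (dead code paths) still compile.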
// Advance collective position.
nBytesBehind += nHunksAhead*BytePerHunk;
nBytesAhead -= nHunksAhead*BytePerHunk;
if (Unroll==1 && BytePerPack <= nBytesAhead) {
// Only Unroll=1 can do partial hunks (where not all threads partake).
nHunksAhead += 1;
nBytesBehind += nBytesAhead - (nBytesAhead%(BytePerPack + !BytePerPack));
nBytesAhead = nBytesAhead%(BytePerPack + !BytePerPack);
}
nHunksAhead -= warp;
RedFn redFn(redArg);
uintptr_t minSrcs[MinSrcs + !MinSrcs];
uintptr_t minDsts[MinDsts + !MinDsts];
#pragma unroll
for (int s=0; s < MinSrcs; s++)
minSrcs[s] = cvta_to_global(srcPtrs[s]) + threadBytesBehind;
#pragma unroll
for (int d=0; d < MinDsts; d++)
minDsts[d] = cvta_to_global(dstPtrs[d]) + threadBytesBehind;
// We dictate loop termination condition according to whether partial hunks
// can be handled or not.
while (Unroll==1 ? (BytePerPack <= threadBytesAhead) : (0 < nHunksAhead)) {
BytePack<BytePerPack> acc[Unroll];
{ RedFn preFn(0 < PreOpSrcs ? preOpArgs[0] : 0);
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
if (0 < MultimemSrcs) {
// applyLoadMultimem uses relaxed semantics for same reason we use volatile below.
acc[u] = applyLoadMultimem<RedFn, BytePerPack>(preFn, minSrcs[0]);
} else {
// Use volatile loads in case credits are polled for with volatile (instead of acquire).
acc[u] = ld_volatile_global<BytePerPack>(minSrcs[0]);
}
minSrcs[0] += WARP_SIZE*BytePerPack;
if (0 < PreOpSrcs) acc[u] = applyPreOp(preFn, acc[u]);
}
}
#pragma unroll Unroll
for (int s=1; s < MinSrcs; s++) {
BytePack<BytePerPack> tmp[Unroll];
RedFn preFn(s < PreOpSrcs ? preOpArgs[s] : 0);
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
if (s < MultimemSrcs) {
// applyLoadMultimem uses relaxed semantics for same reason we use volatile below.
acc[u] = applyLoadMultimem<RedFn, BytePerPack>(preFn, minSrcs[s]);
} else {
// Use volatile loads in case credits are polled for with volatile (instead of acquire).
tmp[u] = ld_volatile_global<BytePerPack>(minSrcs[s]);
}
minSrcs[s] += WARP_SIZE*BytePerPack;
}
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
if (s < PreOpSrcs) tmp[u] = applyPreOp(preFn, tmp[u]);
acc[u] = applyReduce(redFn, acc[u], tmp[u]);
}
}
for (int s=MinSrcs; (MinSrcs < MaxSrcs) && (s < MaxSrcs) && (s < nSrcs); s++) {
uintptr_t src = cvta_to_global(srcPtrs[s]) + threadBytesBehind;
BytePack<BytePerPack> tmp[Unroll];
RedFn preFn(s < PreOpSrcs ? preOpArgs[s] : 0);
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
// Use volatile loads in case credits are polled for with volatile (instead of acquire).
tmp[u] = ld_volatile_global<BytePerPack>(src);
src += WARP_SIZE*BytePerPack;
}
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
if (s < PreOpSrcs) tmp[u] = applyPreOp(preFn, tmp[u]);
acc[u] = applyReduce(redFn, acc[u], tmp[u]);
}
}
if (postOp) {
#pragma unroll Unroll
for (int u=0; u < Unroll; u++)
acc[u] = applyPostOp(redFn, acc[u]);
}
#pragma unroll Unroll
for (int d=0; d < MinDsts; d++) {
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
if (d < MultimemDsts) {
multimem_st_global(minDsts[d], acc[u]);
} else {
st_global<BytePerPack>(minDsts[d], acc[u]);
}
minDsts[d] += WARP_SIZE*BytePerPack;
}
}
for (int d=MinDsts; (MinDsts < MaxDsts) && (d < MaxDsts) && (d < nDsts); d++) {
uintptr_t dst = cvta_to_global(dstPtrs[d]) + threadBytesBehind;
#pragma unroll Unroll
for (int u=0; u < Unroll; u++) {
st_global<BytePerPack>(dst, acc[u]);
dst += WARP_SIZE*BytePerPack;
}
}
nWarps = nThreads/WARP_SIZE;
#pragma unroll
for (int s=0; s < MinSrcs; s++) minSrcs[s] += (nWarps-1)*BytePerHunk;
#pragma unroll
for (int d=0; d < MinDsts; d++) minDsts[d] += (nWarps-1)*BytePerHunk;
threadBytesBehind += nWarps*BytePerHunk;
threadBytesAhead -= nWarps*BytePerHunk;
nHunksAhead -= nWarps;
}
nWarps = nThreads/WARP_SIZE;
warp = thread/WARP_SIZE;
lane = thread%WARP_SIZE;
// The last loop iteration could have been partial, i.e. not taken by all
// threads. The threads that weren't included need an extra subtraction to
// make the value warp uniform.
if (Unroll==1 && nHunksAhead > 0) nHunksAhead -= nWarps;
// Rotate warps so the warp which got the least work here will be warp 0.
// This effectively assigns: warp = (warp-nHunks+nWarps)%nWarps;
warp = -nHunksAhead;
thread = warp*WARP_SIZE + lane;
}
template<int Unroll, typename RedFn, typename T,
int MultimemSrcs, int MinSrcs, int MaxSrcs,
int MultimemDsts, int MinDsts, int MaxDsts, int PreOpSrcs,
typename IntBytes>
__device__ __forceinline__ void reduceCopy(
int thread, int nThreads,
uint64_t redArg, uint64_t *preOpArgs, bool postOp,
int nSrcs, void **srcPtrs, int nDsts, void **dstPtrs,
IntBytes nElts
) {
static_assert(MultimemSrcs <= MinSrcs && MultimemDsts <= MinDsts, "Multimem pointers cannot exceed respective Min values.");
//int nWarps = nThreads/WARP_SIZE;
//int warp = thread/WARP_SIZE;
int lane = thread%WARP_SIZE;
// If a multimem src is present then our biggest pack size is limited to what
// is supported for this redfn/type.
constexpr int BigPackSize = (MultimemSrcs == 0) ? 16 : LoadMultimem_BigPackSize<RedFn>::BigPackSize;
IntBytes nBytesBehind = 0;
IntBytes nBytesAhead = nElts*sizeof(T);
#if __cpp_if_constexpr
if constexpr (BigPackSize > sizeof(T)) {
#else
if (BigPackSize > sizeof(T)) {
#endif
// Check that all pointers are BigPackSize aligned.
bool aligned = true;
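    // One lane per pointer checks alignment; the warp-wide vote below clears
    // `aligned` for every lane if any pointer fails the check.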
if (lane < nSrcs) aligned &= 0 == cvta_to_global(srcPtrs[lane]) % (BigPackSize + !BigPackSize);
if (lane < nDsts) aligned &= 0 == cvta_to_global(dstPtrs[lane]) % (BigPackSize + !BigPackSize);
aligned = !(__any(!aligned));
if (aligned) {
#if defined(__gfx90a__)
reduceCopyPacks<RedFn, T, ((MinSrcs > 1) ? 2 : Unroll), BigPackSize,
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts == SDMA_SPEC_DST ? 1 : MinDsts, MaxDsts, PreOpSrcs>
(nThreads, thread, redArg, preOpArgs, postOp,
nSrcs, srcPtrs, nDsts, dstPtrs, nBytesBehind, nBytesAhead);
#else
reduceCopyPacks<RedFn, T, Unroll*((MinSrcs == 1 && MinDsts == 1) ? 2 : 1), BigPackSize,
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts == SDMA_SPEC_DST ? 1 : MinDsts, MaxDsts, PreOpSrcs>
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
#endif
if (nBytesAhead == 0) return;
reduceCopyPacks<RedFn, T, /*Unroll=*/1, BigPackSize,
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts == SDMA_SPEC_DST ? 1 : MinDsts, MaxDsts, PreOpSrcs>
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
if (nBytesAhead == 0) return;
}
}
#if defined(__gfx90a__)
if (MinSrcs > 1) {
reduceCopyPacks<RedFn, T, Unroll/2*(16/sizeof(T))/2, sizeof(T),
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts == SDMA_SPEC_DST ? 1 : MinDsts, MaxDsts, PreOpSrcs>
(nThreads, thread, redArg, preOpArgs, postOp,
nSrcs, srcPtrs, nDsts, dstPtrs, nBytesBehind, nBytesAhead);
} else {
reduceCopyPacks<RedFn, T, Unroll*(16/sizeof(T))/2, /*BytePerPack=*/sizeof(T),
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts == SDMA_SPEC_DST ? 1 : MinDsts, MaxDsts, PreOpSrcs>
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
}
#else
reduceCopyPacks<RedFn, T, Unroll*(16/sizeof(T))/2, /*BytePerPack=*/sizeof(T),
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts == SDMA_SPEC_DST ? 1 : MinDsts, MaxDsts, PreOpSrcs>
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
#endif
if (nBytesAhead == 0) return;
reduceCopyPacks<RedFn, T, /*Unroll=*/1, /*BytePerPack=*/sizeof(T),
MultimemSrcs, MinSrcs, MaxSrcs, MultimemDsts, MinDsts == SDMA_SPEC_DST ? 1 : MinDsts, MaxDsts, PreOpSrcs>
(nThreads, /*&*/thread, redArg, preOpArgs, postOp,
nSrcs, srcPtrs, nDsts, dstPtrs, /*&*/nBytesBehind, /*&*/nBytesAhead);
}
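// Typical call shape (illustrative; mirrors the one-rank reduce later in this
// commit): one source, one destination, pre-op applied to the source, no multimem:
//   reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/1>
//     (tid, nThreads, redOpArg, &redOpArg, /*postOp=*/true, 1, &vsrc, 1, &vdst, nElts);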
#endif // NCCL_COMMON_KERNEL_H_
/*************************************************************************
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "devcomm.h"
#include "collectives.h"
#include "common.h"
__shared__ ncclShmemData ncclShmem;
#if __CUDA_ARCH__ < 700
__shared__ ulong2 ncclShmemPerWarp[ncclShmemScratchWarpSize()*(NCCL_MAX_NTHREADS/WARP_SIZE)/sizeof(ulong2)];
#endif
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#else
#define NCCL_FUNC5(func, algo, devredop, type, nullify) \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL, devredop, type)), \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, LL128, devredop, type)), \
MACRO_IF(nullify, nullptr, NCCL_FUNC_NAME(func, algo, SIMPLE, devredop, type))
#define NCCL_FUNC4(func, devredop, type, nullify) \
NCCL_FUNC5(func, TREE, devredop, type, nullify), \
NCCL_FUNC5(func, RING, devredop, type, nullify), \
NCCL_FUNC5(func, COLLNET_DIRECT, devredop, type, nullify), \
NCCL_FUNC5(func, COLLNET_CHAIN, devredop, type, nullify), \
NCCL_FUNC5(func, NVLS, devredop, type, nullify), \
NCCL_FUNC5(func, NVLS_TREE, devredop, type, nullify)
#if defined(__CUDA_BF16_TYPES_EXIST__)
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(func, devredop, nullForFloat) \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, uint8_t, 0), \
NCCL_FUNC4(func, devredop, int32_t, 0), \
NCCL_FUNC4(func, devredop, uint32_t, 0), \
NCCL_FUNC4(func, devredop, int64_t, 0), \
NCCL_FUNC4(func, devredop, uint64_t, 0), \
NCCL_FUNC4(func, devredop, half, nullForFloat), \
NCCL_FUNC4(func, devredop, float, nullForFloat), \
NCCL_FUNC4(func, devredop, double, nullForFloat), \
NCCL_FUNC4(func, devredop, __nv_bfloat16, nullForFloat)
#define NCCL_FUNCS3B(func, devredop) \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0)
#else
// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(func, devredop, nullForFloat) \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, uint8_t, 0), \
NCCL_FUNC4(func, devredop, int32_t, 0), \
NCCL_FUNC4(func, devredop, uint32_t, 0), \
NCCL_FUNC4(func, devredop, int64_t, 0), \
NCCL_FUNC4(func, devredop, uint64_t, 0), \
NCCL_FUNC4(func, devredop, half, nullForFloat), \
NCCL_FUNC4(func, devredop, float, nullForFloat), \
NCCL_FUNC4(func, devredop, double, nullForFloat)
#define NCCL_FUNCS3B(func, devredop) \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0), \
NCCL_FUNC4(func, devredop, int8_t, 0)
#endif
// Must be consistent with ncclRedOp_t
#define NCCL_FUNCS2A(func) \
NCCL_FUNCS3A(func, Sum, /*nullForFloat=*/0), \
NCCL_FUNCS3A(func, Prod, /*nullForFloat=*/0), \
NCCL_FUNCS3A(func, Max, /*nullForFloat=*/0), \
NCCL_FUNCS3A(func, Min, /*nullForFloat=*/0), \
NCCL_FUNCS3A(func, PreMulSum, /*nullForFloat=*/0), \
NCCL_FUNCS3A(func, SumPostDiv, /*nullForFloat=*/1)
#define NCCL_FUNCS2B(func) \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum), \
NCCL_FUNCS3B(func, Sum)
// Must be consistent with the ncclFuncSet enum
__device__ ncclKern_t ncclFuncs[1+ncclNumTypes+NCCL_NUM_FUNCTIONS*ncclNumDevRedOps*ncclNumTypes*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS] = {
// Don't try to initialize the host shadow copy of this device-side global
// variable. There is no host pointer to a device-side function, which
// confuses clang. This will be fixed in the next clang release.
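// Table layout: slot 0 holds the SendRecv function, the next ncclNumTypes
// slots hold the one-rank PreMulSum reductions (one per data type), and the
// remainder is indexed by (function, devredop, type, algorithm, protocol).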
#if __CUDA_ARCH__
NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, half),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, float),
NCCL_ONERANK_REDUCE_NAME(PreMulSum, double),
#if defined(__CUDA_BF16_TYPES_EXIST__)
NCCL_ONERANK_REDUCE_NAME(PreMulSum, __nv_bfloat16),
#endif
NCCL_FUNCS2B(Broadcast),
NCCL_FUNCS2A(Reduce),
NCCL_FUNCS2B(AllGather),
NCCL_FUNCS2A(ReduceScatter),
NCCL_FUNCS2A(AllReduce)
#endif
};
#endif
// Workaround for https://reviews.llvm.org/D55580
__device__ void ncclWorkaroundClangD55580() {}
#!/bin/bash
#
# Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
dir=$1
datatypes="i8 u8 i32 u32 i64 u64 f16 f32 f64"
if [ "$CUDA_MAJOR" -ge 11 ]
then
datatypes+=" bf16"
fi
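# Example of the rules this script emits for one (collective, op, type) triple,
# assuming dir=build (illustrative, not verbatim output):
#   build/all_reduce_sum_f32.cu : all_reduce.cu
#           cp $< $@
#   build/all_reduce_sum_f32.o : build/all_reduce_sum_f32.cu all_reduce.cu build/all_reduce.dep
#           ${NVCC} -DNCCL_OP=0 -DNCCL_TYPE=7 ${NVCUFLAGS} -dc $< -o $@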
targets="GENOBJS := \\\\\n"
for base in sendrecv all_reduce all_gather broadcast reduce reduce_scatter; do
opn=0
for op in sum prod min max premulsum sumpostdiv; do
dtn=0
# Order must match that of the ncclDataType_t enum
for dt in ${datatypes}; do
# Generate a unique filename for each compilation unit,
# otherwise the __nv_module_id may conflict at link time
echo "${dir}/${base}_${op}_${dt}.cu : ${base}.cu"
echo " @printf \"Copying %-35s > %s\\\\n\" \$< \$@"
echo " cp \$< \$@"
echo ""
# Compile the file
echo "${dir}/${base}_${op}_${dt}.o : ${dir}/${base}_${op}_${dt}.cu ${base}.cu ${dir}/${base}.dep"
echo " @printf \"Compiling %-35s > %s\\\\n\" ${base}.cu ${dir}/${base}_${op}_${dt}.o"
echo " mkdir -p ${dir}"
echo " \${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc \$< -o \$@"
echo ""
targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n"
dtn=$(($dtn + 1))
done
opn=$(($opn + 1))
done
done
echo -e "$targets"
/*************************************************************************
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef MSSCLKERNELIMPL_H
#define MSSCLKERNELIMPL_H
#include "devcomm.h"
#include "primitives.h"
#include "collectives.h"
#include "msccl/msccl_struct.h"
#include "msccl/msccl_kernel.h"
extern __shared__ struct mscclShmemData mscclShmem;
#define MSCCL_MAX_ITER 65536
// Flags are a 3-tuple (workindex, gridoffset_iter, step) compared in lexicographic order: a threadblock is ahead of another iff its flag is greater.
#define COMPUTE_FLAG(__WORKINDEX__,__GRIDOFFSET_ITER__,__STEP__) \
MSCCL_MAX_ITER*MSCCL_MAX_NUM_STEPS*(uint64_t)__WORKINDEX__ + ((uint64_t)__GRIDOFFSET_ITER__ * MSCCL_MAX_NUM_STEPS + (uint64_t)__STEP__)
#define GET_WORKINDEX_FROM_FLAG(__FLAG__) \
(__FLAG__) / (MSCCL_MAX_ITER*MSCCL_MAX_NUM_STEPS)
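// Worked example: writing S for MSCCL_MAX_NUM_STEPS,
//   COMPUTE_FLAG(w, i, s) = ((w*65536 + i)*S + s),
// so comparing two flags compares (workindex, gridoffset_iter, step)
// lexicographically, provided i < MSCCL_MAX_ITER and s < S.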
#ifdef ENABLE_COLLTRACE
#define INC_COLL_TRACE \
uint32_t pos = atomicAdd(&ncclShmem.collTraceTail->tail, 1)%COLLTRACE_NUM_ITEMS; \
struct ncclCollTrace* collTrace = ncclShmem.collTrace+pos; \
collTrace->timeStamp = wall_clock64(); \
collTrace->bid = blockIdx.x;
// TODO: switch to atomicInc after llvm crash is fixed
// uint32_t pos = atomicInc(&ncclShmem.collTraceTail->tail, COLLTRACE_NUM_ITEMS)
#define traceData(data2, data4, data8_0, data8_1) { \
INC_COLL_TRACE \
collTrace->funcIndex = data2; \
collTrace->data_0 = data4; \
collTrace->opCount = data8_0; \
collTrace->data_1 = data8_1; \
collTrace->type = ncclCollTraceDataType; \
}
#else
#define traceData(data2, data4, data8_0, data8_1)
#endif
// a copy of the volatile load/store from prims_ll
template<typename U>
__device__ static U load(U *src) {
union {
U elt;
uint8_t u1;
uint16_t u2;
uint32_t u4;
uint64_t u8;
};
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
if(sizeof(U) == 1)
u1 = __builtin_nontemporal_load((uint8_t*)src);
else if(sizeof(U) == 2)
u2 = __builtin_nontemporal_load((uint16_t*)src);
else if(sizeof(U) == 4)
u4 = __builtin_nontemporal_load((uint32_t*)src);
else
u8 = __builtin_nontemporal_load((uint64_t*)src);
#else
if(sizeof(U) == 1)
asm("ld.volatile.global.b8 %0,[%1];" : "=r"(u4) : "l"(src));
else if(sizeof(U) == 2)
asm("ld.volatile.global.b16 %0,[%1];" : "=h"(u2) : "l"(src));
else if(sizeof(U) == 4)
asm("ld.volatile.global.b32 %0,[%1];" : "=r"(u4) : "l"(src));
else
asm("ld.volatile.global.b64 %0,[%1];" : "=l"(u8) : "l"(src));
#endif
return elt;
}
template<typename U>
__device__ static void store(U *dst, U val) {
union {
U elt;
uint8_t u1;
uint16_t u2;
uint32_t u4;
uint64_t u8;
};
elt = val;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
if(sizeof(U) == 1)
__builtin_nontemporal_store(u1, (uint8_t*)dst);
else if(sizeof(U) == 2)
__builtin_nontemporal_store(u2, (uint16_t*)dst);
else if(sizeof(U) == 4)
__builtin_nontemporal_store(u4, (uint32_t*)dst);
else
__builtin_nontemporal_store(u8, (uint64_t*)dst);
#else
if(sizeof(U) == 1)
asm("st.volatile.global.b8 [%0],%1;" :: "l"(dst), "r"(u4));
else if(sizeof(U) == 2)
asm("st.volatile.global.b16 [%0],%1;" :: "l"(dst), "h"(u2));
else if(sizeof(U) == 4)
asm("st.volatile.global.b32 [%0],%1;" :: "l"(dst), "r"(u4));
else
asm("st.volatile.global.b64 [%0],%1;" :: "l"(dst), "l"(u8));
#endif
}
inline __device__ static void barrier(int nthreads) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
assert(nthreads == NCCL_MAX_NTHREADS);
__asm__ __volatile__("s_waitcnt vmcnt(0) lgkmcnt(0)\ns_barrier");
#else
asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15));
#endif
}
// Copy 4-byte-aligned data with a single warp: each lane copies one 32-bit
// word per iteration, striding by the warp width until `bytes` are consumed.
inline __device__ static void copyToShmem8(int tid, void* dst, void const* src, int bytes) {
  int offset = sizeof(uint32_t) * tid;
  while (offset < bytes) {
    uint32_t *src2 = (uint32_t*)((char const*)src + offset);
    uint32_t *dst2 = (uint32_t*)((char*)dst + offset);
    *dst2 = *src2;
    offset += WARP_SIZE*sizeof(uint32_t);
  }
}
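// Usage sketch: call with one warp per structure (lane id as tid), e.g.
//   copyToShmem8(tid%WARP_SIZE, &ncclShmem.comm, comm, sizeof(ncclDevComm));
// as done by the warp-switch in mscclRunInterpreter below.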
__device__ __forceinline__ static void threadBlockCopy(
uint32_t *dst, uint32_t const *src, uint64_t size, int tid, int nthreads) {
for (int i = tid; i < size; i += nthreads) {
dst[i] = src[i];
}
}
#define MSCCL_REDUCE_UNROLL_LOOP_A(numloops) \
for (int r = 0; r < numloops; r++) { \
srcOffset = srcBaseOffset + (ssize_t)mscclShmem.mscclTB.reductionSrcOffsets[t->reductionPointer+r] * sizePerMscclChunk; \
reduceInput = load(srcPointer + srcOffset); \
o = applyReduce(redFn, reduceInput, o); \
}
#define MSCCL_REDUCE_UNROLL_LOOP_B(numloops) \
for (int r = 0; r < numloops; r++) { \
srcOffset = srcBaseOffset + (ssize_t)mscclShmem.mscclTB.reductionSrcOffsets[t->reductionPointer+r] * sizePerMscclChunk; \
srcs[r] = srcPointer + srcOffset; \
}
template<typename T, typename RedOp, typename Proto, bool fullOps>
__device__ __forceinline__ void mscclRunInterpreter(
struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) {
const int tid = threadIdx.x;
const int bid = blockIdx.x;
const int nthreads = NCCL_MAX_NTHREADS;
#if defined(ENABLE_NPKIT)
uint64_t timestamp_entry = 0;
if (tid == 0) {
timestamp_entry = NPKIT_GET_GPU_TIMESTAMP();
}
#endif
// initialize mscclShmem.mscclTB
threadBlockCopy(
(uint32_t *)&mscclShmem.mscclTB, (uint32_t *)(algo->mscclTBs + bid),
sizeof(struct mscclThreadBlock) / sizeof(uint32_t), tid, nthreads);
__synclds(); // publish mscclShmem.mscclTB.channelId
// initialize ncclShmem and mscclShmem.work
int channelId = mscclShmem.mscclTB.channelId;
{
void *dst, *src;
int bytes = 0;
// Use first 3 warps to load comm, channel, and work into shmem
switch (tid/WARP_SIZE) {
case 0:
dst = &ncclShmem.comm;
src = comm;
bytes = sizeof(ncclDevComm);
break;
case 1:
// Get address of channel without incurring indirect load from ncclDevComm::channels
dst = &ncclShmem.channel;
src = &((ncclDevCommAndChannels*)comm)->channels[channelId];
bytes = sizeof(ncclDevChannel);
break;
case 2:
dst = &mscclShmem.work;
src = work + blockIdx.x;
bytes = sizeof(mscclWork);
break;
case 3:
/* set abort flag to 0 */
if (tid%WARP_SIZE == 0) ncclShmem.aborted = 0;
#ifdef ENABLE_COLLTRACE
else if (tid%WARP_SIZE == 1) ncclShmem.collTrace = comm->collTrace + COLLTRACE_NUM_ITEMS*channelId;
else if (tid%WARP_SIZE == 2) ncclShmem.collTraceTail = comm->collTraceTail + channelId;
#endif
break;
default:
break;
}
copyToShmem8(tid%WARP_SIZE, dst, src, bytes);
}
#if defined(ENABLE_NPKIT)
int npKitCtxIdx = bid;
int xcc_id = 0;
if (tid == 0) {
ncclShmem.event_buffer_head = 0;
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (xcc_id));
#endif
}
#endif
__synclds(); // publish shmem
if (fullOps && tid == 0) {
traceData(__LINE__, mscclShmem.work.fnIndex, (uint64_t)mscclShmem.work.sendBuff, 0);
}
if (tid == 0)
*mscclShmem.work.workFifoDone = mscclShmem.work.workFifoDoneAck;
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (tid == 0) {
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
NpKit::CollectGpuEventLDS(NPKIT_EVENT_TIME_SYNC_CPU, 0, xcc_id, *cpuTimestamp);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (tid == 0) {
NpKit::CollectGpuEventLDS(NPKIT_EVENT_TIME_SYNC_GPU, 0, xcc_id, NPKIT_GET_GPU_TIMESTAMP());
}
#endif
// User pointers for primitives
T* thisInput = (T*)mscclShmem.work.sendBuff;
T* thisOutput = (T*)mscclShmem.work.recvBuff;
T* thisScratch = (T*)mscclShmem.work.scratchBuffer;
int recvPeer = mscclShmem.mscclTB.recvPeer;
int sendPeer = mscclShmem.mscclTB.sendPeer;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? MSCCL_CHUNKSTEPS : 1));
int minChunkSize;
if (Proto::Id == NCCL_PROTO_LL)
minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T));
if (Proto::Id == NCCL_PROTO_LL128) {
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
minChunkSize = nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2;
}
RedOp redFn(mscclShmem.work.redOpArg);
Primitives<T, RedOp, FanAsymmetric<1,1>, 1, Proto, 0> prims
(tid, nthreads, &recvPeer, &sendPeer, thisInput, thisOutput, mscclShmem.work.redOpArg);
#if defined(ENABLE_NPKIT)
if (tid == 0) {
prims.npKitCtxIdx = npKitCtxIdx;
}
#endif
const ssize_t sizePerMscclChunk = mscclShmem.work.sizePerMscclChunk;
uint32_t maxAllowedCount = mscclShmem.work.maxAllowedCount;
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RUN_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RUN_ENTRY, mscclShmem.work.sizePerMscclChunk*mscclShmem.work.nChunksPerLoop, xcc_id, timestamp_entry);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_INIT_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_INIT_ENTRY, 0, xcc_id, timestamp_entry);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_INIT_EXIT)
if (tid == 0) {
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_INIT_EXIT, 0, xcc_id, NPKIT_GET_GPU_TIMESTAMP());
}
#endif
  // MSCCL flags all start out at 0. workIndex is folded into each flag so that
  // different work items synchronize on distinct values. This still needs more
  // work: once work wraps around the queue, a flag may hold a stale value from
  // a prior launch; to be fixed in a subsequent version.
const int64_t workIndex = mscclShmem.work.workIndex;
volatile struct mscclFlag* mscclFlags = mscclShmem.work.syncFlags;
for (ssize_t gridOffset = 0, iter = 0; gridOffset < sizePerMscclChunk; gridOffset += chunkSize, iter++) {
ssize_t realChunkSize;
if (Proto::Id == NCCL_PROTO_SIMPLE) {
realChunkSize = min(chunkSize, sizePerMscclChunk-gridOffset);
realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
}
else
realChunkSize = min(chunkSize, divUp(sizePerMscclChunk-gridOffset, minChunkSize)*minChunkSize);
realChunkSize = int(realChunkSize);
int nelem = min(realChunkSize, sizePerMscclChunk-gridOffset);
ssize_t srcOffset, dstOffset;
T *srcPointer, *dstPointer;
int step = 0;
for (int i = 0; i < mscclShmem.mscclTB.nSteps; i++){
struct mscclTransmission* t = &mscclShmem.mscclTB.transmissions[i];
// first wait if there is a dependence
int16_t numDependencies = t->numDependencies;
if (numDependencies > 0){
if (tid < numDependencies) {
int16_t dependentPointer = t->dependencePointer;
int8_t dependentBid = mscclShmem.mscclTB.dependentBid[dependentPointer+tid];
int16_t dependentStep = mscclShmem.mscclTB.dependentStep[dependentPointer+tid];
uint64_t goalFlag = COMPUTE_FLAG(workIndex, iter, dependentStep);
while (true){
uint64_t curFlag = __atomic_load_n(&(mscclFlags + dependentBid)->flag, __ATOMIC_RELAXED);
if (curFlag >= goalFlag && GET_WORKINDEX_FROM_FLAG(curFlag) == workIndex) break;
}
}
step += numDependencies-1;
barrier(nthreads);
}
srcPointer = (t->srcBuffer == MSCCL_INPUT_BUFFER) ? thisInput : ((t->srcBuffer == MSCCL_OUTPUT_BUFFER) ? thisOutput : thisScratch);
dstPointer = (t->dstBuffer == MSCCL_INPUT_BUFFER) ? thisInput : ((t->dstBuffer == MSCCL_OUTPUT_BUFFER) ? thisOutput : thisScratch);
prims.setDataPtrs(srcPointer, dstPointer);
int count = t->count;
for (int c = 0; c < count; c += maxAllowedCount) {
srcOffset = gridOffset + (ssize_t) (t->srcOffset+c) * sizePerMscclChunk;
dstOffset = gridOffset + (ssize_t) (t->dstOffset+c) * sizePerMscclChunk;
int thisCount = min(maxAllowedCount, count - c);
int thisNelem = nelem * thisCount;
if (t->type == MSCCL_SEND) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_SEND_ENTRY, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP());
}
#endif
prims.send(srcOffset, thisNelem); // LL.send is the only situation where there is no barrier at the end.
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_SEND_EXIT, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP());
}
#endif
}
else if (t->type == MSCCL_RECV) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RECV_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RECV_ENTRY, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP());
}
#endif
prims.recv(dstOffset, thisNelem);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RECV_EXIT)
if (tid == 0) {
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RECV_EXIT, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP());
}
#endif
}
else if (t->type == MSCCL_REDUCE) {
int numReductions = t->numReductions;
int currIdx = tid;
#if defined(__gfx942__)
if (Proto::Id == NCCL_PROTO_LL) {
#else
if (thisNelem < nthreads) {
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_REDUCE_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_REDUCE_ENTRY, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP());
}
#endif
#if defined(__gfx942__)
while (currIdx < thisNelem) {
#else
if (currIdx < thisNelem) {
#endif
dstOffset = gridOffset + (ssize_t) (t->dstOffset+c) * sizePerMscclChunk;
T* dstIndex = dstPointer + dstOffset + currIdx;
T reduceInput;
T o = load(dstIndex);
ssize_t srcBaseOffset = gridOffset + (ssize_t)c * sizePerMscclChunk + currIdx;
switch (numReductions) {
case 7:
#pragma unroll
MSCCL_REDUCE_UNROLL_LOOP_A(7);
break;
#if defined(__gfx90a__)
case 15:
#pragma unroll
MSCCL_REDUCE_UNROLL_LOOP_A(15);
break;
#endif
default:
MSCCL_REDUCE_UNROLL_LOOP_A(numReductions);
break;
}
store(dstIndex, o);
#if defined(__gfx942__)
currIdx += nthreads;
#endif
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_REDUCE_EXIT)
if (tid == 0) {
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_REDUCE_EXIT, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP());
}
#endif
barrier(nthreads);
} else {
T* srcs[MSCCL_MAX_REDUCE_FUSION+1]; // +1 is for SIMPLE protocol as dst is added in the list of srcs
dstOffset = gridOffset + (ssize_t) (t->dstOffset+c) * sizePerMscclChunk;
T* dst = dstPointer + dstOffset;
ssize_t srcBaseOffset = gridOffset + (ssize_t)c * sizePerMscclChunk;
switch (numReductions) {
case 7:
#pragma unroll
MSCCL_REDUCE_UNROLL_LOOP_B(7);
break;
#if defined(__gfx90a__)
case 15:
#pragma unroll
MSCCL_REDUCE_UNROLL_LOOP_B(15);
break;
#endif
default:
MSCCL_REDUCE_UNROLL_LOOP_B(numReductions);
break;
}
prims.reduce(srcs, numReductions, &dst, 1, thisNelem);
}
if (c == 0) step += (numReductions-1); // only advance step once!
} else if (fullOps && t->type == MSCCL_RECV_COPY_SEND)
prims.recvCopySend(dstOffset, thisNelem);
else if (fullOps && t->type == MSCCL_RECV_REDUCE_SEND)
prims.recvReduceSend(srcOffset, thisNelem);
else if (fullOps && t->type == MSCCL_RECV_REDUCE_COPY_SEND)
prims.recvReduceCopySend(srcOffset, dstOffset, thisNelem);
else if (fullOps && t->type == MSCCL_RECV_REDUCE_COPY) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_ENTRY, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP());
}
#endif
prims.recvReduceCopy(srcOffset, dstOffset, thisNelem);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT)
if (tid == 0) {
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT, thisNelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP());
}
#endif
}
else if (fullOps && t->type == MSCCL_LOCAL_COPY)
prims.localCopy(srcPointer+srcOffset, dstPointer+dstOffset, thisNelem);
else
return;
}
if (t->hasDependence && tid == nthreads-1)
__atomic_store_n(&mscclFlags[bid].flag, (uint64_t) COMPUTE_FLAG(workIndex, iter, step), __ATOMIC_RELAXED);
step++;
}
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_RUN_EXIT)
if (tid == 0) {
NpKit::CollectGpuEventLDS(NPKIT_EVENT_MSCCL_RUN_EXIT, mscclShmem.work.sizePerMscclChunk*mscclShmem.work.nChunksPerLoop, xcc_id, NPKIT_GET_GPU_TIMESTAMP());
}
#endif
#if defined(ENABLE_NPKIT)
__synclds();
NpKitEventCollectContext* ctx = ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx;
copyToShmem16(tid, ctx->event_buffer+ctx->event_buffer_head, ncclShmem.event_buffer, sizeof(NpKitEvent)*ncclShmem.event_buffer_head);
if (tid == 0) ctx->event_buffer_head += ncclShmem.event_buffer_head;
#endif
if (fullOps && tid == 0) {
traceData(__LINE__, mscclShmem.work.fnIndex, (uint64_t)mscclShmem.work.sendBuff, 0);
}
}
#define MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, type, fullOps) \
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, LL, fullOps)(struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
mscclRunInterpreter<type, Func##devredop<type>, ProtoLL, fullOps>(comm, algo, work); \
} \
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, LL128, fullOps)(struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
mscclRunInterpreter<type, Func##devredop<type>, ProtoLL128, fullOps>(comm, algo, work); \
} \
__global__ void MSCCL_KERNEL_ENTRY_NAME(devredop, type, Simple, fullOps)(struct ncclDevComm* comm, struct mscclAlgo* algo, struct mscclWork* work) { \
mscclRunInterpreter<type, Func##devredop<type>, ProtoSimple<MSCCL_CHUNKSTEPS/MSCCL_SLICESTEPS, MSCCL_SLICESTEPS>, fullOps>(comm, algo, work); \
}
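// Illustrative expansion: MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(Sum, float, false)
// defines three __global__ entry points (LL, LL128, Simple); the LL one simply runs
//   mscclRunInterpreter<float, FuncSum<float>, ProtoLL, false>(comm, algo, work);
// The exact kernel symbol comes from MSCCL_KERNEL_ENTRY_NAME, defined elsewhere.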
#define MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP(devredop, fullOps) \
  MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int8_t, fullOps) \
  MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint8_t, fullOps) \
  MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int32_t, fullOps) \
  MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint32_t, fullOps) \
  MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int64_t, fullOps) \
  MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint64_t, fullOps) \
  MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, half, fullOps) \
  MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, float, fullOps) \
  MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, double, fullOps) \
  MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, rccl_bfloat16, fullOps)
#define MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_NOFLOAT(devredop, fullOps) \
  MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int8_t, fullOps) \
  MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint8_t, fullOps) \
  MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int32_t, fullOps) \
  MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint32_t, fullOps) \
  MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, int64_t, fullOps) \
  MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(devredop, uint64_t, fullOps)
#define MSCCL_IMPL_KERNEL_ENTRY_FUNC() \
MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP(Sum, false) \
MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP(Prod, false) \
MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP(Min, false) \
MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP(Max, false) \
MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP(PreMulSum, false) \
MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_NOFLOAT(SumPostDiv, false)
#endif
/*************************************************************************
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "devcomm.h"
#include "collectives.h"
#include "common_kernel.h"
#include "common.h"
namespace {
template<typename T, typename RedOp>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void oneRankReduce() {
#else
__device__ __attribute__((noinline)) void oneRankReduce() {
#endif
ncclWork *w = &ncclShmem.work;
int tid = threadIdx.x;
int tn = blockDim.x;
#pragma unroll 1
for(int e=0; e < NCCL_MAX_WORK_ELEMENTS && w->elems[e].isUsed; e++) {
ncclWorkElem *we = &w->elems[e];
intptr_t eltN = we->count;
int bid = we->bid;
int bn = we->nChannels;
T const *src = (T const*)we->sendbuff;
T *dst = (T*)we->recvbuff;
// each block/channel gets a roughly equal segment of 16 byte packs
constexpr int EltPerPack = 16/sizeof(T);
intptr_t packN = (eltN + EltPerPack-1) - (eltN + EltPerPack-1)%EltPerPack;
intptr_t i0 = (bid+0)*(packN/bn) + (bid+0 < packN%bn ? bid+0 : packN%bn);
intptr_t i1 = (bid+1)*(packN/bn) + (bid+1 < packN%bn ? bid+1 : packN%bn);
i0 *= EltPerPack;
i0 = i0 < eltN ? i0 : eltN;
i1 *= EltPerPack;
i1 = i1 < eltN ? i1 : eltN;
src += i0;
dst += i0;
void *vsrc = (void*)src;
void *vdst = (void*)dst;
reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/1>
(tid, tn, we->redOpArg, &(we->redOpArg), true, 1, &vsrc, 1, &vdst, i1-i0);
}
}
}
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
#define INSTANTIATE(devredop, type) \
__device__ void NCCL_ONERANK_REDUCE_NAME(devredop, type)() { \
oneRankReduce<type, Func##devredop<type>>(); \
}
#else
#define INSTANTIATE(devredop, type) \
__device__ __attribute__((noinline)) void NCCL_ONERANK_REDUCE_NAME(devredop, type)() { \
oneRankReduce<type, Func##devredop<type>>(); \
}
#endif
INSTANTIATE(PreMulSum, int8_t)
INSTANTIATE(PreMulSum, uint8_t)
INSTANTIATE(PreMulSum, int32_t)
INSTANTIATE(PreMulSum, uint32_t)
INSTANTIATE(PreMulSum, int64_t)
INSTANTIATE(PreMulSum, uint64_t)
INSTANTIATE(PreMulSum, half)
#if defined(RCCL_BFLOAT16)
INSTANTIATE(PreMulSum, rccl_bfloat16)
#endif
INSTANTIATE(PreMulSum, float)
INSTANTIATE(PreMulSum, double)