Commit 7dc4e964 authored by wanghan

Initial commit: RCCL auto-tuning project
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef OP128_H_
#define OP128_H_
#include <type_traits>
inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
v0 = __builtin_nontemporal_load(ptr);
v1 = __builtin_nontemporal_load(ptr+1);
}
inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) {
__builtin_nontemporal_store(v0, ptr);
__builtin_nontemporal_store(v1, ptr+1);
}
inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) {
return (uint64_t*)shmemGenericPtr;
}
inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) {
v0 = *(shmemAsmPtr);
v1 = *(shmemAsmPtr+1);
}
inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) {
*(shmemAsmPtr) = v0;
*(shmemAsmPtr+1) = v1;
}
template<typename T>
inline __device__ void loadShmemMisaligned128(T *ptr, uint64_t &v0, uint64_t &v1) {
union {
uint32_t tmp4[4];
uint64_t tmp8[2];
};
if(sizeof(T) < 4) {
uint32_t *ptr4 = reinterpret_cast<uint32_t*>(reinterpret_cast<uintptr_t>(ptr) & -uintptr_t(4));
#pragma unroll
for(int e=0; e < 4; e++) {
// Produce 4 bytes of the sub-register type by reading two 4-byte-aligned
// values and funnel-shifting them into place.
uint32_t lo, hi;
lo = __builtin_nontemporal_load(ptr4+e+0);
hi = __builtin_nontemporal_load(ptr4+e+1);
tmp4[e] = __funnelshift_r(lo, hi, 8*(int(reinterpret_cast<uintptr_t>(ptr))%4));
}
}
else if(sizeof(T) == 4) {
#pragma unroll
for(int e=0; e < 4; e++)
tmp4[e] = __builtin_nontemporal_load(ptr+e);
}
else /*sizeof(T)==8*/ {
#pragma unroll
for(int e=0; e < 2; e++)
tmp8[e] = __builtin_nontemporal_load(ptr+e);
}
v0 = tmp8[0];
v1 = tmp8[1];
}
template<typename T>
__device__ __forceinline__ uint32_t cvta_to_shared(T* ptr) {
return (uint32_t)(uint64_t)(ptr);
}
template<typename T>
__device__ __forceinline__ uintptr_t cvta_to_global(T* ptr) {
return (uintptr_t)(ptr);
}
template<typename T>
__device__ __forceinline__ T* cvta_from_shared(uint32_t shptr) {
return (T*)shptr;
}
template<typename T>
__device__ __forceinline__ T* cvta_from_global(uintptr_t gptr) {
return (T*)gptr;
}
////////////////////////////////////////////////////////////////////////////////
// BytePack<Size>: struct of bytes.
template<int Size>
union BytePack;
template<>
union BytePack<0> {};
template<>
union BytePack<1> {
uint8_t u8, native;
};
template<>
union BytePack<2> {
BytePack<1> half[2];
uint8_t u8[2];
uint16_t u16, native;
};
template<>
union BytePack<4> {
BytePack<2> half[2];
uint8_t u8[4];
uint16_t u16[2];
uint32_t u32, native;
};
template<>
union BytePack<8> {
BytePack<4> half[2];
uint8_t u8[8];
uint16_t u16[4];
uint32_t u32[2];
uint64_t u64, native;
};
template<>
union alignas(16) BytePack<16> {
BytePack<8> half[2];
uint8_t u8[16];
uint16_t u16[8];
uint32_t u32[4];
uint64_t u64[2];
ulong2 ul2, native;
#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
inline __device__ BytePack<16>& operator=(BytePack<16> other) {
u64[0] = other.u64[0];
u64[1] = other.u64[1];
return *this;
}
#endif
};
template<typename T>
struct BytePackOf {
static constexpr int Size = sizeof(T);
using Pack = BytePack<Size>;
};
template<>
struct BytePackOf<BytePack<0>> {
static constexpr int Size = 0;
using Pack = BytePack<0>;
};
template<typename T>
__device__ __forceinline__ typename BytePackOf<T>::Pack toPack(T value) {
union { typename BytePackOf<T>::Pack p; T v; };
v = value;
return p;
}
template<typename T>
__device__ __forceinline__ T fromPack(typename BytePackOf<T>::Pack pack) {
union { typename BytePackOf<T>::Pack p; T v; };
p = pack;
return v;
}
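// Usage sketch (illustrative helper, not referenced elsewhere): toPack/fromPack
// reinterpret a value's bytes without changing them, so a round trip through the
// pack representation reconstructs the original value.
template<typename T>
__device__ __forceinline__ T packRoundTrip(T x) {
  typename BytePackOf<T>::Pack p = toPack<T>(x);  // p holds the raw bytes of x
  return fromPack<T>(p);                          // rebuilds the original value from those bytes
}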
////////////////////////////////////////////////////////////////////////////////
// Load/store of BytePack<?> using integral addresses.
template<int Size> __device__ BytePack<Size> ld_global(uintptr_t addr);
template<int Size> __device__ BytePack<Size> ld_volatile_global(uintptr_t addr);
//template<int Size> __device__ BytePack<Size> ld_shared(uint32_t addr);
//template<int Size> __device__ BytePack<Size> ld_volatile_shared(uint32_t addr);
template<int Size> __device__ void st_global(uintptr_t addr, BytePack<Size> value);
//template<int Size> __device__ void st_shared(uint32_t addr, BytePack<Size> value);
template<> __device__ __forceinline__ BytePack<0> ld_global<0>(uintptr_t addr) { return {}; }
template<> __device__ __forceinline__ BytePack<0> ld_volatile_global<0>(uintptr_t addr) { return {}; }
//template<> __device__ __forceinline__ BytePack<0> ld_shared<0>(uint32_t addr) { return {}; }
//template<> __device__ __forceinline__ BytePack<0> ld_volatile_shared<0>(uint32_t addr) { return {}; }
template<> __device__ __forceinline__ void st_global<0>(uintptr_t addr, BytePack<0> value) {}
//template<> __device__ __forceinline__ void st_shared<0>(uint32_t addr, BytePack<0> value) {}
// Used to define implementations for above prototypes.
#define DEFINE_ld_st(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, space, addr_cxx_ty, addr_reg_ty) \
template<> \
__device__ __forceinline__ BytePack<bytes> ld_##space<bytes>(addr_cxx_ty addr) { \
data_cxx_ty tmp; \
tmp = *((data_cxx_ty *)addr); \
BytePack<bytes> ans; \
ans.native = tmp; \
return ans; \
} \
template<> \
__device__ __forceinline__ BytePack<bytes> ld_volatile_##space<bytes>(addr_cxx_ty addr) { \
data_cxx_ty tmp; \
tmp = __builtin_nontemporal_load((data_cxx_ty *)addr); \
BytePack<bytes> ans; \
ans.native = tmp; \
return ans; \
} \
template<> \
__device__ __forceinline__ void st_##space<bytes>(addr_cxx_ty addr, BytePack<bytes> value) { \
data_cxx_ty tmp = value.native; \
*((data_cxx_ty *)addr) = tmp; \
}
// Single-byte types use 4-byte registers since there is no 1-byte register
// character for asm blocks. See https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#constraints
DEFINE_ld_st(1, uint8_t, b8, r, global, uintptr_t, l)
//DEFINE_ld_st(1, uint32_t, b8, r, shared, uint32_t, r)
DEFINE_ld_st(2, uint16_t, b16, h, global, uintptr_t, l)
//DEFINE_ld_st(2, uint16_t, b16, h, shared, uint32_t, r)
DEFINE_ld_st(4, uint32_t, b32, r, global, uintptr_t, l)
//DEFINE_ld_st(4, uint32_t, b32, r, shared, uint32_t, r)
DEFINE_ld_st(8, uint64_t, b64, l, global, uintptr_t, l)
//DEFINE_ld_st(8, uint64_t, b64, l, shared, uint32_t, r)
#undef DEFINE_ld_st
#define DEFINE_ld_st_16(space, addr_cxx_ty, addr_reg_ty) \
template<> \
__device__ __forceinline__ BytePack<16> ld_##space<16>(addr_cxx_ty addr) { \
BytePack<16> ans; \
ans.u64[0] = *((uint64_t*)addr); \
ans.u64[1] = *((uint64_t*)addr+1); \
return ans; \
} \
template<> \
__device__ __forceinline__ BytePack<16> ld_volatile_##space<16>(addr_cxx_ty addr) { \
BytePack<16> ans; \
ans.u64[0] = __builtin_nontemporal_load((uint64_t*)addr); \
ans.u64[1] = __builtin_nontemporal_load((uint64_t*)addr+1); \
return ans; \
} \
template<> \
__device__ __forceinline__ void st_##space<16>(addr_cxx_ty addr, BytePack<16> value) { \
*((uint64_t*)addr) = value.u64[0]; \
*((uint64_t*)addr+1) = value.u64[1]; \
}
DEFINE_ld_st_16(global, uintptr_t, l)
//DEFINE_ld_st_16(shared, uint32_t, r)
#undef DEFINE_ld_st_16
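// Usage sketch (illustrative helper): the integral-address load/store functions are
// meant to be fed addresses produced by cvta_to_global, e.g. a 16-byte global copy:
template<typename T>
__device__ __forceinline__ void copy16Global(T *dst, T const *src) {
  BytePack<16> tmp = ld_global<16>(cvta_to_global(src));  // load 16 bytes from src
  st_global<16>(cvta_to_global(dst), tmp);                // store them to dst
}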
////////////////////////////////////////////////////////////////////////////////
// Atomic load/store using c++ pointers.
__device__ __forceinline__ uint64_t ld_volatile_global(uint64_t *ptr) {
uint64_t ans;
ans = __builtin_nontemporal_load(ptr);
return ans;
}
__device__ __forceinline__ uint64_t ld_relaxed_sys_global(uint64_t *ptr) {
uint64_t ans;
ans = __builtin_nontemporal_load(ptr);
return ans;
}
__device__ __forceinline__ uint64_t ld_acquire_sys_global(uint64_t *ptr) {
uint64_t ans;
ans = __atomic_load_n(ptr, __ATOMIC_SEQ_CST);
return ans;
}
__device__ __forceinline__ void st_volatile_global(uint64_t *ptr, uint64_t val) {
__builtin_nontemporal_store(val, ptr);
}
__device__ __forceinline__ void st_relaxed_sys_global(uint64_t *ptr, uint64_t val) {
__builtin_nontemporal_store(val, ptr);
}
__device__ __forceinline__ void st_release_sys_global(uint64_t *ptr, uint64_t val) {
__atomic_store_n(ptr, val, __ATOMIC_SEQ_CST);
}
__device__ __forceinline__ void fence_acq_rel_sys() {
//asm volatile("membar.sys;" ::: "memory");
}
__device__ __forceinline__ void fence_acq_rel_gpu() {
//asm volatile("membar.gl;" ::: "memory");
}
////////////////////////////////////////////////////////////////////////////////
// Multimem stores of BytePack<?>.
template<int Size>
__device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack<Size> val);
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
template<>
__device__ __forceinline__ void multimem_st_global<0>(uintptr_t addr, BytePack<0> val) {
// nop
}
template<>
__device__ __forceinline__ void multimem_st_global<1>(uintptr_t addr, BytePack<1> val) {
asm volatile("st.global.b8 [%0], %1;" :: "l"(addr), "r"((uint32_t)val.u8) : "memory");
}
template<>
__device__ __forceinline__ void multimem_st_global<2>(uintptr_t addr, BytePack<2> val) {
asm volatile("st.global.b16 [%0], %1;" :: "l"(addr), "h"(val.u16) : "memory");
}
template<>
__device__ __forceinline__ void multimem_st_global<4>(uintptr_t addr, BytePack<4> val) {
asm volatile("multimem.st.global.b32 [%0], %1;" :: "l"(addr), "r"(val.u32) : "memory");
}
template<>
__device__ __forceinline__ void multimem_st_global<8>(uintptr_t addr, BytePack<8> val) {
asm volatile("multimem.st.global.b64 [%0], %1;" :: "l"(addr), "l"(val.u64) : "memory");
}
template<>
__device__ __forceinline__ void multimem_st_global<16>(uintptr_t addr, BytePack<16> val) {
asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};"
:: "l"(addr), "r"(val.u32[0]), "r"(val.u32[1]), "r"(val.u32[2]), "r"(val.u32[3])
: "memory");
}
#else
template<int Size>
__device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack<Size> val) {
// nop
}
#endif
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
// Warp-uniform memory copy from shared address (not generic) to global memory.
// The number of bytes copied is `min(MaxBytes, nBytesAhead)`; a negative value
// is interpreted as zero. EltSize is the guaranteed alignment of the addresses and sizes.
template<int EltSize, int MaxBytes, bool Multimem, typename IntBytes>
__device__ __forceinline__ void copyGlobalShared_WarpUnrolled(
int lane, uintptr_t dstAddr, uint32_t srcAddr, IntBytes nBytesAhead
) {
static_assert(std::is_signed<IntBytes>::value, "`IntBytes` must be a signed integral type.");
int nBytes = min(nBytesAhead, (IntBytes)MaxBytes);
int nFrontBytes = min(nBytes, (16 - int(dstAddr%16))%16);
int nMiddleBytes = (nBytes-nFrontBytes) & -16;
int nBackBytes = (nBytes-nFrontBytes) % 16;
{ int backLane = WARP_SIZE-1 - lane;
bool hasFront = lane*EltSize < nFrontBytes;
bool hasBack = backLane*EltSize < nBackBytes;
int offset = hasFront ? lane*EltSize : (nBytes - (backLane+1)*EltSize);
if (hasFront | hasBack) {
BytePack<EltSize> tmp = ld_shared<EltSize>(srcAddr+offset);
// Can't use multimem_st since it doesn't support EltSize==2
st_global<EltSize>(dstAddr+offset, tmp);
}
}
srcAddr += nFrontBytes;
int srcMisalign = EltSize < 4 ? (srcAddr%4) : 0;
srcAddr += -srcMisalign + lane*16;
dstAddr += nFrontBytes + lane*16;
nMiddleBytes -= lane*16;
#pragma unroll
for (int u=0; u < divUp(MaxBytes, WARP_SIZE*16); u++) {
if (nMiddleBytes <= 0) break;
union {
BytePack<4> b4[4];
BytePack<16> b16;
};
b4[0] = ld_shared<4>(srcAddr + 0*4);
b4[1] = ld_shared<4>(srcAddr + 1*4);
b4[2] = ld_shared<4>(srcAddr + 2*4);
b4[3] = ld_shared<4>(srcAddr + 3*4);
if (srcMisalign != 0) {
BytePack<4> b4_4 = ld_shared<4>(srcAddr + 4*4);
b4[0].u32 = __funnelshift_r(b4[0].u32, b4[1].u32, srcMisalign*8);
b4[1].u32 = __funnelshift_r(b4[1].u32, b4[2].u32, srcMisalign*8);
b4[2].u32 = __funnelshift_r(b4[2].u32, b4[3].u32, srcMisalign*8);
b4[3].u32 = __funnelshift_r(b4[3].u32, b4_4.u32, srcMisalign*8);
}
if (Multimem) multimem_st_global<16>(dstAddr, b16);
else st_global<16>(dstAddr, b16);
srcAddr += WARP_SIZE*16;
dstAddr += WARP_SIZE*16;
nMiddleBytes -= WARP_SIZE*16;
}
}
#else
template<int EltSize, int MaxBytes, bool Multimem, typename IntBytes>
__device__ __forceinline__ void copyGlobalShared_WarpUnrolled(
int lane, uintptr_t dstAddr, uint32_t srcAddr, IntBytes nBytesAhead
) {
// nop
}
#endif
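// Worked example for the CUDA path above (hypothetical numbers): with EltSize=4,
// MaxBytes >= 100, dstAddr%16 == 12 and nBytesAhead == 100, the copy splits into
//   nFrontBytes  = min(100, (16-12)%16) = 4    (brings dstAddr up to 16-byte alignment)
//   nMiddleBytes = (100-4) & -16        = 96   (moved as 16-byte packs, one per lane per trip)
//   nBackBytes   = (100-4) % 16         = 0
// Front and back bytes are moved one EltSize element per lane; the middle uses 16-byte
// packs, via multimem stores when Multimem is set.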
#endif
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_PRIMITIVES_H_
#define NCCL_PRIMITIVES_H_
#include <type_traits>
#include "reduce_kernel.h" // for reduction funcs
#include "common_kernel.h"
#include "common.h"
#define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000
#define barrier_by_group() do { \
if (nthreads == NCCL_MAX_NTHREADS) { \
__asm__ __volatile__("s_waitcnt vmcnt(0) lgkmcnt(0)\ns_barrier\ns_waitcnt lgkmcnt(0)"); \
} else { \
const int w = threadIdx.x/WARP_SIZE; \
const int wid = threadIdx.x%WARP_SIZE; \
__threadfence(); \
if (wid == 0) { \
barrier_next[w] += nthreads/WARP_SIZE; \
atomicAdd((unsigned long long *)barriers, 1); \
while (atomicAdd((unsigned long long *)barriers, 0) < barrier_next[w]) __builtin_amdgcn_s_sleep(1); \
__asm__ __volatile__("s_wakeup"); \
} \
} \
} while (0)
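// barrier_by_group() has two paths: when the whole block participates
// (nthreads == NCCL_MAX_NTHREADS) a single hardware s_barrier suffices; otherwise a
// software barrier is used in which lane 0 of each wavefront bumps the shared
// `barriers` counter and spins (with s_sleep) until all participating wavefronts of
// the group have arrived.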
/* Protocol classes: ProtoSimple, ProtoLL, ProtoLL128
* We use these as template args to the Primitives class instead of integral
* enums (e.g. NCCL_PROTO_LL) because for SIMPLE we need to carry a few extra
* numbers. These types also hold methods which expose, through a consistent
* interface, the numbers that matter to how each protocol operates, so the
* algorithm code can remain protocol-parametric.
*/
template<int SlicePerChunk_1, int StepPerSlice_1, int Unroll_1 = COLL_UNROLL, int MultimemSrcs_1 = 0, int MultimemDsts_1 = 0>
struct ProtoSimple {
static constexpr int Id = NCCL_PROTO_SIMPLE;
static constexpr int SlicePerChunk = SlicePerChunk_1;
static constexpr int StepPerSlice = StepPerSlice_1;
static constexpr int Unroll = Unroll_1;
static constexpr int MultimemSrcs = MultimemSrcs_1;
static constexpr int MultimemDsts = MultimemDsts_1;
// Data bytes (no flags etc) in one step of the fifo queue.
__device__ static int calcBytePerStep() {
return ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
}
// Granularity of data bytes transferred per thread.
__device__ static int calcBytePerGrain() {
return sizeof(uint64_t); // Bogus value? Nobody queries this metric for simple.
}
// Group width is how many consecutive group values a subchannel occupies.
static constexpr int MaxGroupWidth = 1;
};
struct ProtoLL {
static constexpr int Id = NCCL_PROTO_LL;
// Data bytes (no flags etc) in one step of the fifo queue.
__device__ static int calcBytePerStep() {
return ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/2; // Half is data
}
// Granularity of data bytes transferred per thread.
__device__ static int calcBytePerGrain() {
return sizeof(uint64_t); // One 16-byte line has 8 bytes of data
}
// Group width is how many consecutive group values a subchannel occupies.
static constexpr int MaxGroupWidth = 1;
};
struct ProtoLL128 {
static constexpr int Id = NCCL_PROTO_LL128;
// Data bytes (no flags etc) in one step of the fifo queue.
__device__ static int calcBytePerStep() {
return (ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS)*NCCL_LL128_DATAELEMS/NCCL_LL128_LINEELEMS;
}
// Granularity of data bytes transferred per thread.
__device__ static int calcBytePerGrain() {
return NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_DATAELEMS*sizeof(uint64_t)/NCCL_LL128_LINEELEMS;
}
// Group width is how many consecutive group values a subchannel occupies.
static constexpr int MaxGroupWidth = 1;
};
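// Usage sketch (hypothetical helper): algorithm code stays protocol-parametric by
// querying these classes through a template parameter; the same helper works for
// ProtoSimple<...>, ProtoLL and ProtoLL128 alike.
template<typename Proto>
__device__ __forceinline__ int protoStepBytes() {
  return Proto::calcBytePerStep();  // data bytes per fifo step for this protocol
}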
/* Fan (as in fan-in & fan-out) classes hold recv and send counts. The template
* arguments are static bounds on the maximum values. Asymmetric counts are
* independent. Symmetric is a static guarantee that nrecv==nsend, so it only
* stores one value at runtime. This optimization saves a 32-bit register but,
* more importantly, uses fewer predicate registers when unrolling loops.
*/
template<int MaxRecv_, int MaxSend_>
struct FanAsymmetric {
static constexpr int MaxRecv = MaxRecv_, MaxSend = MaxSend_;
int nr, ns;
FanAsymmetric() = default;
__device__ FanAsymmetric(int nrecv, int nsend): nr(nrecv), ns(nsend) {
// assert(nrecv <= MaxRecv && nsend <= MaxSend);
}
__device__ int nrecv() const { return MaxRecv ? nr : 0; }
__device__ int nsend() const { return MaxSend ? ns : 0; }
};
template<int MaxArity>
struct FanSymmetric {
static constexpr int MaxRecv = MaxArity, MaxSend = MaxArity;
int n;
FanSymmetric() = default;
__device__ FanSymmetric(int nrecv, int nsend): n(nrecv) {
// assert(nrecv == nsend && nrecv <= MaxArity);
}
__device__ int nrecv() const { return n; }
__device__ int nsend() const { return n; }
};
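// Usage note (illustrative): a ring algorithm always has exactly one receive peer and
// one send peer, so FanSymmetric<1> suffices; a tree with several children and one
// parent would use FanAsymmetric<MaxChildren, 1>, with MaxChildren being whatever
// static bound the algorithm guarantees.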
// The primitives class. Specialized per protocol in the other headers.
template<typename T, typename RedOp, typename Fan, int Direct, typename Proto, int P2p>
class Primitives;
// Used by LL & LL128 to implement direct members in the naive way.
template<typename RealPrimitives>
struct PrimitivesWithoutDirect {
__device__ void directSend(intptr_t inpIx, intptr_t outIx, int eltN) {
static_cast<RealPrimitives*>(this)->send(inpIx, eltN);
}
__device__ void directSendFromOutput(intptr_t outIx, int eltN) {
static_cast<RealPrimitives*>(this)->sendFromOutput(outIx, eltN);
}
__device__ void directRecv(intptr_t outIx, int eltN) {
static_cast<RealPrimitives*>(this)->recv(outIx, eltN, /*postOp=*/false);
}
__device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
static_cast<RealPrimitives*>(this)->copySend(inpIx, outIx, eltN, postOp);
}
__device__ void directRecvCopySend(intptr_t outIx, int eltN) {
static_cast<RealPrimitives*>(this)->recvCopySend(outIx, eltN, /*postOp=*/false);
}
__device__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
// Direct is only for the send part
static_cast<RealPrimitives*>(this)->recvReduceCopySend(inpIx, outIx, eltN, postOp);
}
};
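// The direct* wrappers above rely on CRTP: each protocol specialization inherits as
//   class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>
//     : public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>>
// so e.g. directSend() simply forwards to the derived class's send().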
#include "prims_simple.h"
#include "prims_ll.h"
#include "prims_ll128.h"
#endif
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#if defined(ENABLE_NPKIT)
#include "npkit/npkit.h"
#endif
#ifdef __GFX11__
#define LL_STORE(SRC, DST) \
__atomic_store_n((DST), (SRC), __ATOMIC_RELAXED)
#define LL_LOAD(SRC) \
__atomic_load_n(SRC, __ATOMIC_RELAXED)
#else
#define LL_STORE(SRC, DST) \
__builtin_nontemporal_store((SRC), (DST))
#define LL_LOAD(SRC) \
__builtin_nontemporal_load(SRC)
#endif
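// LL_LOAD/LL_STORE move one 8-byte half of an ncclLLFifoLine at a time: on gfx11 they
// expand to relaxed C++ atomics, elsewhere to nontemporal load/store builtins.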
template<typename T, typename RedOp, typename Fan, int Direct, int P2p>
class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>> {
// When Fan::MaxRecv == 0 we still need to force MaxRecv to 1 for this to compile,
// because the recv buffers below are declared with MaxRecv entries even in send-only cases.
static constexpr int MaxRecv = Fan::MaxRecv > 1 ? Fan::MaxRecv : 1;
static constexpr int MaxSend = Fan::MaxSend;
static constexpr int Input=0, Output=1;
RedOp redOp;
const int tid;
const int nthreads;
const int wid;
const int group;
const int stepLines;
Fan fan;
T *userBufs[2];
struct ncclConnInfo* recvConn = NULL;
volatile uint64_t* recvConnHeadPtr = NULL;
uint64_t recvConnHead;
struct ncclConnInfo* sendConn = NULL;
volatile int* sendConnFifoPtr = NULL;
volatile uint64_t* sendConnHeadPtr = NULL;
uint64_t sendConnHead;
uint64_t sendConnHeadCache; // Cache last seen value
uint64_t recvStep[MaxRecv];
uint64_t sendStep[MaxSend];
union ncclLLFifoLine* recvBuff[MaxRecv];
union ncclLLFifoLine* sendBuff[MaxSend];
#ifdef HYGON_SDMA_FEATURE
public:
uint32_t ringIx;
uint32_t useSdmaCopy;
uint32_t sdmaMinCopySize;
uint32_t sdmaCountEnabe;
uint32_t sdmaCopyCount;
uint32_t allCopyCount;
private:
#endif
#if defined(ENABLE_NPKIT)
public:
int npKitCtxIdx = 0;
uint64_t npKitDataProcessEntryTime = 0;
uint64_t npKitDataProcessExitTime = 0;
uint64_t npKitDataProcessTotalTime = 0;
private:
#endif
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
uint64_t npKitWaitRecvDataProcessSize = 0;
uint64_t npKitWaitRecvEntryTime = 0;
uint64_t npKitWaitRecvExitTime = 0;
uint64_t npKitWaitRecvTotalTime = 0;
#endif
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepLines; }
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepLines; }
inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
uint64_t* barriers;
uint64_t* barrier_next;
inline __device__ void barrier() {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
if (nthreads != WARP_SIZE)
barrier_by_group();
#else
asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group));
#endif
}
uint32_t abort = 0;
inline __device__ int checkAbort(int &spins, int send) {
spins++;
if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
abort = __atomic_load_n((ncclShmem.comm.abortFlag), __ATOMIC_SEQ_CST);
spins = 0;
}
return abort;
}
inline __device__ void waitSend(int nbytes) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY, nbytes, 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
if (sendConnHeadPtr) {
int spins = 0;
while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) {
__builtin_amdgcn_s_sleep(1);
sendConnHeadCache = atomicAdd((unsigned long long *)sendConnHeadPtr, 0);
if (checkAbort(spins, 1)) break;
}
__asm__ __volatile__("s_wakeup");
if (sendConnFifoPtr) {
int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? stepLines*sizeof(union ncclLLFifoLine) : nbytes;
__atomic_store_n(sendConnFifoPtr+sendConnHead%NCCL_STEPS, (size), __ATOMIC_SEQ_CST);
}
sendConnHead += 1;
}
barrier();
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT, nbytes, 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
inline __device__ void incRecv(int i) {
recvStep[i] += 1;
}
inline __device__ void postRecv() {
barrier();
if (recvConnHeadPtr) STORE(recvConnHeadPtr, recvConnHead += 1);
}
inline __device__ void incSend(int i, int offset) {
// LL cleanup: write all flags in the slice to make sure we don't get
// data corruption when the flag wraps around.
if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
for (int o = offset; o<stepLines; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
}
sendStep[i]++;
}
__device__ uint64_t readLL(int offset, int i) {
union ncclLLFifoLine* src = recvPtr(i) + offset;
uint32_t flag = recvFlag(i);
uint32_t data1, flag1, data2, flag2;
int spins = 0;
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
int npkitWaitRecvSpins = 0;
if (tid == 0) {
npKitWaitRecvEntryTime = NPKIT_GET_GPU_TIMESTAMP();
}
#endif
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
union ncclLLFifoLine i4;
do {
i4.v[0] = LL_LOAD(src->v);
i4.v[1] = LL_LOAD(src->v+1);
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
npkitWaitRecvSpins++;
#endif
if (checkAbort(spins, 0)) break;
} while ((i4.flag1 != flag) || (i4.flag2 != flag));
uint64_t val64 = (uint64_t)(i4.data1) + (((uint64_t)i4.data2) << 32);
#else
do {
asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
npkitWaitRecvSpins++;
#endif
if (checkAbort(spins, 0)) break;
} while ((flag1 != flag) || (flag2 != flag));
uint64_t val64 = data1 + (((uint64_t)data2) << 32);
#endif
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
if (tid == 0) {
npKitWaitRecvExitTime = NPKIT_GET_GPU_TIMESTAMP();
npKitWaitRecvTotalTime += (npKitWaitRecvExitTime - npKitWaitRecvEntryTime) * (npkitWaitRecvSpins - 1) / npkitWaitRecvSpins;
}
#endif
return val64;
}
template<int BeginIx>
__device__ void readLLBeginAll(int offset, ncclLLFifoLine(&line)[MaxRecv]) {
#pragma unroll
for (int i=BeginIx; i < MaxRecv; i++) {
if (i < fan.nrecv()) {
union ncclLLFifoLine* src = recvPtr(i) + offset;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
line[i].v[0] = LL_LOAD(src->v);
line[i].v[1] = LL_LOAD(src->v+1);
#else
asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4));
#endif
}
}
}
__device__ uint64_t readLLFinish(int offset, ncclLLFifoLine(&line)[MaxRecv], int i) {
union ncclLLFifoLine* src = recvPtr(i) + offset;
uint32_t flag = recvFlag(i);
int spins = 0;
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
int npkitWaitRecvSpins = 0;
if (tid == 0) {
npKitWaitRecvEntryTime = NPKIT_GET_GPU_TIMESTAMP();
}
#endif
do {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
line[i].v[0] = LL_LOAD(src->v);
line[i].v[1] = LL_LOAD(src->v+1);
#else
asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4));
#endif
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
npkitWaitRecvSpins++;
#endif
if (checkAbort(spins, 0)) break;
} while(line[i].flag1 != flag || line[i].flag2 != flag);
uint64_t val64 = line[i].data1 + (((uint64_t)line[i].data2) << 32);
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
if (tid == 0) {
npKitWaitRecvExitTime = NPKIT_GET_GPU_TIMESTAMP();
npKitWaitRecvTotalTime += (npKitWaitRecvExitTime - npKitWaitRecvEntryTime) * (npkitWaitRecvSpins - 1) / npkitWaitRecvSpins;
}
#endif
return val64;
}
__device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
union ncclLLFifoLine i4;
i4.data1 = val & 0xffffffff;
i4.flag1 = flag;
i4.data2 = (val >> 32);
i4.flag2 = flag;
LL_STORE(i4.v[0], dst->v);
LL_STORE(i4.v[1], dst->v+1);
#else
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
#endif
}
static constexpr int EltPerLine = sizeof(uint64_t)/sizeof(T);
template<typename U>
__device__ static U load(U *src) {
union {
U elt;
uint8_t u1;
uint16_t u2;
uint32_t u4;
uint64_t u8;
};
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
if(sizeof(U) == 1)
u1 = LL_LOAD((uint8_t*)src);
else if(sizeof(U) == 2)
u2 = LL_LOAD((uint16_t*)src);
else if(sizeof(U) == 4)
u4 = LL_LOAD((uint32_t*)src);
else
u8 = LL_LOAD((uint64_t*)src);
#else
if(sizeof(U) == 1)
asm("ld.volatile.global.b8 %0,[%1];" : "=r"(u4) : "l"(src));
else if(sizeof(U) == 2)
asm("ld.volatile.global.b16 %0,[%1];" : "=h"(u2) : "l"(src));
else if(sizeof(U) == 4)
asm("ld.volatile.global.b32 %0,[%1];" : "=r"(u4) : "l"(src));
else
asm("ld.volatile.global.b64 %0,[%1];" : "=l"(u8) : "l"(src));
#endif
return elt;
}
template<typename U>
__device__ static void store(U *dst, U val) {
union {
U elt;
uint8_t u1;
uint16_t u2;
uint32_t u4;
uint64_t u8;
};
elt = val;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
if(sizeof(U) == 1)
LL_STORE(u1, (uint8_t*)dst);
else if(sizeof(U) == 2)
LL_STORE(u2, (uint16_t*)dst);
else if(sizeof(U) == 4)
LL_STORE(u4, (uint32_t*)dst);
else
LL_STORE(u8, (uint64_t*)dst);
#else
if(sizeof(U) == 1)
asm("st.volatile.global.b8 [%0],%1;" :: "l"(dst), "r"(u4));
else if(sizeof(U) == 2)
asm("st.volatile.global.b16 [%0],%1;" :: "l"(dst), "h"(u2));
else if(sizeof(U) == 4)
asm("st.volatile.global.b32 [%0],%1;" :: "l"(dst), "r"(u4));
else
asm("st.volatile.global.b64 [%0],%1;" :: "l"(dst), "l"(u8));
#endif
}
struct DataLoader {
int misalign;
union {
uint32_t u4[sizeof(T) <= 2 ? 3 : 2];
uint64_t u8;
T elt[EltPerLine];
};
__device__ void loadBegin(T *src, int eltN) {
if (sizeof(T) <= 2) {
misalign = reinterpret_cast<uintptr_t>(src)%4;
uint32_t *p = reinterpret_cast<uint32_t*>(reinterpret_cast<uintptr_t>(src) & -uintptr_t(4));
u4[0] = load(p+0);
u4[1] = misalign + eltN*sizeof(T) > 4 ? load(p+1) : 0;
// u4[2] would be simpler, but that throws warnings on some compilers
u4[sizeof(T) <= 2 ? 2 : 0] = misalign + eltN*sizeof(T) > 8 ? load(p+2) : 0;
}
else {
#pragma unroll
for(int i=0; i < EltPerLine; i++) {
if(i==0 || i < eltN)
elt[i] = load(src + i);
}
}
}
__device__ uint64_t loadFinish() {
if (sizeof(T) <= 2) {
u4[0] = __funnelshift_r(u4[0], u4[1], 8*misalign);
// u4[2] would be simpler, but that throws warnings on some compilers
u4[1] = __funnelshift_r(u4[1], u4[sizeof(T) <= 2 ? 2 : 0], 8*misalign);
}
return u8;
}
};
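// Example of the sub-4-byte path above (illustrative numbers): for T = uint16_t with
// src misaligned by 2 bytes and eltN = 4, loadBegin() reads the three 4-byte-aligned
// words covering src[0..3], and loadFinish() funnel-shifts each adjacent pair right by
// 8*misalign = 16 bits, leaving u8 holding exactly the 8 data bytes starting at src.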
__device__ void storeData(T *dst, uint64_t val, int eltN) {
union {
uint64_t u8;
T elt[EltPerLine];
};
u8 = val;
#pragma unroll
for(int i=0; i < EltPerLine; i++) {
if (i==0 || i < eltN)
//store(dst+i, elt[i]);
dst[i] = elt[i];
}
}
__device__ void mscclStoreData(T *dst, uint64_t val, int eltN) {
union {
uint64_t u8;
T elt[EltPerLine];
};
u8 = val;
#pragma unroll
for(int i=0; i < EltPerLine; i++) {
if (i==0 || i < eltN)
store(dst+i, elt[i]);
// dst[i] = elt[i];
}
}
template <int RECV, int SEND, int SrcBuf, int DstBuf>
__device__ void LLGenericOp(intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp) {
constexpr int SRC = SrcBuf != -1 ? 1 : 0;
constexpr int DST = DstBuf != -1 ? 1 : 0;
T *srcElts = SrcBuf == -1 ? nullptr : userBufs[SrcBuf] + srcIx;
T *dstElts = DstBuf == -1 ? nullptr : userBufs[DstBuf] + dstIx;
// Always waitSend in case of cleanup
nelem = nelem < 0 ? 0 : nelem;
if (SEND) waitSend(divUp(nelem, EltPerLine)*sizeof(ncclLLFifoLine));
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT)
if (tid == 0) {
npKitWaitRecvTotalTime = 0;
npKitWaitRecvDataProcessSize = nelem*sizeof(T);
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY,
npKitWaitRecvDataProcessSize, 0, NPKIT_GET_GPU_TIMESTAMP(), ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
if (tid == 0) {
npKitWaitRecvTotalTime = 0;
npKitDataProcessEntryTime = NPKIT_GET_GPU_TIMESTAMP();
}
#endif
nelem -= tid*EltPerLine;
srcElts += tid*EltPerLine;
dstElts += tid*EltPerLine;
int offset = tid;
int eltPerTrip = nthreads*EltPerLine;
while (nelem > 0) {
int eltInLine = EltPerLine < nelem ? EltPerLine : nelem;
DataLoader dl;
ncclLLFifoLine line[MaxRecv];
uint64_t data, peerData;
if (SRC) {
dl.loadBegin(srcElts, eltInLine);
srcElts += eltPerTrip;
}
if (RECV) {
readLLBeginAll<1>(offset, line);
peerData = readLL(offset, 0);
}
if (SRC) {
data = dl.loadFinish();
if (SrcBuf == Input) data = applyPreOp(redOp, data);
}
if (RECV) {
data = !SRC ? peerData : applyReduce(redOp, peerData, data);
#pragma unroll MaxRecv
for (int i=1; i < MaxRecv && i < fan.nrecv(); i++) {
peerData = readLLFinish(offset, line, i);
data = applyReduce(redOp, peerData, data);
}
}
if (postOp) data = applyPostOp(redOp, data);
// Send: inter-node, then intra-node, then local
if (SEND) {
for (int i=1; i < MaxSend && i < fan.nsend(); i++)
storeLL(sendPtr(i)+offset, data, sendFlag(i));
storeLL(sendPtr(0)+offset, data, sendFlag(0));
}
if (DST) {
storeData(dstElts, data, eltInLine);
dstElts += eltPerTrip;
}
nelem -= eltPerTrip;
offset += nthreads;
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
if (tid == 0) {
npKitDataProcessExitTime = NPKIT_GET_GPU_TIMESTAMP();
npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime - npKitWaitRecvTotalTime;
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT,
npKitWaitRecvDataProcessSize, npKitWaitRecvTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
if (RECV) {
for (int i=0; i < MaxRecv; i++) incRecv(i);
postRecv();
}
if (SEND) {
for (int i=1; i < MaxSend && i < fan.nsend(); i++)
incSend(i, offset);
incSend(0, offset);
}
}
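// The public primitives further below are thin wrappers that pick the
// RECV/SEND/SrcBuf/DstBuf template flags, e.g. send() -> LLGenericOp<0, 1, Input, -1>,
// recv() -> LLGenericOp<1, 0, -1, Output>, and
// recvReduceCopySend() -> LLGenericOp<1, 1, Input, Output>.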
template <int REDUCE, int COPY, int MULTISRCS, int MULTIDSTS>
__device__ __forceinline__ void mscclGenericOp(T** srcs, int nsrcs, T** dsts, int ndsts, int nelem) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_GENERIC_OP_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_GENERIC_OP_ENTRY, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
nelem = nelem < 0 ? 0 : nelem;
T *srcElts = srcs[0];
T *dstElts = dsts[0];
nelem -= tid*EltPerLine;
srcElts += tid*EltPerLine;
dstElts += tid*EltPerLine;
if (MULTISRCS){
for (int i = 1; i < nsrcs; i++){
srcs[i] += tid*EltPerLine;
}
}
if (MULTIDSTS){
for (int i = 1; i < ndsts; i++){
dsts[i] += tid*EltPerLine;
}
}
int offset = tid;
int eltPerTrip = nthreads*EltPerLine;
while (nelem > 0) {
int eltInLine = EltPerLine < nelem ? EltPerLine : nelem;
DataLoader dl;
uint64_t data;
dl.loadBegin(srcElts, eltInLine);
srcElts += eltPerTrip;
data = dl.loadFinish();
if (REDUCE) {
uint64_t dataD;
dl.loadBegin(dstElts, eltInLine);
dataD = dl.loadFinish();
dataD = applyReduce(redOp, dataD, data);
if (MULTISRCS){
for (int i = 1; i < nsrcs; i++){
dl.loadBegin(srcs[i], eltInLine);
srcs[i] += eltPerTrip;
data = dl.loadFinish();
dataD = applyReduce(redOp, dataD, data);
}
}
mscclStoreData(dstElts, dataD, eltInLine);
dstElts += eltPerTrip;
}
if (COPY){
mscclStoreData(dstElts, data, eltInLine);
dstElts += eltPerTrip;
if (MULTIDSTS){
for (int i = 1; i < ndsts; i++){
dl.loadBegin(srcs[i], eltInLine);
srcs[i] += eltPerTrip;
data = dl.loadFinish();
mscclStoreData(dsts[i], data, eltInLine);
dsts[i] += eltPerTrip;
}
}
}
nelem -= eltPerTrip;
offset += nthreads;
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_GENERIC_OP_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_GENERIC_OP_EXIT, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
barrier();
}
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
recvBuff[i] = (union ncclLLFifoLine*)conn->buffs[NCCL_PROTO_LL];
recvStep[i] = conn->step;
if (wid == i) recvConn = conn;
}
__device__ __forceinline__ void loadRecvSync() {
if (tid >= nthreads-WARP_SIZE && wid < fan.nrecv()) {
recvConnHeadPtr = recvConn->head;
recvConnHead = recvConn->step;
}
}
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
sendBuff[i] = (union ncclLLFifoLine*)conn->buffs[NCCL_PROTO_LL];
sendStep[i] = conn->step;
if (wid == i) sendConn = conn;
}
__device__ __forceinline__ void loadSendSync() {
if (tid < fan.nsend()) {
sendConnHeadPtr = sendConn->head;
sendConnHeadCache = *sendConnHeadPtr;
sendConnHead = sendConn->step;
sendConnFifoPtr = sendConn->sizesFifo;
}
}
public:
__device__ Primitives(
const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv=0, uint8_t connIndexSend=0
):
redOp(redOpArg),
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group),
stepLines(ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) {
auto *channel = &ncclShmem.channel;
barriers = &ncclShmem.groups[group].barrier;
barrier_next = ncclShmem.groups[group].barrier_next;
// If we are going to support oneshot collNet + LL, then we would need to add connector index here
int nrecv=0, nsend=0;
// We compare with Fan::MaxRecv here because this->MaxRecv is always at least 1
while (nrecv < Fan::MaxRecv && recvPeers[nrecv] >= 0) {
loadRecvConn(&channel->peers[recvPeers[nrecv]]->recv[connIndexRecv], nrecv);
nrecv++;
}
while (nsend < MaxSend && sendPeers[nsend] >= 0) {
loadSendConn(&channel->peers[sendPeers[nsend]]->send[connIndexSend], nsend);
nsend++;
}
this->fan = Fan(nrecv, nsend);
loadRecvSync();
loadSendSync();
setDataPtrs(inputBuf, outputBuf);
}
__device__ ~Primitives() {
// Save steps for the next operation
if (tid >= nthreads-WARP_SIZE && wid < fan.nrecv())
recvConn->step = recvConnHead;
if (tid < fan.nsend())
sendConn->step = sendConnHead;
// Ensure all steps written back
barrier();
}
__device__ void setDataPtrs(void const *inputBuf, void *outputBuf) {
userBufs[Input] = (T*)inputBuf;
userBufs[Output] = (T*)outputBuf;
}
__device__ void moveDataPtrs(intptr_t delta) {
userBufs[Input] += delta;
userBufs[Output] += delta;
}
__device__ void send(intptr_t inpIx, int eltN) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<0, 1, Input, -1>(inpIx, -1, eltN, false);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
__device__ void sendFromOutput(intptr_t outIx, int eltN) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<0, 1, Output, -1>(outIx, -1, eltN, false);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
__device__ void recv(intptr_t outIx, int eltN, bool postOp=false) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<1, 0, -1, Output>(-1, outIx, eltN, postOp);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
__device__ void recvReduceSend(intptr_t inpIx, int eltN) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<1, 1, Input, -1>(inpIx, -1, eltN, false);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
__device__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<1, 0, Input, Output>(inpIx, outIx, eltN, postOp);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
__device__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<0, 1, Input, Output>(inpIx, outIx, eltN, postOp);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
__device__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<1, 1, -1, Output>(-1, outIx, eltN, postOp);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
__device__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
__device__ void recvSend(int eltN) {
return LLGenericOp<1, 1, -1, -1>(-1, -1, eltN, false);
}
// MSCCL primitives
__device__ void sendWithBarrier(intptr_t inpIx, int eltN) {
send(inpIx, eltN);
// This is the only primitive/instruction without a barrier at the end, so add one here
barrier();
}
__device__ void localCopy(T* srcs, T* dsts, int eltN) {
return mscclGenericOp<0,1,0,0>(&srcs, 1, &dsts, 1, eltN);
}
__device__ void reduce(T** srcs, int nsrcs, T** dsts, int ndsts, int eltN) {
if (nsrcs == 1) {
return mscclGenericOp<1,0,0,0>(srcs, 1, dsts, 1, eltN);
} else {
return mscclGenericOp<1,0,1,0>(srcs, nsrcs, dsts, 1, eltN);
}
}
};
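// Construction sketch (hypothetical call site): a collective kernel typically
// instantiates this specialization along the lines of
//   Primitives<T, RedOp, FanSymmetric<1>, 0, ProtoLL, 0>
//     prims(tid, nthreads, &recvPeer, &sendPeer, inputBuf, outputBuf, redOpArg);
// and then drives it one step at a time with send()/recvReduceSend()/recvCopySend()/recv().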
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#if defined(ENABLE_NPKIT)
#include "npkit/npkit.h"
#endif
#define NCCL_LL128_FLAGTHREAD (NCCL_LL128_LINEELEMS-1)
#ifndef RCCL_USE_WBINVL1_VOL
#if defined(__GFX8__) || defined(__GFX9__)
#define RCCL_USE_WBINVL1_VOL 1
#else
#define RCCL_USE_WBINVL1_VOL 0
#endif
#endif
template<typename T, typename RedOp, typename Fan, int Direct, int P2p>
class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>> {
static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
static constexpr int Input=0, Output=1;
RedOp redOp;
const int tid;
const int nthreads;
const int wid;
const int stepSize;
const int warp;
const int warpInBlock; // warp index in thread block
const bool flagThread;
const int group;
Fan fan;
T *userBufs[2];
struct ncclConnInfo* recvConn = NULL;
volatile uint64_t* recvConnHeadPtr = NULL;
uint64_t recvConnHead;
struct ncclConnInfo* sendConn = NULL;
volatile int* sendConnFifoPtr = NULL;
volatile uint64_t* sendConnTailPtr = NULL;
uint64_t sendConnTail;
volatile uint64_t* sendConnHeadPtr = NULL;
uint64_t sendConnHead;
uint64_t sendConnHeadCache; // Cache last seen value
uint64_t recvStep[MaxRecv];
uint64_t sendStep[MaxSend];
uint64_t* recvBuff[MaxRecv];
uint64_t* sendBuff[MaxSend];
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
inline __device__ uint64_t* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
inline __device__ uint64_t* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
inline __device__ uint64_t recvFlag(int i) { return recvStep[i]+1; }
inline __device__ uint64_t sendFlag(int i) { return sendStep[i]+1; }
uint64_t* barriers;
uint64_t* barrier_next;
#ifdef HYGON_SDMA_FEATURE
public:
uint32_t ringIx;
uint32_t useSdmaCopy;
uint32_t sdmaMinCopySize;
uint32_t sdmaCountEnabe;
uint32_t sdmaCopyCount;
uint32_t allCopyCount;
private:
#endif
#if defined(ENABLE_NPKIT)
public:
int npKitCtxIdx = 0;
uint64_t npKitDataProcessEntryTime = 0;
uint64_t npKitDataProcessExitTime = 0;
uint64_t npKitDataProcessTotalTime = 0;
private:
#endif
inline __device__ void barrier() {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
if (nthreads != WARP_SIZE)
barrier_by_group();
#else
asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group));
#endif
}
uint32_t abort = 0;
uint32_t* sync;
inline __device__ int checkAbort(int &spins, int i, int send) {
spins++;
if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
abort = __atomic_load_n(ncclShmem.comm.abortFlag, __ATOMIC_SEQ_CST);
spins = 0;
}
return abort;
}
inline __device__ void waitSend(int nbytes) {
if (sendConnHeadPtr) {
int spins = 0;
while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) {
__builtin_amdgcn_s_sleep(1);
sendConnHeadCache = atomicAdd_system((unsigned long long *)sendConnHeadPtr, 0);
if (checkAbort(spins, wid, 1)) break;
}
__asm__ __volatile__("s_wakeup");
if (sendConnFifoPtr) {
__atomic_store_n(sendConnFifoPtr+sendStep[wid]%NCCL_STEPS, nbytes, __ATOMIC_SEQ_CST);
}
sendConnHead += 1;
}
}
inline __device__ void postRecv() {
if (recvConnHeadPtr) STORE(recvConnHeadPtr, recvConnHead += 1);
}
inline __device__ void postSend() {
if (sendConnTailPtr) { __threadfence(); STORE((unsigned long long *)sendConnTailPtr, sendConnTail += 1); }
}
template<int WordPerThread>
__device__ __forceinline__ void loadRegsBegin(uint64_t(&regs)[WordPerThread], T const *src, int eltN) {
constexpr int EltPer16B = 16/sizeof(T);
/* We are aligned to 16 bytes, so load directly to registers, no shmem needed.
* Flag threads load half as much data, which gets shuffled to the even
* registers during Finish. The point of splitting into two phases is to
* defer that shuffle, which incurs a dependency stall, until after other
* memops have been launched by the caller.
*/
#pragma unroll
for(int g=0; g < WordPerThread/2; g++) {
int ix = g*WARP_SIZE - 16*(g/2) + wid - (g%2)*(wid/4);
if(!flagThread || g%2==0) {
if(ix*EltPer16B < eltN) {
if(reinterpret_cast<uintptr_t>(src)%4 == 0) {
regs[2*g+0] = __builtin_nontemporal_load((uint64_t*)(src + ix*EltPer16B));
regs[2*g+1] = __builtin_nontemporal_load((uint64_t*)(src + ix*EltPer16B)+1);
} else {
union {
uint64_t regs64[WordPerThread];
uint32_t regs32[WordPerThread*2];
uint16_t regs16[WordPerThread*4];
uint8_t regs8[WordPerThread*8];
};
if (sizeof(T) == 8) {
uint64_t *src64 = (uint64_t*)(src+ix*EltPer16B);
for (int i=0; i < 2; i++)
regs64[2*g+i] = __builtin_nontemporal_load(src64+i);
} else if (sizeof(T) == 4) {
uint32_t *src32 = (uint32_t*)(src+ix*EltPer16B);
for (int i=0; i < 2*sizeof(uint64_t)/sizeof(T); i++)
regs32[2*g+i] = __builtin_nontemporal_load(src32+i);
} else if (sizeof(T) == 2) {
uint16_t *src16 = (uint16_t*)(src+ix*EltPer16B);
for (int i=0; i < 2*sizeof(uint64_t)/sizeof(T); i++)
regs16[2*g+i] = __builtin_nontemporal_load(src16+i);
} else if (sizeof(T) == 1) {
uint8_t *src8 = (uint8_t*)(src+ix*EltPer16B);
for (int i=0; i < 2*sizeof(uint64_t)/sizeof(T); i++)
regs8[2*g+i] = __builtin_nontemporal_load(src8+i);
}
regs[2*g+0] = regs64[2*g+0];
regs[2*g+1] = regs64[2*g+1];
}
}
}
}
}
template<int WordPerThread>
__device__ __forceinline__ void loadRegsFinish(uint64_t(&regs)[WordPerThread]) {
// Move data out of flag registers into the vacant registers.
#pragma unroll
for (int g=1; g < WordPerThread/2; g+=2) {
if (flagThread) regs[2*g] = regs[2*g-1];
}
}
template<int WordPerThread>
__device__ __forceinline__ void storeRegs(T *dst, uint64_t(&regs)[WordPerThread], int eltN) {
constexpr int EltPer16B = 16/sizeof(T);
// Reverse the Finish() register permutation.
#pragma unroll
for (int g=1; g < WordPerThread/2; g+=2) {
if (flagThread) regs[2*g-1] = regs[2*g];
}
// Write 16 bytes at a time when dst is 4-byte aligned and a full 16B line remains; otherwise fall back to element-wise stores.
int misalignment = reinterpret_cast<uintptr_t>(dst)%4;
#pragma unroll
for(int g=0; g < WordPerThread/2; g++) {
int ix = g*WARP_SIZE - 16*(g/2) + wid - (g%2)*(wid/4);
if (!flagThread || g%2==0) {
if(misalignment == 0 && (ix+1)*EltPer16B <= eltN) {
__builtin_nontemporal_store(regs[2*g+0], (uint64_t*)(dst + ix*EltPer16B));
__builtin_nontemporal_store(regs[2*g+1], (uint64_t*)(dst + ix*EltPer16B)+1);
} else {
union {
uint64_t regs64[WordPerThread];
uint32_t regs32[WordPerThread*2];
uint16_t regs16[WordPerThread*4];
uint8_t regs8[WordPerThread*8];
};
regs64[2*g+0] = regs[2*g+0];
regs64[2*g+1] = regs[2*g+1];
int remaining = eltN - ix*EltPer16B;
if (sizeof(T) == 8) {
uint64_t *dst64 = (uint64_t*)(dst+ix*EltPer16B);
for (int i=0; i < 2 && i < remaining; i++)
__builtin_nontemporal_store(regs64[2*g+i], dst64+i);
} else if (sizeof(T) == 4) {
uint32_t *dst32 = (uint32_t*)(dst+ix*EltPer16B);
for (int i=0; i < 2*sizeof(uint64_t)/sizeof(T) && i < remaining; i++)
__builtin_nontemporal_store(regs32[2*g+i], dst32+i);
} else if (sizeof(T) == 2) {
uint16_t *dst16 = (uint16_t*)(dst+ix*EltPer16B);
for (int i=0; i < 2*sizeof(uint64_t)/sizeof(T) && i < remaining; i++)
__builtin_nontemporal_store(regs16[2*g+i], dst16+i);
} else if (sizeof(T) == 1) {
uint8_t *dst8 = (uint8_t*)(dst+ix*EltPer16B);
for (int i=0; i < 2*sizeof(uint64_t)/sizeof(T) && i < remaining; i++)
__builtin_nontemporal_store(regs8[2*g+i], dst8+i);
}
}
}
}
}
#define WARP_MASK 0xffffffff
template <int ELEMS_PER_THREAD, int RECV, int SEND, int SrcBuf, int DstBuf>
__device__ __forceinline__ void recvReduceSendCopy(uint64_t(&v)[ELEMS_PER_THREAD], int ll128Offset, bool postOp) {
constexpr int SRC = SrcBuf != -1 ? 1 : 0;
uint64_t vr[ELEMS_PER_THREAD];
__syncwarp();
/************************ Wait first recv ********************/
if (RECV) {
uint64_t* ptr = recvPtr(0)+ll128Offset;
uint64_t flag = recvFlag(0);
bool needReload;
int spins = 0;
do {
needReload = false;
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
vr[u] = __builtin_nontemporal_load(ptr+u*WARP_SIZE);
vr[u+1] = __builtin_nontemporal_load(ptr+u*WARP_SIZE+1);
needReload |= flagThread && (vr[u+1] != flag);
}
needReload &= (0 == checkAbort(spins, 0, 0));
} while (__any(needReload));
}
/************* Finish register load **************/
if (SRC) {
// By deferring register shuffle here we've overlapped spinning on first
// peer's data with memory loads of src data.
loadRegsFinish(v);
if (SrcBuf == Input) {
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
v[u] = applyPreOp(redOp, v[u]);
if (!flagThread)
v[u+1] = applyPreOp(redOp, v[u+1]);
}
}
}
/************************ Recv rest *********************/
if (RECV) {
{ // Consume data from first recv
uint64_t* ptr = recvPtr(0)+ll128Offset;
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
v[u] = SRC ? applyReduce(redOp, vr[u], v[u]) : vr[u];
v[u+1] = SRC ? applyReduce(redOp, vr[u+1], v[u+1]) : vr[u+1];
}
}
for (int i=1; i<MaxRecv && i<fan.nrecv(); i++) {
uint64_t flag = recvFlag(i);
uint64_t* ptr = recvPtr(i)+ll128Offset;
bool needReload;
int spins = 0;
do {
needReload = false;
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
vr[u] = __builtin_nontemporal_load(ptr+u*WARP_SIZE);
vr[u+1] = __builtin_nontemporal_load(ptr+u*WARP_SIZE+1);
needReload |= flagThread && (vr[u+1] != flag);
}
needReload &= (0 == checkAbort(spins, i, 0));
} while (__any(needReload));
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
v[u] = applyReduce(redOp, vr[u], v[u]);
v[u+1] = applyReduce(redOp, vr[u+1], v[u+1]);
}
}
}
/********************** End Recv ************************/
if (postOp) {
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
v[u] = applyPostOp(redOp, v[u]);
v[u+1] = applyPostOp(redOp, v[u+1]);
}
}
#if RCCL_USE_WBINVL1_VOL
if (tid == 0) __builtin_amdgcn_buffer_wbinvl1();
#endif
/************************ Send **************************/
if (SEND) {
for (int i=1; i<MaxSend && i<fan.nsend(); i++) {
uint64_t flag = sendFlag(i);
uint64_t* ptr = sendPtr(i)+ll128Offset;
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
__builtin_nontemporal_store(v[u], ptr+u*WARP_SIZE);
__builtin_nontemporal_store(flagThread ? flag : v[u+1], ptr+u*WARP_SIZE+1);
}
}
uint64_t flag = sendFlag(0);
uint64_t* ptr = sendPtr(0)+ll128Offset;
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
__builtin_nontemporal_store(v[u], ptr+u*WARP_SIZE);
__builtin_nontemporal_store(flagThread ? flag : v[u+1], ptr+u*WARP_SIZE+1);
}
}
/********************** End Send ************************/
}
static constexpr int WireWordPerSlice = WARP_SIZE*NCCL_LL128_SHMEM_ELEMS_PER_THREAD;
static constexpr int DataEltPerSlice = (WireWordPerSlice - WireWordPerSlice/NCCL_LL128_LINEELEMS)*(sizeof(uint64_t)/sizeof(T));
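// Worked example (constants assumed for illustration): with WARP_SIZE = 64,
// NCCL_LL128_SHMEM_ELEMS_PER_THREAD = 8 and NCCL_LL128_LINEELEMS = 16,
// WireWordPerSlice = 64*8 = 512 wire words, of which 512/16 = 32 are flag words,
// leaving 480 data words per slice; for T = float that gives
// DataEltPerSlice = 480 * (8/4) = 960 elements per slice.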
template <int RECV, int SEND, int SrcBuf, int DstBuf>
__device__ __forceinline__ void GenericOp(intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp) {
constexpr int SRC = SrcBuf != -1 ? 1 : 0;
constexpr int DST = DstBuf != -1 ? 1 : 0;
T const *srcPtr = SrcBuf == -1 ? nullptr : userBufs[SrcBuf] + srcIx;
T *dstPtr = DstBuf == -1 ? nullptr : userBufs[DstBuf] + dstIx;
int wireOffset = WireWordPerSlice*warp + 2*wid;
const int nwarps = nthreads/WARP_SIZE;
nelem = nelem < 0 ? 0 : nelem;
if (SEND) waitSend(divUp(nelem, DataEltPerSlice)*WireWordPerSlice*sizeof(uint64_t));
barrier();
nelem -= DataEltPerSlice*warp;
srcPtr += DataEltPerSlice*warp;
dstPtr += DataEltPerSlice*warp;
while (nelem > 0) {
const int eltInSlice = min(nelem, DataEltPerSlice);
uint64_t regs[NCCL_LL128_SHMEM_ELEMS_PER_THREAD];
if (SRC) loadRegsBegin(regs, srcPtr, eltInSlice);
recvReduceSendCopy<NCCL_LL128_SHMEM_ELEMS_PER_THREAD, RECV, SEND, SrcBuf, DstBuf>(regs, wireOffset, postOp);
if (DST) storeRegs(dstPtr, regs, eltInSlice);
wireOffset += WireWordPerSlice*nwarps;
srcPtr += DataEltPerSlice*nwarps;
dstPtr += DataEltPerSlice*nwarps;
nelem -= DataEltPerSlice*nwarps;
}
barrier();
if (SEND) for (int i=0; i < MaxSend; i++) sendStep[i] += 1;
if (SEND) postSend();
if (RECV) for (int i=0; i < MaxRecv; i++) recvStep[i] += 1;
if (RECV) postRecv();
}
template <int REDUCE, int COPY, int MULTISRCS, int MULTIDSTS>
__device__ __forceinline__ void mscclGenericOp(T** srcs, int nsrcs, T** dsts, int ndsts, int nelem) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_GENERIC_OP_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_GENERIC_OP_ENTRY, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
T const *srcPtr = srcs[0];
T *dstPtr = dsts[0];
int wireOffset = WireWordPerSlice*warp + 2*wid;
const int nwarps = nthreads/WARP_SIZE;
nelem = nelem < 0 ? 0 : nelem;
nelem -= DataEltPerSlice*warp;
srcPtr += DataEltPerSlice*warp;
dstPtr += DataEltPerSlice*warp;
if (MULTISRCS){
for (int i = 1; i < nsrcs; i++){
srcs[i] += DataEltPerSlice*warp;
}
}
if (MULTIDSTS){
for (int i = 1; i < ndsts; i++){
dsts[i] += DataEltPerSlice*warp;
}
}
while (nelem > 0) {
const int eltInSlice = min(nelem, DataEltPerSlice);
uint64_t regs[NCCL_LL128_SHMEM_ELEMS_PER_THREAD];
loadRegsBegin(regs, srcPtr, eltInSlice);
loadRegsFinish(regs);
if (REDUCE){
uint64_t regsD[NCCL_LL128_SHMEM_ELEMS_PER_THREAD];
loadRegsBegin(regsD, dstPtr, eltInSlice);
loadRegsFinish(regsD);
#pragma unroll
for (int u=0; u<NCCL_LL128_SHMEM_ELEMS_PER_THREAD; u+=2) {
regsD[u] = applyReduce(redOp, regs[u], regsD[u]);
if (!flagThread)
regsD[u+1] = applyReduce(redOp, regs[u+1], regsD[u+1]);
}
if (MULTISRCS){
for (int i = 1; i < nsrcs; i++){
loadRegsBegin(regs, srcs[i], eltInSlice);
loadRegsFinish(regs);
for (int u=0; u<NCCL_LL128_SHMEM_ELEMS_PER_THREAD; u+=2) {
regsD[u] = applyReduce(redOp, regs[u], regsD[u]);
if (!flagThread)
regsD[u+1] = applyReduce(redOp, regs[u+1], regsD[u+1]);
}
}
}
storeRegs(dstPtr, regsD, eltInSlice);
}
if (COPY){
storeRegs(dstPtr, regs, eltInSlice);
if (MULTIDSTS){
for (int i = 1; i < nsrcs; i++){
loadRegsBegin(regs, srcs[i], eltInSlice);
loadRegsFinish(regs);
storeRegs(dsts[i], regs, eltInSlice);
}
}
}
wireOffset += WireWordPerSlice*nwarps;
srcPtr += DataEltPerSlice*nwarps;
dstPtr += DataEltPerSlice*nwarps;
if (MULTISRCS){
for (int i = 1; i < nsrcs; i++){
srcs[i] += DataEltPerSlice*nwarps;
}
}
if (MULTIDSTS){
for (int i = 1; i < ndsts; i++){
dsts[i] += DataEltPerSlice*nwarps;
}
}
nelem -= DataEltPerSlice*nwarps;
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_GENERIC_OP_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_GENERIC_OP_EXIT, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
barrier();
}
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
recvBuff[i] = (uint64_t*)conn->buffs[NCCL_PROTO_LL128];
recvStep[i] = conn->step;
if (wid == i) recvConn = conn;
}
__device__ __forceinline__ void loadRecvSync() {
if (tid >= nthreads-WARP_SIZE && wid < fan.nrecv()) {
recvConnHeadPtr = recvConn->head;
recvConnHead = recvConn->step;
}
}
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
sendBuff[i] = (uint64_t*)conn->buffs[NCCL_PROTO_LL128];
sendStep[i] = conn->step;
if (wid == i) sendConn = conn;
}
__device__ __forceinline__ void loadSendSync() {
if (tid < fan.nsend()) {
sendConnHeadPtr = sendConn->head;
sendConnHeadCache = *sendConnHeadPtr;
sendConnHead = sendConn->step;
sendConnFifoPtr = sendConn->sizesFifo;
}
if (tid >= nthreads-WARP_SIZE && wid<fan.nsend()) {
if (sendConn->sizesFifo) {
sendConnTailPtr = sendConn->tail;
sendConnTail = sendConn->step;
}
}
}
public:
__device__ Primitives(
const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv=0, uint8_t connIndexSend=0
):
redOp(redOpArg),
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE),
warpInBlock(threadIdx.x/WARP_SIZE),
flagThread((tid%4)==3), group(group),
stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)) {
auto *channel = &ncclShmem.channel;
barriers = &ncclShmem.groups[group].barrier;
barrier_next = ncclShmem.groups[group].barrier_next;
int nrecv=0, nsend=0;
while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
loadRecvConn(&channel->peers[recvPeers[nrecv]]->recv[connIndexRecv], nrecv);
nrecv++;
}
while (nsend < MaxSend && sendPeers[nsend] >= 0) {
loadSendConn(&channel->peers[sendPeers[nsend]]->send[connIndexSend], nsend);
nsend++;
}
this->fan = Fan(nrecv, nsend);
loadRecvSync();
loadSendSync();
setDataPtrs(inputBuf, outputBuf);
}
__device__ ~Primitives() {
// Save steps for the next operation
if (tid >= nthreads-WARP_SIZE && wid < fan.nrecv())
recvConn->step = recvConnHead;
if (tid < fan.nsend())
sendConn->step = sendConnHead;
// Ensure all steps written back
barrier();
}
__device__ void setDataPtrs(void const *inputBuf, void *outputBuf) {
userBufs[Input] = (T*)inputBuf;
userBufs[Output] = (T*)outputBuf;
}
__device__ void moveDataPtrs(intptr_t delta) {
userBufs[Input] += delta;
userBufs[Output] += delta;
}
__device__ void send(intptr_t inpIx, int eltN) {
return GenericOp<0, 1, Input, -1>(inpIx, -1, eltN, false);
}
__device__ void sendFromOutput(intptr_t outIx, int eltN) {
return GenericOp<0, 1, Output, -1>(outIx, -1, eltN, false);
}
__device__ void recv(intptr_t outIx, int eltN, bool postOp=false) {
return GenericOp<1, 0, -1, Output>(-1, outIx, eltN, postOp);
}
__device__ void recvReduceSend(intptr_t inpIx, int eltN) {
return GenericOp<1, 1, Input, -1>(inpIx, -1, eltN, false);
}
__device__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
return GenericOp<1, 0, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
return GenericOp<0, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) {
return GenericOp<1, 1, -1, Output>(-1, outIx, eltN, postOp);
}
__device__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
return GenericOp<1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ void recvSend(int eltN) {
return GenericOp<1, 1, -1, -1>(-1, -1, eltN, false);
}
// MSCCL primitives
__device__ void sendWithBarrier(intptr_t inpIx, int eltN) {
send(inpIx, eltN);
}
__device__ void localCopy(T* srcs, T* dsts, int eltN) {
return mscclGenericOp<0,1,0,0>(&srcs, 1, &dsts, 1, eltN);
}
__device__ void reduce(T** srcs, int nsrcs, T** dsts, int ndsts, int eltN) {
if (nsrcs == 1) {
return mscclGenericOp<1,0,0,0>(srcs, 1, dsts, 1, eltN);
} else {
return mscclGenericOp<1,0,1,0>(srcs, nsrcs, dsts, 1, eltN);
}
}
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#if defined(ENABLE_NPKIT)
#include "npkit/npkit.h"
#endif
#include "msccl/msccl_struct.h"
#ifdef HYGON_SDMA_FEATURE
#include "op128.h"
//#define GC_COPY_DATA
typedef enum {
SDMA_TRANS_MODE_NON = 0,
SDMA_TRANS_MODE_REDUCE_SEND = 1,
SDMA_TRANS_MODE_SRC_SEND = 2,
SDMA_TRANS_MODE_RECV_SEND = 3
} sdma_trans_mode_t;
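// Mode selection (see genericOp below): SDMA offload is only attempted when sending to a single
// peer (Send == 1 && fan.nsend() == 1). REDUCE_SEND is chosen when both a local source and a
// receive are present, SRC_SEND when only a local source is present, and RECV_SEND when only
// received data is forwarded.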
template<typename T, int BytePerPack>
__device__ __forceinline__ void memsetData(int ringIx, uint64_t srcAddr, uint32_t dataSize, int val)
{
BytePack<BytePerPack> setVal;
if (srcAddr == 0 || dataSize == 0) {
PRINT_ERR("memsetData error input, ringIx:%d bid:%d copy data srcAddr:0x%lx len %d\n", ringIx, (int)blockIdx.x, srcAddr, dataSize);
return;
}
setVal.native = val;
for (int i = 0; i < dataSize; i++) {
st_global<BytePerPack>(srcAddr, setVal);
srcAddr += BytePerPack;
}
return;
}
template<typename T, int BytePerPack>
__device__ __forceinline__ int compareData(int ringIx, uint64_t srcAddr, uint64_t dstAddr, uint32_t dataSize)
{
BytePack<BytePerPack> srcVal;
BytePack<BytePerPack> dstVal;
int miscompare = 0;
if (srcAddr == 0 || dstAddr == 0 || dataSize == 0) {
PRINT_ERR("compareData error input, ringIx:%d bid:%d copy data srcAddr:0x%lx dstAddr:0x%lx len %d\n", ringIx, (int)blockIdx.x, srcAddr, dstAddr, dataSize);
return 0;
}
for (int i = 0; i < dataSize; i++) {
srcVal = ld_volatile_global<BytePerPack>(srcAddr);
dstVal = ld_volatile_global<BytePerPack>(dstAddr);
if (srcVal.native != dstVal.native) {
PRINT_INFO("compareData, ringIx:%d bid:%d miscompare index:%d srcVal[0x%lx]:%d dstVal[0x%lx]:%d\n",
ringIx, (int)blockIdx.x, i, srcAddr, srcVal.native, dstAddr, dstVal.native);
miscompare++;
}
srcAddr += BytePerPack;
dstAddr += BytePerPack;
}
if (miscompare) {
PRINT_INFO("compareData end error, ringIx:%d bid:%d miscompare count:%d dataSize:%d \n", ringIx, (int)blockIdx.x, miscompare, dataSize);
} else {
PRINT_INFO("compareData end ok, ringIx:%d bid:%d same data, last:%d\n", ringIx, (int)blockIdx.x, srcVal.native);
}
return miscompare;
}
template<typename T, int BytePerPack>
__device__ __forceinline__ int startSdmaTask(struct sdmaQueueInfo *sdmaQueue, int ringIx, uint64_t srcAddr, uint64_t dstAddr, uint32_t dataLen)
{
if (srcAddr == 0 || dstAddr == 0) {
PRINT_ERR("startSdma error input, ringIx:%d bid:%d srcAddr:%p dstAddr:%p len:%d\n", ringIx, (int)blockIdx.x, srcAddr, dstAddr, dataLen);
return -1;
}
#ifdef GC_COPY_DATA
BytePack<BytePerPack> val;
for (int i = 0; i < dataLen / BytePerPack; i++) {
val = ld_volatile_global<BytePerPack>(srcAddr);
st_global<BytePerPack>(dstAddr, val);
srcAddr += BytePerPack;
dstAddr += BytePerPack;
}
return 0;
#endif
uint32_t sdmaIndex = atomicAdd(sdmaQueue->pkgIndex, 1) % sdmaQueue->sdmaDepth;
volatile hsa_sdma_info_t *sdmaInfo = &sdmaQueue->sdmaInfo[sdmaIndex];
if (*sdmaInfo->wptr == *sdmaInfo->rptr) {
PRINT_ERR("ringIx:%d bid:%d sdma pkg is empty\n", ringIx, (int)blockIdx.x);
}
sdmaInfo->completion_signal = 1;
sdmaInfo->src_addr = srcAddr;
sdmaInfo->dst_addr = dstAddr;
sdmaInfo->data_size = dataLen;
//sdmaInfo->flag = NPKIT_GET_GPU_TIMESTAMP();
sdmaInfo->dep_signal = 1;
return sdmaIndex;
}
inline __device__ uint64_t waitSdmaTaskComplete(struct sdmaQueueInfo *sdmaQueue, int sdmaIndex)
{
if (sdmaIndex < 0) {
PRINT_ERR("waitSdmaTaskComplete bid:%d sdmaIndex:%d invalid sdma index \n", (int)blockIdx.x, sdmaIndex);
return 0;
}
#ifdef GC_COPY_DATA
return 0;
#endif
volatile hsa_sdma_info_t *sdmaInfo = &sdmaQueue->sdmaInfo[sdmaIndex];
while (sdmaInfo->completion_signal) {
__builtin_amdgcn_s_sleep(1);
}
__asm__ __volatile__("s_wakeup");
return sdmaInfo->end_ts - sdmaInfo->start_ts;
}
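// Typical usage from thread 0 of a block (as in genericOp below): submit the copy, overlap other
// work, then wait for completion and read back the SDMA duration in timestamp ticks.
//   int idx = startSdmaTask<T, sizeof(T)>(&ncclShmem.channel.sdmaQueue, ringIx, srcAddr, dstAddr, bytes);
//   /* ... overlap reduceCopy or other work ... */
//   uint64_t sdmaTicks = waitSdmaTaskComplete(&ncclShmem.channel.sdmaQueue, idx);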
#endif
template<typename T, typename RedOp, typename Fan, int Direct,
int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts>
class Primitives<
T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll, MultimemSrcs, MultimemDsts>, P2p
> {
static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
static constexpr int Input=0, Output=1;
static constexpr int RoleInput = 0x01,
RoleOutput = 0x02,
RoleWaitRecv = 0x04,
RoleWaitSend = 0x08,
RolePostSend = 0x10,
RolePostRecv = 0x20,
Aborted = 0x40,
OffsFifoEnabled = 0x80,
SizesFifoEnabled = 0x100,
DirectWrite = 0x200,
DirectRead = 0x400,
ThreadsSynced = 0x800,
NvlsMinPolling = 0x1000;
const int tid, tidInBlock;
const int nthreads;
int nworkers;
const int stepSize;
Fan fan;
int index; // Peer index I'm responsible for
int flags;
int group;
uint64_t step;
int *connOffsFifoPtr; // (flags & OffsFifoEnabled)
union {
T *userBuff; // (flags & (RoleInput|RoleOutput))
T *connEltsFifo; // !(flags & (RoleInput|RoleOutput))
};
union {
int volatile *connSizesFifoPtr; // (flags & SizesFifoEnabled)
T *directBuff; // !(flags & SizesFifoEnabled)
};
uint64_t *connStepPtr;
uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
uint64_t* barriers;
uint64_t* barrier_next;
uint32_t* next_hdp_reg;
#ifdef HYGON_SDMA_FEATURE
public:
uint32_t ringIx;
uint32_t useSdmaCopy;
uint32_t sdmaMinCopySize;
uint32_t sdmaCountEnabe;
uint32_t sdmaCopyCount;
uint32_t allCopyCount;
private:
#endif
#if defined(ENABLE_NPKIT)
public:
int npKitCtxIdx = 0;
uint64_t npKitDataProcessEntryTime = 0;
uint64_t npKitDataProcessExitTime = 0;
uint64_t npKitDataProcessTotalTime = 0;
private:
#endif
// Don't use barrier 0 as it's used by the final sync
inline __device__ void barrier() {
flags |= ThreadsSynced;
if (nthreads == WARP_SIZE)
__syncwarp();
else
barrier_by_group();
}
inline __device__ void subBarrier() {
barrier();
}
inline __device__ bool checkAbort(int &spins) {
spins++;
if (!(flags & Aborted) && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
if (atomicAdd_system((unsigned int *)ncclShmem.comm.abortFlag, 0)) {
flags |= Aborted;
ncclShmem.aborted = 1;
}
spins = 0;
}
return flags & Aborted;
}
inline __device__ uint64_t loadStepValue(uint64_t* ptr) {
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
if (flags & NvlsMinPolling) {
uint64_t ans;
asm("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
return ans;
}
#endif
// volatile is faster than acquire but not as correct. Make sure reduceCopy
// loads data using volatile so it doesn't see stale data in L1.
#ifdef __GFX9__
return atomicAdd((unsigned long long *)ptr, 0);
#else
return __atomic_load_n(ptr, __ATOMIC_SEQ_CST);
#endif
}
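// waitPeer: threads holding a WaitRecv/WaitSend role spin on the peer's step counter until the
// next StepPerSlice steps are available (data arrived for recv, credits returned for send), then
// publish this slice's src/dst pointer into ncclShmem.groups[group] for the workers and advance
// the local step.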
template <int DirectRecv, int DirectSend, int Recv, int Send, int Src, int Dst>
__device__ __forceinline__ void waitPeer(intptr_t srcIx, intptr_t dstIx, int offset, int nelts) {
const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
const bool noRecvWait = DirectRecv && Src && (flags & DirectRead); // no wait when directly reading from remote input
const bool noSendWait = DirectSend && (flags & (DirectRead|DirectWrite)); // no wait in empty send (e.g. directScatter) or direct remote write
if (((flags & (Recv*RoleWaitRecv)) && !noRecvWait) ||
((flags & (Send*RoleWaitSend)) && !noSendWait)) {
int spins = 0;
while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) {
__builtin_amdgcn_s_sleep(1);
connStepCache = loadStepValue(connStepPtr);
if (checkAbort(spins)) break;
//if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem.comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
if (spins == 0) traceData(__LINE__, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
}
__asm__ __volatile__("s_wakeup");
}
if (flags & (Recv*RoleWaitRecv | Send*RoleWaitSend)) {
if (isSendNotRecv && (flags & SizesFifoEnabled))
__atomic_store_n(connSizesFifoPtr+step%NCCL_STEPS, nelts*sizeof(T), __ATOMIC_SEQ_CST);
void **ptrs = isSendNotRecv ? (ncclShmem.groups[group].dsts + Dst)
: (ncclShmem.groups[group].srcs + Src);
if (flags & OffsFifoEnabled)
ptrs[index] = connEltsFifo + loadInt(connOffsFifoPtr + (step%NCCL_STEPS))/sizeof(T);
else if (isSendNotRecv && DirectSend) {
if (flags & DirectWrite) {
ptrs[index] = directBuff + dstIx + offset;
} else if (flags & DirectRead) { // empty send
ptrs[index] = nullptr;
} else {
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
}
} else if (!isSendNotRecv && DirectRecv) {
if (flags & DirectRead) {
ptrs[index] = directBuff + srcIx + offset;
} else if (flags & DirectWrite) {
ptrs[index] = directBuff + dstIx + offset; // send to next from my output buffer
} else {
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
}
}
else {
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
}
step += StepPerSlice;
}
}
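// postPeer: once a slice has been produced/consumed, order the data ahead of the step update
// (cache writeback/invalidate on gfx9, a system fence otherwise), optionally flush the peer's
// HDP register, then advance and publish the step counter so the peer can make progress.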
template<int Recv, int Send>
inline __device__ void postPeer(bool dataStored) {
if (Send && (flags & RolePostSend) && dataStored)
#ifdef __GFX9__
__builtin_amdgcn_buffer_wbinvl1();
#else
__threadfence_system();
#endif
if ((flags & Send*RolePostSend) && next_hdp_reg)
STORE((unsigned int *)next_hdp_reg, 0x1);
if (flags & (Recv*RolePostRecv | Send*RolePostSend)) {
step += StepPerSlice;
STORE(connStepPtr, step);
}
}
template <int DirectRecv1, int DirectSend1, int Recv, int Send, int SrcBuf, int DstBuf>
__device__ __forceinline__ void genericOp(
intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp
) {
constexpr int DirectRecv = 1 && Direct && DirectRecv1;
constexpr int DirectSend = 1 && Direct && DirectSend1;
constexpr int Src = SrcBuf != -1;
constexpr int Dst = DstBuf != -1;
nelem = nelem < 0 ? 0 : nelem;
int sliceSize = stepSize*StepPerSlice;
sliceSize = max(divUp(nelem, 16*SlicePerChunk)*16, sliceSize/32);
int slice = 0;
int offset = 0;
#ifdef HYGON_SDMA_FEATURE
uint64_t srcSdmaAddr = 0;
uint64_t dstSdmaAddr = 0;
int sendToNextRankMode = SDMA_TRANS_MODE_NON;
int needSdmaCopy = 0;
if (useSdmaCopy) {
if (Send == 1 && fan.nsend() == 1) {
if (Src && Recv) {
sendToNextRankMode = SDMA_TRANS_MODE_REDUCE_SEND; // 1
} else if (Src && !Recv) {
sendToNextRankMode = SDMA_TRANS_MODE_SRC_SEND; // 2
} else if (!Src && Recv) {
sendToNextRankMode = SDMA_TRANS_MODE_RECV_SEND; // 3
}
if (sendToNextRankMode) needSdmaCopy = 1;
}
}
#endif
PRINT_DEBUG("genericOp-1- ringIx:%d bid:%d sliceSize:%d nelem:%d SlicePerChunk:%d stepSize:%d StepPerSlice:%d slicesize0:%d max(val1:%d val2:%d) mode:%d send:%d %d sizeofT:%d\n",
ringIx, (int)blockIdx.x, sliceSize*sizeof(T), nelem*sizeof(T), SlicePerChunk, stepSize*sizeof(T), StepPerSlice, stepSize*StepPerSlice*sizeof(T),
divUp(nelem, 16*SlicePerChunk)*16*sizeof(T), sizeof(T)*stepSize*StepPerSlice/32, sendToNextRankMode, Send, MaxSend, sizeof(T));
if (tid < nworkers && offset < nelem) {
// Worker-only loop for non-empty slices. Non-workers and empty slices are
// processed in the loop following this if block. The benefit of splitting
// the loop like this is we pull two branches out of the critical path.
// Using "number of branch insns (taken or not) encountered dynamically"
// as the performance metric, then:
// perf_orig = 2*numslices
// perf_new = 2+numslices
// So the new code and old code behave the same for numslices=2, and for
// numslices>2 the new code is superior. And note that in the case
// numslices=1, the loop is trivially unrollable (single iteration) so we
// don't incur that tail branch and we still have perf_new=2.
//
// ORIGINAL CODE:
// unrolled for(slices) {
// if(worker) { // This branch removed
// wait();
// subBarrier();
// if(slice not empty) // This branch removed
// ReduceCopyMulti();
// }
// barrier();
// post();
// } // Since we no longer unroll, new branch added here
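// For example, with numslices=4 the original code executes 2*4 = 8 dynamic branches per chunk
// while this split executes 2+4 = 6.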
#pragma unroll 1
do {
sliceSize = sliceSize < nelem-offset ? sliceSize : nelem-offset;
if (Src && (flags & (SrcBuf==Input ? RoleInput : RoleOutput)))
ncclShmem.groups[group].srcs[0] = userBuff + srcIx + offset;
if (Dst && (flags & (DstBuf==Input ? RoleInput : RoleOutput)))
ncclShmem.groups[group].dsts[0] = userBuff + dstIx + offset;
waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(srcIx, dstIx, offset, sliceSize);
subBarrier();
/* If the user aborts the kernel, we don't need to actually perform the copy/reduce; just
 * set the size to 0 to avoid unnecessary work. */
int workSize = ncclShmem.aborted ? 0 : sliceSize;
#ifdef HYGON_SDMA_FEATURE
if (tid == 0 && sdmaCountEnabe) allCopyCount++;
if (needSdmaCopy && workSize*sizeof(T) < sdmaMinCopySize) {
sendToNextRankMode = 0;
needSdmaCopy = 0;
PRINT_DEBUG("genericOp-sdma- ringIx:%d bid:%d workSize:%d minCopySize:%d\n",
ringIx, (int)blockIdx.x, workSize*sizeof(T), sdmaMinCopySize);
}
if (tid == 0 && sendToNextRankMode && useSdmaCopy) {
// The SDMA copy source is the Src or Recv address: with only Src, Src occupies srcs[0]; with only Recv, Recv occupies srcs[0]; with both, Src occupies srcs[0] and Recv occupies srcs[1].
srcSdmaAddr = (uint64_t)ncclShmem.groups[group].srcs[Src*Recv];
// The SDMA copy destination is the Send address: with only Dst, Dst occupies dsts[0]; with only Send, Send occupies dsts[0]; with both, Dst occupies dsts[0] and Send occupies dsts[1].
dstSdmaAddr = (uint64_t)ncclShmem.groups[group].dsts[Dst];
if (sendToNextRankMode == SDMA_TRANS_MODE_REDUCE_SEND) {
// With both Src and Recv, redirect the reduce result to srcs[1], which then serves as the source address for the SDMA copy.
ncclShmem.groups[group].dsts[Dst] = ncclShmem.groups[group].srcs[Src*Recv];
}
}
if (useSdmaCopy) subBarrier();
#endif
if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
if (Send) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
if (tid == 0) {
npKitDataProcessEntryTime = NPKIT_GET_GPU_TIMESTAMP();
}
#endif
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, MaxSend, /*PreOpSrcs*/0>
(tid, nworkers, /*redArg*/0, /*preOpArgs*/nullptr, /*postOp*/false,
1, ncclShmem.groups[group].srcs,
fan.nsend(), ncclShmem.groups[group].dsts+1,
workSize);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
if (tid == 0) {
npKitDataProcessExitTime = NPKIT_GET_GPU_TIMESTAMP();
npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime;
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
} else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem.groups[group].dsts[Dst] == nullptr) {
// For broadcast in CollNet to do empty send
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
if (tid == 0) {
npKitDataProcessEntryTime = NPKIT_GET_GPU_TIMESTAMP();
}
#endif
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, 1, /*PreOpSrcs*/0>
(tid, nworkers, ncclShmem.redOpArgs[0], nullptr, postOp,
Recv, ncclShmem.groups[group].srcs,
Dst, ncclShmem.groups[group].dsts,
workSize);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
if (tid == 0) {
npKitDataProcessExitTime = NPKIT_GET_GPU_TIMESTAMP();
npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime;
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
} else {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
if (tid == 0) {
npKitDataProcessEntryTime = NPKIT_GET_GPU_TIMESTAMP();
}
#endif
constexpr int PreOpSrcs = SrcBuf != Input ? 0 :
DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1;
PRINT_DEBUG("genericOp-4-ringIx:%d bid:%d MaxRecv:%d MaxSend:%d PreOpSrcs:%d nworkers:%d Recv:%d fan.nrecv:%d Src:%d Send:%d fan.nsend:%d Dst:%d "
"workSize:%d group:%d Unroll:%d mode:%d sdma:%d sq:%p src:0x%lx dst:0x%lx mins:%d\n",
ringIx, (int)blockIdx.x, MaxRecv, MaxSend, PreOpSrcs, nworkers, Recv, fan.nrecv(), Src, Send, fan.nsend(), Dst, workSize*sizeof(T), group, Unroll,
sendToNextRankMode, useSdmaCopy, ncclShmem.channel.sdmaQueue.sdmaInfo, srcSdmaAddr, dstSdmaAddr, sdmaMinCopySize);
#ifdef HYGON_SDMA_FEATURE
if (sendToNextRankMode <= SDMA_TRANS_MODE_REDUCE_SEND) {
reduceCopy<Unroll, RedOp, T,
MultimemSrcs, Recv+Src, Recv*MaxRecv+Src,
MultimemDsts, Send+Dst, Send*MaxSend+Dst, PreOpSrcs>
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp,
Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs,
Send*fan.nsend()+Dst, ncclShmem.groups[group].dsts,
workSize);
} else if (Dst) {
uint32_t sdmaIndex;
uint64_t delta_ts;
if (tid == 0) {
NPKIT_SET_GPU_EVENT(NPKIT_EVENT_PRIM_SIMPLE_SDMA_COPY_PAL_ENTRY, workSize*sizeof(T), 0);
sdmaIndex = startSdmaTask<T, sizeof(T)>(&ncclShmem.channel.sdmaQueue, ringIx, srcSdmaAddr, dstSdmaAddr, workSize * sizeof(T));
}
reduceCopy<Unroll, RedOp, T,
MultimemSrcs, Recv+Src, Recv*MaxRecv+Src,
MultimemDsts, SDMA_SPEC_DST, Dst, PreOpSrcs>
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp,
Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs,
Dst, ncclShmem.groups[group].dsts,
workSize);
needSdmaCopy = 0;
if (tid == 0) {
delta_ts = waitSdmaTaskComplete(&ncclShmem.channel.sdmaQueue, sdmaIndex);
#if defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_SDMA_COST)
NPKIT_SET_GPU_EVENT_TM(NPKIT_EVENT_PRIM_SIMPLE_SDMA_COST_ENTRY, workSize * sizeof(T), 0, ncclShmem.channel.sdmaQueue.sdmaInfo[sdmaIndex].start_ts);
NPKIT_SET_GPU_EVENT_TM(NPKIT_EVENT_PRIM_SIMPLE_SDMA_COST_EXIT, workSize * sizeof(T), 0, ncclShmem.channel.sdmaQueue.sdmaInfo[sdmaIndex].end_ts);
#endif
NPKIT_SET_GPU_EVENT(NPKIT_EVENT_PRIM_SIMPLE_SDMA_COPY_PAL_EXIT, workSize*sizeof(T), delta_ts);
if (sdmaCountEnabe) sdmaCopyCount++;
}
}
#else
reduceCopy<Unroll, RedOp, T,
MultimemSrcs, Recv+Src, Recv*MaxRecv+Src,
MultimemDsts, Send+Dst, Send*MaxSend+Dst, PreOpSrcs>
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp,
Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs,
Send*fan.nsend()+Dst, ncclShmem.groups[group].dsts,
workSize);
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
if (tid == 0) {
npKitDataProcessExitTime = NPKIT_GET_GPU_TIMESTAMP();
npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime;
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
barrier(); // This barrier has a counterpart in following loop
#ifdef HYGON_SDMA_FEATURE
if (tid == 0 && sendToNextRankMode && useSdmaCopy && needSdmaCopy) {
PRINT_DEBUG("genericOp-5-ringIx:%d bid:%d MaxRecv:%d MaxSend:%d Recv:%d fan.nrecv:%d Src:%d Send:%d fan.nsend:%d Dst:%d "
"workSize:%d mode:%d sdma:%d need:%d sq:%p src:0x%lx dst:0x%lx\n",
ringIx, (int)blockIdx.x, MaxRecv, MaxSend, Recv, fan.nrecv(), Src, Send, fan.nsend(), Dst, workSize*sizeof(T),
sendToNextRankMode, useSdmaCopy, needSdmaCopy, ncclShmem.channel.sdmaQueue.sdmaInfo, srcSdmaAddr, dstSdmaAddr);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
npKitDataProcessEntryTime = NPKIT_GET_GPU_TIMESTAMP();
#endif
NPKIT_SET_GPU_EVENT(NPKIT_EVENT_PRIM_SIMPLE_SDMA_COPY_ENTRY, workSize*sizeof(T), 0);
uint32_t sdmaIndex = startSdmaTask<T, sizeof(T)>(&ncclShmem.channel.sdmaQueue, ringIx, srcSdmaAddr, dstSdmaAddr, workSize * sizeof(T));
uint64_t delta_ts = waitSdmaTaskComplete(&ncclShmem.channel.sdmaQueue, sdmaIndex);
#if defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_SDMA_COST)
NPKIT_SET_GPU_EVENT_TM(NPKIT_EVENT_PRIM_SIMPLE_SDMA_COST_ENTRY, workSize * sizeof(T), 0, ncclShmem.channel.sdmaQueue.sdmaInfo[sdmaIndex].start_ts);
NPKIT_SET_GPU_EVENT_TM(NPKIT_EVENT_PRIM_SIMPLE_SDMA_COST_EXIT, workSize * sizeof(T), 0, ncclShmem.channel.sdmaQueue.sdmaInfo[sdmaIndex].end_ts);
#endif
NPKIT_SET_GPU_EVENT(NPKIT_EVENT_PRIM_SIMPLE_SDMA_COPY_EXIT, workSize*sizeof(T), delta_ts);
if (sdmaCountEnabe) sdmaCopyCount++;
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
npKitDataProcessExitTime = NPKIT_GET_GPU_TIMESTAMP();
npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime;
#endif
}
if (useSdmaCopy) barrier();
#endif
postPeer<Recv, Send>(0 < sliceSize);
offset += sliceSize;
slice += 1;
} while (slice < SlicePerChunk && offset < nelem);
}
// Non-workers come straight here. Workers too but only once the remaining
// slices are all empty. Since empty slices are the uncommon case, and
// worker perf is the limiter, perf-wise this loop is effectively unentered,
// hence just a single branch insn.
#pragma unroll 1
while (slice < SlicePerChunk) {
sliceSize = sliceSize < nelem-offset ? sliceSize : nelem-offset;
{ // Only workers could have Wait roles so we know the slice must be empty
// since we've exited the loop above.
waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(0, 0, 0, 0);
}
barrier(); // Has counterpart in preceding worker-only loop.
postPeer<Recv, Send>(0 < sliceSize);
offset += sliceSize;
slice += 1;
}
}
template <int REDUCE, int COPY, int MULTISRCS, int MULTIDSTS>
__device__ __forceinline__ void mscclGenericOp(T** srcs, int nsrcs, T** dsts, int ndsts, int nelem) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_GENERIC_OP_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_GENERIC_OP_ENTRY, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
nelem = nelem < 0 ? 0 : nelem;
if (tid < nworkers) {
if (REDUCE){
srcs[nsrcs] = dsts[0];
nsrcs++;
if (MULTISRCS){
reduceCopy<Unroll, RedOp, T, 0, 3, MSCCL_MAX_REDUCE_FUSION, 0, 1, 1, 0>
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, nsrcs, (void **)srcs, 1, (void **)dsts, nelem);
} else {
reduceCopy<Unroll, RedOp, T, 0, 2, 2, 0, 1, 1, 0>
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 2, (void **)srcs, 1, (void **)dsts, nelem);
}
}
if (COPY){
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, 1, 0>
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, (void **)srcs, 1, (void **)dsts, nelem);
if (MULTISRCS) {
for (int i = 1; i < nsrcs; i++){
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, 1, 0>
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, (void **)&srcs[i], 1, (void **)&dsts[i], nelem);
}
}
}
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_GENERIC_OP_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_GENERIC_OP_EXIT, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
barrier();
}
// Scatter/Gather generic op
// skip: my own rank order in the buffer chunks
// shift: peer offset to avoid all ranks sending to or receiving from same peer
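// For example, in the send loop below, peer i's chunk starts at i*peerOffset in my buffer; peers
// at or after 'skip' are offset by one extra peerElem because the chunk at position 'skip' is the
// caller's own and is not transferred, and shift (e.g. derived from the rank) rotates which peer
// each rank touches first.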
template <int DirectRecv1, int DirectSend1, int Recv, int Send>
__device__ __forceinline__ void
ScatterGatherOp(intptr_t inpIx, intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift, bool postOp) {
constexpr int DirectRecv = 1 && Direct && DirectRecv1;
constexpr int DirectSend = 1 && Direct && DirectSend1;
int offset = 0; // slice offset
int sliceSize = stepSize*StepPerSlice;
int dataSize = max(DIVUP(peerElem, 16*SlicePerChunk)*16, sliceSize/32); // per-peer slice size
#pragma unroll 1
for (int slice=0; slice<SlicePerChunk; ++slice) {
int realSize = max(0, min(dataSize, peerElem-offset));
bool fenceNeeded = false;
if (tid < nworkers) {
if (Send) {
// Scatter pre-scales data of input buffer only in non-Direct case
constexpr int PreOpSrcs = DirectSend ? 0 : 1;
if (flags & RoleInput) ncclShmem.groups[group].srcs[0] = userBuff + inpIx + offset;
// realSize is not accurate here; but intra-node does not rely on sizes FIFO
waitPeer<0, DirectSend, 0, 1, 1, 0>(0, inpIx, offset, realSize);
subBarrier();
#pragma unroll 1
// Loop over peers
for (int j=0; j<fan.nsend(); j++) {
int i = (j+shift)%fan.nsend();
int pOffset = i*peerOffset;
// Skip the data I am responsible for reducing myself
if (skip >= 0 && i >= skip) pOffset += peerElem;
void* src0 = (T*)ncclShmem.groups[group].srcs[0] + pOffset;
int realPeerSize = min(realSize, totalElem-pOffset);
if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) {
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, 1, PreOpSrcs>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, &src0, 1, ncclShmem.groups[group].dsts+i, realPeerSize);
// Mark for threadfence at the end
fenceNeeded |= true;
}
}
} else if (Recv) {
if (flags & RoleOutput) ncclShmem.groups[group].dsts[0] = userBuff + outIx + offset;
int pOffset = index*peerOffset;
if (skip >= 0 && index >= skip) pOffset += peerElem;
// Adjust remote index with peer offset in case we are directly pulling from peer's output buffer
waitPeer<DirectRecv, 0, 1, 0, 0, 1>(outIx, outIx+pOffset, offset, realSize);
subBarrier();
#pragma unroll 1
for (int j=0; j<fan.nrecv(); j++) {
int i = (j+shift)%fan.nrecv();
pOffset = i*peerOffset;
if (skip >= 0 && i >= skip) pOffset += peerElem;
void* dst0 = (T*)ncclShmem.groups[group].dsts[0] + pOffset;
int realPeerSize = min(realSize, totalElem-pOffset);
if (DirectRecv && ncclShmem.groups[group].srcs[i] == dst0) realPeerSize = 0;
if (realPeerSize > 0) reduceCopy<Unroll, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/0>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, 1, ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize);
}
}
}
fenceNeeded = __any(fenceNeeded);
postPeer<Recv, Send>(fenceNeeded);
offset += realSize;
}
}
__device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) {
if (flags & (RoleWaitRecv|RolePostRecv)) {
auto *conn = &peer->recv[connIndex];
step = conn->step;
step = roundUp(step, SlicePerChunk*StepPerSlice);
if (flags & RolePostRecv) {
connStepPtr = conn->head;
STORE(connStepPtr, step); // Return credits in case we rounded up.
}
if (flags & RoleWaitRecv) {
ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
connStepPtr = conn->tail;
connStepCache = loadStepValue(connStepPtr);
flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
if (Direct) {
// User buffers have been registered
if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce uses direct pull
} else {
flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
(e->direct & NCCL_DIRECT_READ) ? DirectRead : 0;
}
} else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce uses direct pull
} else {
// direct read not allowed in non-register case
// otherwise, in one-to-multi send, we could mix empty send and intermediate send
flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
}
}
}
if (flags & OffsFifoEnabled)
connOffsFifoPtr = conn->offsFifo;
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
}
}
}
__device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) {
if (flags & (RoleWaitSend|RolePostSend)) {
auto *conn = &peer->send[connIndex];
step = conn->step;
step = roundUp(step, SlicePerChunk*StepPerSlice);
if (flags & RolePostSend) {
connStepPtr = conn->tail;
next_hdp_reg = conn->next_hdp_reg;
}
if (flags & RoleWaitSend) {
ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
connStepPtr = conn->head;
connStepCache = loadStepValue(connStepPtr);
flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
if (flags & OffsFifoEnabled)
connOffsFifoPtr = conn->offsFifo;
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
if (conn->sizesFifo != nullptr) {
flags |= SizesFifoEnabled;
connSizesFifoPtr = conn->sizesFifo;
} else if (Direct) {
// User buffers have been registered
if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce uses direct pull
} else {
flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
(e->direct & NCCL_DIRECT_READ) ? DirectRead : 0;
}
} else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce uses direct pull
} else {
// direct read not allowed in non-register case
// otherwise, in one-to-multi send, we could mix empty send and intermediate send
flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
}
}
}
}
}
}
public:
__forceinline__ __device__ Primitives(
int tid, int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr
):
tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group),
stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) {
// For send operations, we need an extra warp to overlap the threadfence and the copy
barriers = &ncclShmem.groups[group].barrier;
barrier_next = ncclShmem.groups[group].barrier_next;
this->nworkers = nthreads;
int nrecv=0, nsend=0;
while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++;
while (nsend < MaxSend && sendPeers[nsend] != -1) nsend++;
this->fan = Fan(nrecv, nsend);
constexpr int ThreadPerSync = 8;
static_assert(MaxSend <= ThreadPerSync && MaxRecv <= ThreadPerSync, "Not enough threads to cover all peers");
int g = tid / ThreadPerSync;
int ng = nthreads / ThreadPerSync;
index = tid % ThreadPerSync;
flags = 0;
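// Roles are assigned per sync-group of ThreadPerSync threads: group 0 holds the WaitRecv roles
// (with thread 'nrecv' owning the Input pointer), group 1 the WaitSend roles (with thread 'nsend'
// owning the Output pointer), and the last two groups the PostRecv and PostSend roles.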
if (g == 0) {
if (index < nrecv) flags |= RoleWaitRecv;
if (index == nrecv) flags |= RoleInput;
} else if (g == 1) {
if (index < nsend) flags |= RoleWaitSend;
if (index == nsend) flags |= RoleOutput;
} else if (g == ng - 2) {
if (index < nrecv) flags |= RolePostRecv;
} else if (g == ng - 1) {
if (index < nsend) flags |= RolePostSend;
}
int peer = 0;
if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index];
loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e);
loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e);
setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e);
}
__forceinline__ __device__ ~Primitives() {
// Ensure ncclShmem.groups[].send/recvConns are available
if (!(flags & ThreadsSynced))
barrier();
// Save steps for the next operation
if (flags & (RolePostSend|RolePostRecv)) {
auto *conns = (flags & RolePostSend) ? ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns;
conns[index]->step = step;
}
// Make sure all threads are done writing back conn->step and done using
// ncclShmem.groups[group]
barrier();
}
__device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkElemReg* e) {
if (flags & RoleInput) {
userBuff = (T*)inputBuf;
ncclShmem.redOpArgs[0] = redOpArg; // scaler for local input
}
if (flags & RoleOutput) userBuff = (T*)outputBuf;
bool recvProvider = flags == (flags|RoleWaitRecv|DirectWrite);
bool sendAcceptor = flags == (flags|RoleWaitSend|DirectWrite);
bool sendProvider = flags == (flags|RoleWaitSend|DirectRead); // sender provides direct buffer (to be fetched)
bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead); // receiver accepts direct buffer
int regUsed = e != nullptr ? e->elem.regUsed : 0;
if (Direct && recvProvider) {
int spins = 0;
void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
// Wait for consumer to consume previous value before trampling it.
while ((void *)atomicAdd((unsigned long long *) slot,0) != nullptr && !checkAbort(spins));
directBuff = (T*)outputBuf;
// Encode pointer by XOR'ing against some address they definitely wouldn't send
// since we want to allow them sending us nullptr while not colliding with
// the empty slot value.
*slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
}
if (Direct && sendAcceptor) {
int spins = 0;
void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
void *ptr;
while (true) {
ptr = (void *)atomicAdd((unsigned long long *) slot,0);
if (ptr != nullptr || checkAbort(spins)) break;
}
directBuff = regUsed ? (T*)(e->dnOutputs[index]) :
reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(ptr) ^ reinterpret_cast<uintptr_t>(slot));
*slot = nullptr;
}
if (Direct && sendProvider) {
int spins = 0;
void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange;
volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange+1;
// Wait for consumer to consume previous value before trampling it.
while (((void *)atomicAdd((unsigned long long *) slot,0) != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins));
// If there is no recv, then we are directly pulling from input buffer (e.g. directScatter)
// Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend)
directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf;
// Exchange pre-scalers for use in direct pull
*argSlot0 = (uint64_t(1)<<32) | (uint32_t)redOpArg;
*argSlot1 = (uint64_t(1)<<32) | (uint32_t)(redOpArg>>32);
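// Each 32-bit half of redOpArg is tagged with a nonzero marker in the upper word so the peer can
// tell a written slot from the 0 reset value; the receiver (recvAcceptor path) reassembles the
// original 64-bit argument from the two low words.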
// Encode pointer by XOR'ing against some address they definitely wouldn't send
// since we want to allow them sending us nullptr while not colliding with
// the empty slot value.
*slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
}
if (Direct && recvAcceptor) {
int spins = 0;
void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
volatile uint64_t* argSlot0 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange;
volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange+1;
void *ptr;
while (true) {
ptr = (void *)atomicAdd((unsigned long long *) slot,0);
if (ptr != nullptr || checkAbort(spins)) break;
}
directBuff = regUsed ? (T*)(MaxSend == 0 ? e->upOutputs[index] : e->dnInputs[index]) :
reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(ptr) ^ reinterpret_cast<uintptr_t>(slot));
if (MaxSend != 0) { // reduce group rather than gather group
// Store scalers for remote inputs
uint64_t arg0, arg1;
while (true) {
arg0 = *argSlot0;
arg1 = *argSlot1;
if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break;
}
ncclShmem.redOpArgs[1+index] = ((arg1 & 0xffffffff)<<32) | (arg0 & 0xffffffff);
}
*argSlot0 = 0; *argSlot1 = 0;
*slot = nullptr;
}
}
__device__ void moveDataPtrs(intptr_t delta) {
if (flags & (RoleInput|RoleOutput))
userBuff += delta;
}
// Set MSCCL data pointers
__device__ __forceinline__ void setDataPtrs(void const *inputBuf, void *outputBuf) {
if (flags & RoleInput) userBuff = (T*)inputBuf;
if (flags & RoleOutput) userBuff = (T*)outputBuf;
}
__device__ __forceinline__ void send(intptr_t inpIx, int eltN) {
genericOp<0, 0, 0, 1, Input, -1>(inpIx, -1, eltN, false);
}
__device__ __forceinline__ void sendFromOutput(intptr_t outIx, int eltN) {
genericOp<0, 0, 0, 1, Output, -1>(outIx, -1, eltN, false);
}
__device__ __forceinline__ void directSend(intptr_t inpIx, intptr_t outIx, int eltN) {
genericOp<0, 1, 0, 1, Input, -1>(inpIx, outIx, eltN, false);
}
__device__ __forceinline__ void directSendFromOutput(intptr_t outIx, int eltN) {
genericOp<0, 1, 0, 1, Output, -1>(outIx, outIx, eltN, false);
}
__device__ __forceinline__ void recv(intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, eltN, postOp);
}
__device__ __forceinline__ void directRecv(intptr_t outIx, int eltN) {
genericOp<1, 0, 1, 0, -1, Output>(-1, outIx, eltN, /*postOp=*/false);
}
__device__ __forceinline__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 0, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 1, 0, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void recvSend(int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, -1, -1>(-1, -1, eltN, postOp);
}
__device__ __forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
}
__device__ __forceinline__ void directRecvCopySend(intptr_t outIx, int eltN) {
genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, false);
}
__device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
}
__device__ __forceinline__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 0, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void recvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp);
}
__device__ __forceinline__ void directRecvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) {
genericOp<1, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp);
}
__device__ __forceinline__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
// Direct is only for the send part
genericOp<0, 1, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void
scatter(intptr_t inpIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
ScatterGatherOp<0, 0, 0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
}
__device__ __forceinline__ void
directScatter(intptr_t inpIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
ScatterGatherOp<0, 1, 0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
}
__device__ __forceinline__ void
gather(intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift, bool postOp=false) {
ScatterGatherOp<0, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, postOp);
}
__device__ __forceinline__ void
directGather(intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
}
// MSCCL primitives
__device__ __forceinline__ void sendWithBarrier(intptr_t inpIx, int eltN) {
send(inpIx, eltN);
}
__device__ __forceinline__ void localCopy(T* srcs, T* dsts, int eltN) {
return mscclGenericOp<0,1,0,0>(&srcs, 1, &dsts, 1, eltN);
}
__device__ __forceinline__ void reduce(T** srcs, int nsrcs, T** dsts, int ndsts, int eltN) {
if (nsrcs == 1) {
return mscclGenericOp<1,0,0,0>(srcs, 1, dsts, 1, eltN);
} else {
return mscclGenericOp<1,0,1,0>(srcs, nsrcs, dsts, 1, eltN);
}
}
};
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
/*This file is now generated in CMake*/
// #include "reduce.h"
// #include "common.h"
// #include "collectives.h"
// IMPL_COLL_R(Reduce);
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "devcomm.h"
#include "collectives.h"
#include "primitives.h"
namespace {
template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runRing(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
#endif
const int tid = threadIdx.x;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem.channel.ring;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCE_CHUNKSTEPS : 1));
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
const int nranks = ncclShmem.comm.nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->count;
const int rank = ncclShmem.comm.rank;
const int prevRank = ring->userRanks[nranks-1];
const int root = args->root;
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, 0, args->connIndex, args->connIndex);
#ifdef HYGON_SDMA_FEATURE
prims.ringIx = ring->index;
INIT_PRIMS_SDMA(prims, args);
#endif
auto calcChunkSize = [&]__device__(ssize_t gridOffset)->int {
int realChunkSize;
if (Proto::Id == NCCL_PROTO_SIMPLE) {
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels));
realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
}
else if (Proto::Id == NCCL_PROTO_LL)
realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
else if (Proto::Id == NCCL_PROTO_LL128)
realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize);
return realChunkSize;
};
if (prevRank == root) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = calcChunkSize(gridOffset);
ssize_t offset = gridOffset + bid*realChunkSize;
int nelem = min(realChunkSize, size-offset);
prims.send(offset, nelem);
}
}
else if (rank == root) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = calcChunkSize(gridOffset);
ssize_t offset = gridOffset + bid*realChunkSize;
int nelem = min(realChunkSize, size-offset);
prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true);
}
}
else {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = calcChunkSize(gridOffset);
ssize_t offset = gridOffset + bid*realChunkSize;
int nelem = min(realChunkSize, size-offset);
prims.recvReduceSend(offset, nelem);
}
}
}
}
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
using Proto = ProtoSimple<REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS>;
runRing<T, RedOp, Proto>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL128>(args);
}
};
/*************************************************************************
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_REDUCE_KERNEL_H_
#define NCCL_REDUCE_KERNEL_H_
#include "op128.h"
#include <limits>
#include <type_traits>
////////////////////////////////////////////////////////////////////////////////
// The reduction function classes. All classes must:
// 1. Expose the `EltType` typedef.
// 2. Have constructor taking no arguments (default constructible).
// 3. Have constructor taking `uint64_t opArg`.
template<typename T>
struct FuncNull { using EltType = T; __device__ FuncNull(uint64_t opArg=0) {}; };
template<typename T>
struct FuncSum { using EltType = T; __device__ FuncSum(uint64_t opArg=0) {}; };
template<typename T>
struct FuncProd { using EltType = T; __device__ FuncProd(uint64_t opArg=0) {}; };
template<typename T>
struct FuncMin { using EltType = T; __device__ FuncMin(uint64_t opArg=0) {}; };
template<typename T>
struct FuncMax { using EltType = T; __device__ FuncMax(uint64_t opArg=0) {}; };
template<typename T> struct FuncPreMulSum;
template<typename T> struct FuncSumPostDiv;
////////////////////////////////////////////////////////////////////////////////
// Trait classes for reduction functions. Given a function (FuncSum, etc.)
// and a number of elements in a pack, will reduce, preOp, or postOp a pack
// of elements. These classes are intended to be specialized for specific
// combinations of reduction function and pack size.
template<typename Fn, int EltPerPack>
struct Apply_Reduce /*{
static BytePack<EltPerPack*sizeof(T)> reduce(
Fn fn, BytePack<EltPerPack*sizeof(T)> a, BytePack<EltPerPack*sizeof(T)> b
);
}*/;
template<typename Fn, int EltPerPack>
struct Apply_PreOp/*{
static constexpr bool IsIdentity;
static BytePack<EltPerPack*sizeof(T)> preOp(Fn fn, BytePack<EltPerPack*sizeof(T)> a);
}*/;
template<typename Fn, int EltPerPack>
struct Apply_PostOp/*{
static constexpr bool IsIdentity;
static BytePack<EltPerPack*sizeof(T)> postOp(Fn fn, BytePack<EltPerPack*sizeof(T)> a);
}*/;
template<typename Fn>
struct LoadMultimem_BigPackSize/*{
// If non-zero, then this and sizeof(T) are valid pack sizes for LoadMultimem,
// otherwise there are no valid pack sizes for LoadMultimem.
static constexpr int BigPackSize = 0;
}*/;
template<typename Fn, int BytePerPack>
struct Apply_LoadMultimem/*{
static BytePack<BytePerPack> load(Fn fn, uintptr_t addr);
}*/;
////////////////////////////////////////////////////////////////////////////////
// Public API for calling the trait classes. These take the data elements as a
// pack of any type, which could be a BytePack<?> or any integral type (uint64_t,
// uint32_t, etc.), and will return a new pack where each element has been
// transformed appropriately.
template<typename Fn, typename Pack>
__device__ __forceinline__ Pack applyReduce(Fn fn, Pack a, Pack b) {
return fromPack<Pack>(
Apply_Reduce<Fn, BytePackOf<Pack>::Size/sizeof(typename Fn::EltType)>
::reduce(fn, toPack(a), toPack(b))
);
}
template<typename Fn, typename Pack>
__device__ __forceinline__ Pack applyPreOp(Fn fn, Pack a) {
return fromPack<Pack>(
Apply_PreOp<Fn, BytePackOf<Pack>::Size/sizeof(typename Fn::EltType)>
::preOp(fn, toPack(a))
);
}
template<typename Fn, typename Pack>
__device__ __forceinline__ Pack applyPostOp(Fn fn, Pack a) {
return fromPack<Pack>(
Apply_PostOp<Fn, BytePackOf<Pack>::Size/sizeof(typename Fn::EltType)>
::postOp(fn, toPack(a))
);
}
template<typename Fn, int BytePerPack>
__device__ __forceinline__ BytePack<BytePerPack> applyLoadMultimem(Fn fn, uintptr_t addr) {
return Apply_LoadMultimem<Fn, BytePerPack>::load(fn, addr);
}
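// Illustrative example (not part of the API): summing four uint8_t lanes packed into a BytePack<4>
// resolves to the specialized Apply_Reduce<FuncSum<uint8_t>, 4> defined below:
//   BytePack<4> c = applyReduce(FuncSum<uint8_t>(), a, b);  // c.u8[k] = uint8_t(a.u8[k] + b.u8[k])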
////////////////////////////////////////////////////////////////////////////////
// Apply_Reduce
// Nonsensical base case
template<typename Fn>
struct Apply_Reduce<Fn, /*EltPerPack=*/0> {
__device__ static BytePack<0> reduce(Fn fn, BytePack<0> a, BytePack<0> b) {
return {};
}
};
// General recursive definition (EltPerPack > 1). This is how we iterate over
// all elements in a pack of any size, by breaking it into halves. Eventually
// we'll hit a base case (a more specific template specialization which takes
// precedence).
template<typename Fn, int EltPerPack>
struct Apply_Reduce {
template<int Size>
__device__ static BytePack<Size> reduce(Fn fn, BytePack<Size> a, BytePack<Size> b) {
a.half[0] = Apply_Reduce<Fn, EltPerPack/2>::reduce(fn, a.half[0], b.half[0]);
a.half[1] = Apply_Reduce<Fn, EltPerPack/2>::reduce(fn, a.half[1], b.half[1]);
return a;
}
};
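// For example, an 8-element pack of uint8_t is reduced as two BytePack<4> halves, each of which
// hits the specialized FuncSum<uint8_t>/EltPerPack=4 case below (or recurses down to the
// EltPerPack=1 base cases for other functions and types).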
// Base case definitions (EltPerPack == 1)
template<typename T>
struct Apply_Reduce<FuncNull<T>, /*EltPerPack=*/1> {
__device__ static BytePack<sizeof(T)> reduce(FuncNull<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
return a;
}
};
template<typename T>
struct Apply_Reduce<FuncSum<T>, /*EltPerPack=*/1> {
__device__ static BytePack<sizeof(T)> reduce(FuncSum<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
return toPack<T>(fromPack<T>(a) + fromPack<T>(b));
}
};
template<typename T>
struct Apply_Reduce<FuncProd<T>, /*EltPerPack=*/1> {
__device__ static BytePack<sizeof(T)> reduce(FuncProd<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
return toPack<T>(fromPack<T>(a) * fromPack<T>(b));
}
};
template<typename T>
struct Apply_Reduce<FuncMin<T>, /*EltPerPack=*/1> {
__device__ static BytePack<sizeof(T)> reduce(FuncMin<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
return toPack<T>(min(fromPack<T>(a), fromPack<T>(b)));
}
};
template<typename T>
struct Apply_Reduce<FuncMax<T>, /*EltPerPack=*/1> {
__device__ static BytePack<sizeof(T)> reduce(FuncMax<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
return toPack<T>(max(fromPack<T>(a), fromPack<T>(b)));
}
};
// Optimizations for specific types and element count combinations:
template<>
struct Apply_Reduce<FuncSum<uint8_t>, /*EltPerPack=*/4> {
__device__ static BytePack<4> reduce(FuncSum<uint8_t> fn, BytePack<4> a, BytePack<4> b) {
constexpr uint32_t lo = 0x00ff00ff;
constexpr uint32_t hi = ~lo;
uint32_t x = a.u32;
uint32_t y = b.u32;
a.u32 = (((x&lo) + (y&lo))&lo) + (((x&hi) + (y&hi))&hi);
return a;
}
};
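// The masks above implement a SIMD-within-a-register add: even bytes (selected by lo)
// and odd bytes (selected by hi) are summed in separate passes so that a carry out of
// one byte lands in a masked-off position instead of corrupting the neighboring byte.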
template<>
struct Apply_Reduce<FuncSum<int8_t>, /*EltPerPack=*/4> {
__device__ static BytePack<4> reduce(FuncSum<int8_t> fn, BytePack<4> a, BytePack<4> b) {
return Apply_Reduce<FuncSum<uint8_t>, 4>::reduce(FuncSum<uint8_t>(), a, b);
}
};
#if 300 <= __CUDA_ARCH__ && __CUDA_ARCH__ < 500
template<>
struct Apply_Reduce<FuncMin<uint8_t>, /*EltPerPack=*/4> {
__device__ static BytePack<4> reduce(FuncMin<uint8_t> fn, BytePack<4> a, BytePack<4> b) {
uint32_t z=0;
asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z));
return a;
}
};
template<>
struct Apply_Reduce<FuncMin<int8_t>, /*EltPerPack=*/4> {
__device__ static BytePack<4> reduce(FuncMin<int8_t> fn, BytePack<4> a, BytePack<4> b) {
int32_t z=0;
asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z));
return a;
}
};
template<>
struct Apply_Reduce<FuncMax<uint8_t>, /*EltPerPack=*/4> {
__device__ static BytePack<4> reduce(FuncMax<uint8_t> fn, BytePack<4> a, BytePack<4> b) {
uint32_t z=0;
asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z));
return a;
}
};
template<>
struct Apply_Reduce<FuncMax<int8_t>, /*EltPerPack=*/4> {
__device__ static BytePack<4> reduce(FuncMax<int8_t> fn, BytePack<4> a, BytePack<4> b) {
int32_t z=0;
asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z));
return a;
}
};
#endif
#define SPECIALIZE_REDUCE(Fn, T, EltPerPack, Vec, expr_of_x_y) \
template<> \
struct Apply_Reduce<Fn<T>, EltPerPack> { \
__device__ __forceinline__ static BytePack<sizeof(Vec)> reduce( \
Fn<T> fn, BytePack<sizeof(Vec)> a, BytePack<sizeof(Vec)> b \
) { \
Vec x = fromPack<Vec>(a); \
Vec y = fromPack<Vec>(b); \
return toPack<Vec>(expr_of_x_y); \
} \
};
#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
SPECIALIZE_REDUCE(FuncSum, half, 1, half, __hadd(x, y))
SPECIALIZE_REDUCE(FuncSum, half, 2, half2, __hadd2(x, y))
SPECIALIZE_REDUCE(FuncProd, half, 1, half, __hmul(x, y))
SPECIALIZE_REDUCE(FuncProd, half, 2, half2, __hmul2(x, y))
#else
SPECIALIZE_REDUCE(FuncSum, half, 1, half, __float2half(__half2float(x) + __half2float(y)))
SPECIALIZE_REDUCE(FuncProd, half, 1, half, __float2half(__half2float(x) * __half2float(y)))
#endif
#if __CUDA_ARCH__ >= 800
SPECIALIZE_REDUCE(FuncMin, half, 1, half, __hmin(x, y))
SPECIALIZE_REDUCE(FuncMin, half, 2, half2, __hmin2(x, y))
SPECIALIZE_REDUCE(FuncMax, half, 1, half, __hmax(x, y))
SPECIALIZE_REDUCE(FuncMax, half, 2, half2, __hmax2(x, y))
#else
SPECIALIZE_REDUCE(FuncMin, half, 1, half, __float2half(fminf(__half2float(x), __half2float(y))))
SPECIALIZE_REDUCE(FuncMax, half, 1, half, __float2half(fmaxf(__half2float(x), __half2float(y))))
#endif
#if defined(RCCL_BFLOAT16)
#if __CUDA_ARCH__ >= 800
SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 1, __nv_bfloat16, __hadd(x, y))
SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 2, __nv_bfloat162, __hadd2(x, y))
SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 1, __nv_bfloat16, __hmul(x, y))
SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 2, __nv_bfloat162, __hmul2(x, y))
SPECIALIZE_REDUCE(FuncMin, __nv_bfloat16, 1, __nv_bfloat16, __hmin(x, y))
SPECIALIZE_REDUCE(FuncMin, __nv_bfloat16, 2, __nv_bfloat162, __hmin2(x, y))
SPECIALIZE_REDUCE(FuncMax, __nv_bfloat16, 1, __nv_bfloat16, __hmax(x, y))
SPECIALIZE_REDUCE(FuncMax, __nv_bfloat16, 2, __nv_bfloat162, __hmax2(x, y))
#else
SPECIALIZE_REDUCE(FuncSum, rccl_bfloat16, 1, rccl_bfloat16, (rccl_bfloat16)((float)(x) + (float)(y)))
SPECIALIZE_REDUCE(FuncProd, rccl_bfloat16, 1, rccl_bfloat16, (rccl_bfloat16)((float)(x) * (float)(y)))
SPECIALIZE_REDUCE(FuncMin, rccl_bfloat16, 1, rccl_bfloat16, (rccl_bfloat16)(fminf((float)(x), (float)(y))))
SPECIALIZE_REDUCE(FuncMax, rccl_bfloat16, 1, rccl_bfloat16, (rccl_bfloat16)(fmaxf((float)(x), (float)(y))))
#endif
#endif
#undef SPECIALIZE_REDUCE
////////////////////////////////////////////////////////////////////////////////
// Apply_PreOp
// General recursive definition (EltPerPack > 1)
template<typename Fn, int EltPerPack>
struct Apply_PreOp {
static constexpr bool IsIdentity = Apply_PreOp<Fn, EltPerPack/2>::IsIdentity;
template<int Size>
__device__ static BytePack<Size> preOp(Fn fn, BytePack<Size> a) {
#if __cpp_if_constexpr
if constexpr(!IsIdentity) {
#else
if (!IsIdentity) {
#endif
      // The `if (!IsIdentity)` condition is not strictly necessary, but it may help the
      // compiler in that it won't have to tear a register apart for no reason
      // just to put it back together again.
a.half[0] = Apply_PreOp<Fn, EltPerPack/2>::preOp(fn, a.half[0]);
a.half[1] = Apply_PreOp<Fn, EltPerPack/2>::preOp(fn, a.half[1]);
}
return a;
}
};
// Base case definition (EltPerPack == 1); the default is the identity function.
template<typename Fn>
struct Apply_PreOp<Fn, /*EltPerPack=*/1> {
static constexpr bool IsIdentity = true;
template<int Size>
__device__ static BytePack<Size> preOp(Fn fn, BytePack<Size> a) {
return a;
}
};
// Nonsensical base case definition (EltPerPack == 0).
template<typename Fn>
struct Apply_PreOp<Fn, /*EltPerPack=*/0> {
static constexpr bool IsIdentity = true;
__device__ static BytePack<0> preOp(Fn fn, BytePack<0> a) {
return {};
}
};
////////////////////////////////////////////////////////////////////////////////
// Apply_PostOp
// General recursive definition (EltPerPack > 1)
template<typename Fn, int EltPerPack>
struct Apply_PostOp {
static constexpr bool IsIdentity = Apply_PostOp<Fn, EltPerPack/2>::IsIdentity;
template<int Size>
__device__ static BytePack<Size> postOp(Fn fn, BytePack<Size> a) {
#if __cpp_if_constexpr
if constexpr(!IsIdentity) {
#else
if (!IsIdentity) {
#endif
      // The `if (!IsIdentity)` condition is not strictly necessary, but it may help the
      // compiler in that it won't have to tear a register apart for no reason
      // just to put it back together again.
a.half[0] = Apply_PostOp<Fn, EltPerPack/2>::postOp(fn, a.half[0]);
a.half[1] = Apply_PostOp<Fn, EltPerPack/2>::postOp(fn, a.half[1]);
}
return a;
}
};
// Base case definition (EltPerPack == 1); the default is the identity function.
template<typename Fn>
struct Apply_PostOp<Fn, /*EltPerPack=*/1> {
static constexpr bool IsIdentity = true;
template<int Size>
__device__ static BytePack<Size> postOp(Fn fn, BytePack<Size> a) {
return a;
}
};
// Nonsensical base case definition (EltPerPack == 0).
template<typename Fn>
struct Apply_PostOp<Fn, /*EltPerPack=*/0> {
static constexpr bool IsIdentity = true;
__device__ static BytePack<0> postOp(Fn fn, BytePack<0> a) {
return {};
}
};
////////////////////////////////////////////////////////////////////////////////
// FuncPreMulSum
// General definition for all integral types, float, and double.
template<typename T>
struct FuncPreMulSum {
using EltType = T;
T scalar;
__device__ FuncPreMulSum(uint64_t opArg=0) {
union { uint64_t u64; T val; };
u64 = opArg;
scalar = val;
}
};
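// opArg carries the raw bit pattern of the user-supplied scaling factor; the union
// above reinterprets its low bits as a value of type T so that the preOp defined
// further below can multiply each element by this scalar before the values are summed.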
template<>
struct FuncPreMulSum<half> {
using EltType = half;
#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
half2 scalar;
__device__ FuncPreMulSum(uint64_t opArg=0) {
union { uint64_t u64; half val; };
u64 = opArg;
scalar.x = val;
scalar.y = val;
}
#else
float scalar;
__device__ FuncPreMulSum(uint64_t opArg=0) {
union { uint64_t u64; half val; };
u64 = opArg;
scalar = __half2float(val);
}
#endif
};
#if defined(RCCL_BFLOAT16)
template<>
struct FuncPreMulSum<rccl_bfloat16> {
using EltType = rccl_bfloat16;
#if __CUDA_ARCH__ >= 800
__nv_bfloat162 scalar;
__device__ FuncPreMulSum(uint64_t opArg=0) {
union { uint64_t u64; __nv_bfloat16 val; };
u64 = opArg;
scalar.x = val;
scalar.y = val;
}
#else
float scalar;
__device__ FuncPreMulSum(uint64_t opArg=0) {
union { uint64_t u64; rccl_bfloat16 val; };
u64 = opArg;
scalar = (float)(val);
}
#endif
};
#endif
template<typename T>
struct Apply_Reduce<FuncPreMulSum<T>, /*EltPerPack=*/1> {
__device__ static BytePack<sizeof(T)> reduce(FuncPreMulSum<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
// FuncPreMulSum reduce dispatches to FuncSum.
return Apply_Reduce<FuncSum<T>, 1>::reduce(FuncSum<T>(), a, b);
}
};
// PreOp of FuncPreMulSum for integral types, float, and double.
template<typename T>
struct Apply_PreOp<FuncPreMulSum<T>, /*EltPerPack=*/1> {
static constexpr bool IsIdentity = false;
__device__ static BytePack<sizeof(T)> preOp(FuncPreMulSum<T> fn, BytePack<sizeof(T)> a) {
return toPack<T>(fromPack<T>(a) * fn.scalar);
}
};
////////////////////////////////////////////////////////////////////////////////
// Apply_PreOp of FuncPreMulSum for float16.
template<>
struct Apply_PreOp<FuncPreMulSum<half>, /*EltPerPack=*/1> {
static constexpr bool IsIdentity = false;
__device__ static BytePack<sizeof(half)> preOp(FuncPreMulSum<half> fn, BytePack<sizeof(half)> a) {
#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
return toPack<half>(__hmul(fromPack<half>(a), fn.scalar.x));
#else
return toPack<half>(__float2half(__half2float(fromPack<half>(a)) * fn.scalar));
#endif
}
};
#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
template<>
struct Apply_PreOp<FuncPreMulSum<half>, /*EltPerPack=*/2> {
static constexpr bool IsIdentity = false;
__device__ static BytePack<sizeof(half2)> preOp(FuncPreMulSum<half> fn, BytePack<sizeof(half2)> a) {
return toPack<half2>(__hmul2(fromPack<half2>(a), fn.scalar));
}
};
#endif
////////////////////////////////////////////////////////////////////////////////
// Apply_PreOp of FuncPreMulSum for bfloat16.
#if defined(RCCL_BFLOAT16)
template<>
struct Apply_PreOp<FuncPreMulSum<rccl_bfloat16>, /*EltPerPack=*/1> {
static constexpr bool IsIdentity = false;
__device__ static BytePack<sizeof(rccl_bfloat16)> preOp(
FuncPreMulSum<rccl_bfloat16> fn, BytePack<sizeof(rccl_bfloat16)> a
) {
#if __CUDA_ARCH__ >= 800
return toPack<__nv_bfloat16>(__hmul(fromPack<__nv_bfloat16>(a), fn.scalar.x));
#else
return toPack<rccl_bfloat16>((rccl_bfloat16)((float)(fromPack<rccl_bfloat16>(a)) * fn.scalar));
#endif
}
};
#if __CUDA_ARCH__ >= 800
template<>
struct Apply_PreOp<FuncPreMulSum<rccl_bfloat16>, /*EltPerPack=*/2> {
static constexpr bool IsIdentity = false;
__device__ static BytePack<sizeof(__nv_bfloat162)> preOp(
FuncPreMulSum<__nv_bfloat16> fn, BytePack<sizeof(__nv_bfloat162)> a
) {
return toPack<__nv_bfloat162>(__hmul2(fromPack<__nv_bfloat162>(a), fn.scalar));
}
};
#endif
#endif
////////////////////////////////////////////////////////////////////////////////
// FuncSumPostDiv
template<typename T>
struct IsFloatingPoint: std::false_type {};
template<>
struct IsFloatingPoint<half>: std::true_type {};
#if defined(RCCL_BFLOAT16)
template<>
struct IsFloatingPoint<rccl_bfloat16>: std::true_type {};
#endif
template<>
struct IsFloatingPoint<float>: std::true_type {};
template<>
struct IsFloatingPoint<double>: std::true_type {};
template<typename T, bool IsFloating=IsFloatingPoint<T>::value>
struct FuncSumPostDiv_IntOnly;
template<typename T>
struct FuncSumPostDiv: FuncSumPostDiv_IntOnly<T> {
__device__ FuncSumPostDiv(uint64_t opArg=0):
FuncSumPostDiv_IntOnly<T>(opArg) {
}
};
template<typename T>
struct FuncSumPostDiv_IntOnly<T, /*IsFloating=*/false>: FuncSum<T> {
using EltType = T;
int divisor;
__device__ FuncSumPostDiv_IntOnly(uint64_t opArg=0): divisor(opArg) {}
};
template<typename T>
struct FuncSumPostDiv_IntOnly<T, /*IsFloating=*/true> {
static_assert(sizeof(T)!=sizeof(T), "FuncSumPostDiv is only for implementing ncclAvg on integral types.");
};
template<typename T>
struct Apply_Reduce<FuncSumPostDiv<T>, /*EltPerPack=*/1>:
Apply_Reduce<FuncSum<T>, 1> {
__device__ static BytePack<sizeof(T)> reduce(FuncSumPostDiv<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
// FuncSumPostDiv reduce dispatches to FuncSum.
return Apply_Reduce<FuncSum<T>, 1>::reduce(FuncSum<T>(), a, b);
}
};
template<typename T>
struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
static constexpr bool IsIdentity = false;
__device__ static BytePack<sizeof(T)> postOp(FuncSumPostDiv<T> fn, BytePack<sizeof(T)> a) {
return toPack<T>(fromPack<T>(a) / fn.divisor);
}
};
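// Together, these two specializations realize averaging for integral types: the
// reduction itself is a plain FuncSum, and only the final postOp divides by
// fn.divisor (taken from opArg, presumably the rank count when implementing ncclAvg).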
////////////////////////////////////////////////////////////////////////////////
// Apply_LoadMultimem
#define SIZEOF_BytePack_field_u16 2
#define PTX_REG_BytePack_field_u16 "h"
#define SIZEOF_BytePack_field_u32 4
#define PTX_REG_BytePack_field_u32 "r"
#define SIZEOF_BytePack_field_u64 8
#define PTX_REG_BytePack_field_u64 "l"
#define DEFINE_Apply_LoadMultimem(Fn, T, op, ptx_ty, pack_field) \
template<> \
struct Apply_LoadMultimem<Fn<T>, SIZEOF_BytePack_field_##pack_field> { \
static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \
__device__ static BytePack<PackSize> load(Fn<T> fn, uintptr_t addr) { \
BytePack<PackSize> ans; \
asm("multimem.ld_reduce.relaxed.sys.global." #op "." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
: "l"(addr)); \
return ans; \
} \
};
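// For example, DEFINE_Apply_LoadMultimem(FuncSum, uint32_t, add, u32, u32) further below
// expands to Apply_LoadMultimem<FuncSum<uint32_t>, 4>, whose load() issues
// "multimem.ld_reduce.relaxed.sys.global.add.u32" and returns the result in ans.u32.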
#define DEFINE_Apply_LoadMultimem_v4(Fn, T, op, ptx_ty, pack_field) \
template<> \
struct Apply_LoadMultimem<Fn<T>, 4*(SIZEOF_BytePack_field_##pack_field)> { \
static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \
__device__ static BytePack<PackSize> load(Fn<T> fn, uintptr_t addr) { \
BytePack<PackSize> ans; \
asm("multimem.ld_reduce.relaxed.sys.global." #op ".v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \
: "l"(addr)); \
return ans; \
} \
};
#define DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(Fn, T, op, ptx_ty, pack_field) \
DEFINE_Apply_LoadMultimem_v4(Fn, T, op, ptx_ty, pack_field) \
template<> \
struct Apply_LoadMultimem<Fn<T>, sizeof(T)> { \
__device__ static BytePack<sizeof(T)> load(Fn<T> fn, uintptr_t addr) { \
BytePack<2*sizeof(T)> tmp; \
asm("multimem.ld_reduce.relaxed.sys.global." #op "." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
: "l"(addr & -uintptr_t(sizeof(T)))); \
return tmp.half[(addr/sizeof(T))%2]; \
} \
};
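// The extra specialization in the "subhalf" variant handles a single sub-32-bit element:
// it performs the paired (x2) load at an element-aligned address and then returns
// whichever half of the loaded pair corresponds to the requested address.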
template<typename Fn, int BytePerPack>
struct Apply_LoadMultimem {
__device__ static BytePack<BytePerPack> load(Fn fn, uintptr_t addr) {
//__trap();
return {};
}
};
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
template<typename Fn>
struct LoadMultimem_BigPackSize {
using T = typename Fn::EltType;
static constexpr bool IsSum = std::is_same<Fn, FuncSum<T>>::value ||
std::is_same<Fn, FuncPreMulSum<T>>::value ||
std::is_same<Fn, FuncSumPostDiv<T>>::value;
static constexpr bool IsMinOrMax = std::is_same<Fn, FuncMin<T>>::value ||
std::is_same<Fn, FuncMax<T>>::value;
static constexpr bool IsFloat = IsFloatingPoint<T>::value;
static constexpr int BigPackSize =
IsFloat && IsSum && sizeof(T) < 8 ? 16 :
IsFloat && IsSum ? 8 :
IsFloat && IsMinOrMax && sizeof(T)==2 ? 16 :
!IsFloat && (IsSum||IsMinOrMax) && sizeof(T)>=4 ? sizeof(T) :
/*multimem.ld_reduce not supported:*/ 0;
};
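// Summary of the table above: floating-point sums get the widest supported vector
// (16 bytes for f16/bf16/f32, 8 bytes for f64), fp16/bf16 min/max also get 16 bytes,
// integer add/min/max are limited to a scalar access of at least 4 bytes, and every
// other combination reports 0, which disables multimem.ld_reduce for that case.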
DEFINE_Apply_LoadMultimem(FuncSum, uint32_t, add, u32, u32)
DEFINE_Apply_LoadMultimem(FuncMin, uint32_t, min, u32, u32)
DEFINE_Apply_LoadMultimem(FuncMax, uint32_t, max, u32, u32)
DEFINE_Apply_LoadMultimem(FuncSum, int32_t, add, s32, u32)
DEFINE_Apply_LoadMultimem(FuncMin, int32_t, min, s32, u32)
DEFINE_Apply_LoadMultimem(FuncMax, int32_t, max, s32, u32)
DEFINE_Apply_LoadMultimem(FuncSum, uint64_t, add, u64, u64)
DEFINE_Apply_LoadMultimem(FuncMin, uint64_t, min, u64, u64)
DEFINE_Apply_LoadMultimem(FuncMax, uint64_t, max, u64, u64)
DEFINE_Apply_LoadMultimem(FuncSum, int64_t, add, u64, u64)
DEFINE_Apply_LoadMultimem(FuncMin, int64_t, min, s64, u64)
DEFINE_Apply_LoadMultimem(FuncMax, int64_t, max, s64, u64)
DEFINE_Apply_LoadMultimem(FuncSum, float, add, f32, u32)
DEFINE_Apply_LoadMultimem_v4(FuncSum, float, add, f32, u32)
DEFINE_Apply_LoadMultimem(FuncSum, double, add, f64, u64)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncSum, half, add, f16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMin, half, min, f16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMax, half, max, f16x2, u32)
#if defined(__CUDA_BF16_TYPES_EXIST__)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncSum, __nv_bfloat16, add, bf16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMin, __nv_bfloat16, min, bf16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMax, __nv_bfloat16, max, bf16x2, u32)
#endif
#else
template<typename Fn>
struct LoadMultimem_BigPackSize {
static constexpr int BigPackSize = 0;
};
#endif
#undef DEFINE_Apply_LoadMultimem
#undef DEFINE_Apply_LoadMultimem_v4
#undef DEFINE_Apply_LoadMultimem_v4x2_and_subhalf
#undef SIZEOF_BytePack_field_u64
#undef PTX_REG_BytePack_field_u64
#undef SIZEOF_BytePack_field_u32
#undef PTX_REG_BytePack_field_u32
#undef SIZEOF_BytePack_field_u16
#undef PTX_REG_BytePack_field_u16
#endif // REDUCE_KERNEL_H_
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
/*This file is now generated in CMake*/
// #include "reduce_scatter.h"
// #include "common.h"
// #include "collectives.h"
// IMPL_COLL_R(ReduceScatter);
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "devcomm.h"
#include "collectives.h"
#include "primitives.h"
namespace {
template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runRing(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
#endif
const int tid = threadIdx.x;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem.channel.ring;
int const *ringRanks = ring->userRanks;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCESCATTER_CHUNKSTEPS : 1));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
const int nranks = ncclShmem.comm.nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->count;
#if defined (ENABLE_TIMELINE)
int elems = 0, totalElems = 0;
uint64_t clkStamp = 0ULL;
struct ncclDevComm* comm = &ncclShmem.comm;
uint64_t entryStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_REDUCE_SCATTER_ENTRY, 0, entryStamp, comm->cpuTimestamp);
#endif
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, 0, args->connIndex, args->connIndex);
#ifdef HYGON_SDMA_FEATURE
prims.ringIx = ring->index;
INIT_PRIMS_SDMA(prims, args);
#endif
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t realChunkSize;
if (Proto::Id == NCCL_PROTO_SIMPLE) {
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels));
realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
}
else if (Proto::Id == NCCL_PROTO_LL)
realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
else if (Proto::Id == NCCL_PROTO_LL128)
realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize);
realChunkSize = int(realChunkSize);
ssize_t chunkOffset = gridOffset + bid*int(realChunkSize);
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
int nelem = min(realChunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ringRanks[nranks-1];
offset = chunkOffset + rankDest * size;
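      // Note: the send buffer is laid out as nranks contiguous blocks of `size` elements,
      // so `rankDest * size` selects the block that is ultimately owned by rank rankDest.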
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_SEND_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.send(offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_SEND_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ringRanks[nranks-j];
offset = chunkOffset + rankDest * size;
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_RECV_REDUCE_SEND_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.recvReduceSend(offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_RECV_REDUCE_SEND_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
}
// step k-1: reduce this buffer and data, which will produce the final result
rankDest = ringRanks[0];
offset = chunkOffset + rankDest * size;
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_RECV_REDUCE_COPY_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.recvReduceCopy(offset, chunkOffset, nelem, /*postOp=*/true);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_RECV_REDUCE_COPY_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
}
#if defined (ENABLE_TIMELINE)
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_REDUCE_SCATTER_EXIT, totalElems*sizeof(T), __builtin_amdgcn_s_memrealtime() - entryStamp, comm->cpuTimestamp);
#endif
}
}
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
using Proto = ProtoSimple<REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS>;
runRing<T, RedOp, Proto>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL128>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nChannels = args->nChannels;
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
const ssize_t chunkSize = int(args->lastChunkSize);
const ssize_t size = args->count;
const ssize_t loopSize = nChannels*chunkSize;
const int nThreadsScatter = 128 + WARP_SIZE;
const int nThreadsReduce = 384;
const int tidEndScatter = nThreadsScatter;
const int tidEndReduce = tidEndScatter + nThreadsReduce;
using Proto = ProtoSimple<1, 1>;
if (tid < tidEndScatter) {
// Scatter
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.scatter(offset, nvls->nHeads*size, nelem, size, -1, 0);
}
} else if (tid < tidEndReduce) {
// Reduce through NVLS
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff,
args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recv(offset, nelem);
}
}
}
};
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "sendrecv.h"
#include "common.h"
#include "collectives.h"
IMPL_COLL_P(SendRecv);
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "devcomm.h"
#include "collectives.h"
#include "primitives.h"
#if defined(ENABLE_NPKIT)
#include "npkit/npkit.h"
#endif
template<typename T, typename RedOp>
struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
template<typename Proto>
__device__ void runSend(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) {
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
#if defined(ENABLE_NPKIT)
bool isNpKitThread = (tid == 0);
int npKitCtxIdx = blockIdx.x * NCCL_MAX_WORK_ELEMENTS_P2P + group;
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (isNpKitThread) {
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
if (args->peer == ncclShmem.comm.rank) {
struct ncclWorkElemP2p* recvArgs = args-1;
void* recvBuff = reinterpret_cast<void*>(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32);
if (buff != recvBuff) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/0>
(tid, nthreads, 0, nullptr, false, 1, &buff, 1, &recvBuff, count);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
} else {
int chunkSize = args->chunkSize/sizeof(T);
if (args->proto == NCCL_PROTO_LL) chunkSize /= 2;
int const peer = args->peer;
Primitives<T, RedOp, FanAsymmetric<0, 1>, 0, Proto, 1> prims
(tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, args->connIndex, args->connIndex);
#ifdef HYGON_SDMA_FEATURE
prims.ringIx = 0;
INIT_PRIMS_SDMA(prims, args);
#endif
#if defined(ENABLE_NPKIT)
if (isNpKitThread) {
prims.npKitCtxIdx = npKitCtxIdx;
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_SEND_ENTRY)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_SEND_ENTRY, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
size_t offset = 0;
do {
int nelem = min(size_t(chunkSize), count-offset);
prims.directSend(offset, offset, nelem);
offset += nelem;
} while(offset < count);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_SEND_EXIT)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_SEND_EXIT, count*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#ifdef HYGON_SDMA_FEATURE
if (tid == 0 && prims.useSdmaCopy && prims.sdmaCountEnabe) {
*ncclShmem.channel.sdmaQueue.ptrSdmaCopyCount += prims.sdmaCopyCount;
*ncclShmem.channel.sdmaQueue.ptrAllCopyCount += prims.allCopyCount;
PRINT_DEBUG("runSend bid:%d sdmaCopyCount:%d allCopyCount:%d sumSdma:%d sumAll:%d\n",
(int)blockIdx.x, prims.sdmaCopyCount, prims.allCopyCount,
*ncclShmem.channel.sdmaQueue.ptrSdmaCopyCount,
*ncclShmem.channel.sdmaQueue.ptrAllCopyCount);
}
#endif
}
}
template<typename Proto>
__device__ void runRecv(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) {
#if defined(ENABLE_NPKIT)
bool isNpKitThread = (tid == 0);
int npKitCtxIdx = blockIdx.x * NCCL_MAX_WORK_ELEMENTS_P2P + group;
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (isNpKitThread) {
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
if (args->peer != ncclShmem.comm.rank) {
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
int chunkSize = args->chunkSize/sizeof(T);
if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; // This is to account for chunkEffectiveSize
int const peer = args->peer;
Primitives<T, RedOp, FanAsymmetric<1, 0>, 0, Proto, 1> prims
(tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, args->connIndex, args->connIndex);
#ifdef HYGON_SDMA_FEATURE
prims.ringIx = 0;
prims.useSdmaCopy = 0;
#endif
#if defined(ENABLE_NPKIT)
if (isNpKitThread) {
prims.npKitCtxIdx = npKitCtxIdx;
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_RECV_ENTRY)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_RECV_ENTRY, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
size_t offset = 0;
do {
int nelem = min(size_t(chunkSize), count-offset);
prims.directRecv(offset, nelem);
offset += nelem;
} while(offset < count);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_RECV_EXIT)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_RECV_EXIT, count*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
}
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void run(ncclWork *work) {
#else
__device__ __attribute__((noinline)) void run(ncclWork *work) {
#endif
struct ncclWorkElemP2p* args = work->p2pElems;
int ngroups = args->ngroups;
int tid = threadIdx.x;
int wid = tid / WARP_SIZE;
    // This has to work even for groups of 2.5 warps (which is 8 groups, and means 3
    // warps for send, 2 warps for recv).
    // warpStarts were rounded down by integer division, but for the group number we need
    // to round the other way around, so we mirror wid and then mirror the group back.
#define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
uint8_t group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS;
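    // Worked example (sketch): with NWARPS == 20 and ngroups == 8 this maps wid 0-1 to
    // group 0, wid 2-4 to group 1, wid 5-6 to group 2, wid 7-9 to group 3, and so on,
    // giving each even (recv) group 2 warps and each odd (send) group 3 warps.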
args += group;
tid -= args->warpStart * WARP_SIZE;
int nthreads = args->nWarps * WARP_SIZE;
if (args->p2pType == ncclWorkP2pTypeUnused) return;
if (tid >= nthreads || args->peer == -1) return;
// Select Proto here
// This is to allow the same kernel to run multiple primitives on different warps (thread groups)
if ((group%2) == 0) {
if (args->proto == NCCL_PROTO_LL) {
runRecv<ProtoLL>(tid, nthreads, group, args);
} else {
#if defined(__gfx90a__)
runRecv<ProtoSimple<1,1,8>>(tid, nthreads, group, args);
#elif defined(__gfx908__)
runRecv<ProtoSimple<1,1,4>>(tid, nthreads, group, args);
#else
runRecv<ProtoSimple<1,1>>(tid, nthreads, group, args);
#endif
}
} else {
if (args->proto == NCCL_PROTO_LL) {
runSend<ProtoLL>(tid, nthreads, group, args);
} else {
#if defined(__gfx90a__)
runSend<ProtoSimple<1,1,8>>(tid, nthreads, group, args);
#elif defined(__gfx908__)
runSend<ProtoSimple<1,1,4>>(tid, nthreads, group, args);
#else
runSend<ProtoSimple<1,1>>(tid, nthreads, group, args);
#endif
}
}
}
};
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
#include "msccl/msccl_lifecycle.h"
NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream) {
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
sendcount, datatype, root, 0, ncclSum, mscclFuncGather, comm, stream);
}
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
size_t rankOffset = sendcount * ncclTypeSize(datatype);
if (sendcount == 0) return ncclSuccess;
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
NCCLCHECK(ncclGroupStart());
if (rank == root) {
for (int r=0; r<nRanks; r++)
NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, sendcount, datatype, r, comm, stream));
}
NCCLCHECK(ncclSend(sendbuff, sendcount, datatype, root, comm, stream));
NCCLCHECK(ncclGroupEnd());
return ncclSuccess;
}
/*************************************************************************
* Copyright (c) Microsoft Corporation.
* Licensed under the MIT License.
************************************************************************/
#include "enqueue.h"
#include "msccl/msccl_parser.h"
#include "msccl/msccl_setup.h"
#include "msccl/msccl_status.h"
#include <cstdio>
#include <cstdlib>
NCCL_API(ncclResult_t, mscclLoadAlgo, const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank);
ncclResult_t mscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank) {
mscclStatus& status = mscclGetStatus();
if (status.freeAlgoHandles.size() == 0) {
WARN("MSCCL: MSCCL_MAX_NUM_ALGOS (%d) limit reached", MSCCL_MAX_NUM_ALGOS);
return ncclInvalidUsage;
}
*mscclAlgoHandle = *status.freeAlgoHandles.rbegin();
status.freeAlgoHandles.pop_back();
struct mscclAlgo* hostAlgo;
NCCLCHECK(ncclCalloc(&hostAlgo, 1));
NCCLCHECK(mscclGetAlgoFromXmlFile(mscclAlgoFilePath, hostAlgo, rank));
status.hostAlgos[*mscclAlgoHandle] = hostAlgo;
struct mscclAlgo* devAlgo;
NCCLCHECK(ncclCudaMalloc(&devAlgo, 1));
CUDACHECK(hipMemcpy(devAlgo, hostAlgo, sizeof(struct mscclAlgo), hipMemcpyHostToDevice));
status.devAlgos[*mscclAlgoHandle] = devAlgo;
return ncclSuccess;
}
NCCL_API(ncclResult_t, mscclRunAlgo,
const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream);
ncclResult_t mscclRunAlgo(
const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream) {
mscclStatus& status = mscclGetStatus();
struct mscclAlgo* hostAlgo = status.hostAlgos[mscclAlgoHandle];
struct mscclAlgo* devAlgo = status.devAlgos[mscclAlgoHandle];
NCCLCHECK(mscclGetCaptureStatus(stream));
NCCLCHECK(mscclSetupCount(hostAlgo, comm, count, dataType));
NCCLCHECK(mscclSetupScratch(hostAlgo, stream));
NCCLCHECK(mscclSetupSyncFlags(stream));
if (status.connectedAlgos[comm].find(mscclAlgoHandle) == status.connectedAlgos[comm].end()) {
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
NCCLCHECK(mscclSetupConnections(hostAlgo, comm));
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
status.connectedAlgos[comm].insert(mscclAlgoHandle);
}
NCCLCHECK(mscclSetupProxy(hostAlgo, comm, stream));
NCCLCHECK(mscclSetupKernel(sendBuff, recvBuff, count, dataType, op, hostAlgo, devAlgo, comm, stream));
return ncclSuccess;
}
NCCL_API(ncclResult_t, mscclUnloadAlgo, mscclAlgoHandle_t mscclAlgoHandle);
ncclResult_t mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle) {
mscclStatus& status = mscclGetStatus();
free(status.hostAlgos[mscclAlgoHandle]);
status.hostAlgos.erase(mscclAlgoHandle);
NCCLCHECK(ncclCudaFree(status.devAlgos[mscclAlgoHandle]));
status.devAlgos.erase(mscclAlgoHandle);
status.freeAlgoHandles.push_back(mscclAlgoHandle);
for (auto &s : status.connectedAlgos) {
s.second.erase(mscclAlgoHandle);
}
return ncclSuccess;
}
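// Typical MSCCL lifecycle sketch (illustrative only, error handling omitted; the file
// name is hypothetical): load an algorithm once, run it any number of times, then
// unload it to recycle the handle.
//   mscclAlgoHandle_t h;
//   mscclLoadAlgo("allreduce_ring.xml", &h, rank);
//   mscclRunAlgo(sendbuf, nullptr, nullptr, recvbuf, nullptr, nullptr,
//                count, ncclFloat, /*root=*/0, /*peer=*/0, ncclSum, h, comm, stream);
//   mscclUnloadAlgo(h);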
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
#include "nccl.h"
#include "msccl/msccl_lifecycle.h"
#include "hipprof/hip_prof_rccl_api.h"
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
struct NvtxParamsReduce {
size_t bytes;
int root;
ncclRedOp_t op;
};
constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsReduce, op)}
};
NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op};
NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload)
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
count, datatype, root, 0, op, mscclFuncReduce, comm, stream);
}
struct ncclInfo info = { ncclFuncReduce, "Reduce",
sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
RCCL_CB_SPAWNER_OBJECT(ncclReduce, &info);
return ncclEnqueueCheck(&info);
}
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
#include "nccl.h"
#include "msccl/msccl_lifecycle.h"
#include "hipprof/hip_prof_rccl_api.h"
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
struct NvtxParamsReduceScatter {
size_t bytes;
ncclRedOp_t op;
};
constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsReduceScatter, op)}
};
NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op};
NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload)
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
recvcount, datatype, 0, 0, op, mscclFuncReduceScatter, comm, stream);
}
struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
RCCL_CB_SPAWNER_OBJECT(ncclReduceScatter, &info);
return ncclEnqueueCheck(&info);
}
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
#include "msccl/msccl_lifecycle.h"
NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream) {
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
recvcount, datatype, root, 0, ncclSum, mscclFuncScatter, comm, stream);
}
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
size_t rankOffset = recvcount * ncclTypeSize(datatype);
if (recvcount == 0) return ncclSuccess;
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
NCCLCHECK(ncclGroupStart());
if (rank == root) {
for (int r=0; r<nRanks; r++)
NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, recvcount, datatype, r, comm, stream));
}
NCCLCHECK(ncclRecv(recvbuff, recvcount, datatype, root, comm, stream));
NCCLCHECK(ncclGroupEnd());
return ncclSuccess;
}
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
#include "argcheck.h" // Need some checks here since we access comm
#include "msccl/msccl_lifecycle.h"
#include "hipprof/hip_prof_rccl_api.h"
struct NvtxParamsSendRecv {
size_t bytes;
int peer;
};
constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)}
};
NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload)
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, nullptr, nullptr, nullptr,
count, datatype, 0, peer, ncclSum, mscclFuncSend, comm, stream);
}
struct ncclInfo info = { ncclFuncSend, "Send",
NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
1, 1 };
RCCL_CB_SPAWNER_OBJECT(ncclSend, &info);
ncclResult_t ret;
NCCLCHECK(ncclGroupStart());
ret = ncclEnqueueCheck(&info);
NCCLCHECK(ncclGroupEnd());
return ret;
}
NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload)
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
nullptr, nullptr, nullptr, recvbuff, nullptr, nullptr,
count, datatype, 0, peer, ncclSum, mscclFuncRecv, comm, stream);
}
struct ncclInfo info = { ncclFuncRecv, "Recv",
NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
1, 1 };
RCCL_CB_SPAWNER_OBJECT(ncclRecv, &info);
ncclResult_t ret;
NCCLCHECK(ncclGroupStart());
ret = ncclEnqueueCheck(&info);
NCCLCHECK(ncclGroupEnd());
return ret;
}
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "nccl_net.h"
#include <stdlib.h>
#include <stdarg.h>
#include <sys/syscall.h>
int ncclDebugLevel = -1;
static int pid = -1;
static char hostname[1024];
thread_local int ncclDebugNoWarn = 0;
char ncclLastError[1024] = ""; // Global string for the last error in human readable form
uint64_t ncclDebugMask = NCCL_INIT|NCCL_ENV; // Default debug sub-system mask is INIT and ENV
FILE *ncclDebugFile = stdout;
pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
std::chrono::steady_clock::time_point ncclEpoch;
static __thread int tid = -1;
void ncclDebugInit() {
pthread_mutex_lock(&ncclDebugLock);
if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; }
const char* nccl_debug = getenv("NCCL_DEBUG");
int tempNcclDebugLevel = -1;
if (nccl_debug == NULL) {
tempNcclDebugLevel = NCCL_LOG_NONE;
} else if (strcasecmp(nccl_debug, "VERSION") == 0) {
tempNcclDebugLevel = NCCL_LOG_VERSION;
} else if (strcasecmp(nccl_debug, "WARN") == 0) {
tempNcclDebugLevel = NCCL_LOG_WARN;
} else if (strcasecmp(nccl_debug, "INFO") == 0) {
tempNcclDebugLevel = NCCL_LOG_INFO;
} else if (strcasecmp(nccl_debug, "ABORT") == 0) {
tempNcclDebugLevel = NCCL_LOG_ABORT;
} else if (strcasecmp(nccl_debug, "TRACE") == 0) {
tempNcclDebugLevel = NCCL_LOG_TRACE;
}
/* Parse the NCCL_DEBUG_SUBSYS env var
* This can be a comma separated list such as INIT,COLL
* or ^INIT,COLL etc
*/
char* ncclDebugSubsysEnv = getenv("NCCL_DEBUG_SUBSYS");
if (ncclDebugSubsysEnv != NULL) {
int invert = 0;
if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; }
ncclDebugMask = invert ? ~0ULL : 0ULL;
char *ncclDebugSubsys = strdup(ncclDebugSubsysEnv);
char *subsys = strtok(ncclDebugSubsys, ",");
while (subsys != NULL) {
uint64_t mask = 0;
if (strcasecmp(subsys, "INIT") == 0) {
mask = NCCL_INIT;
} else if (strcasecmp(subsys, "COLL") == 0) {
mask = NCCL_COLL;
} else if (strcasecmp(subsys, "P2P") == 0) {
mask = NCCL_P2P;
} else if (strcasecmp(subsys, "SHM") == 0) {
mask = NCCL_SHM;
} else if (strcasecmp(subsys, "NET") == 0) {
mask = NCCL_NET;
} else if (strcasecmp(subsys, "GRAPH") == 0) {
mask = NCCL_GRAPH;
} else if (strcasecmp(subsys, "TUNING") == 0) {
mask = NCCL_TUNING;
} else if (strcasecmp(subsys, "ENV") == 0) {
mask = NCCL_ENV;
} else if (strcasecmp(subsys, "ALLOC") == 0) {
mask = NCCL_ALLOC;
} else if (strcasecmp(subsys, "CALL") == 0) {
mask = NCCL_CALL;
} else if (strcasecmp(subsys, "PROXY") == 0) {
mask = NCCL_PROXY;
} else if (strcasecmp(subsys, "NVLS") == 0) {
mask = NCCL_NVLS;
} else if (strcasecmp(subsys, "ALL") == 0) {
mask = NCCL_ALL;
}
if (mask) {
if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask;
}
subsys = strtok(NULL, ",");
}
free(ncclDebugSubsys);
}
// Cache pid and hostname
getHostName(hostname, 1024, '.');
pid = getpid();
/* Parse and expand the NCCL_DEBUG_FILE path and
* then create the debug file. But don't bother unless the
* NCCL_DEBUG level is > VERSION
*/
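  // For example, NCCL_DEBUG_FILE="nccl.%h.%p.log" expands %h to the hostname and %p to
  // the pid via the parsing loop below; %% emits a literal '%'.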
const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE");
if (tempNcclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
int c = 0;
char debugFn[PATH_MAX+1] = "";
char *dfn = debugFn;
while (ncclDebugFileEnv[c] != '\0' && c < PATH_MAX) {
if (ncclDebugFileEnv[c++] != '%') {
*dfn++ = ncclDebugFileEnv[c-1];
continue;
}
switch (ncclDebugFileEnv[c++]) {
case '%': // Double %
*dfn++ = '%';
break;
case 'h': // %h = hostname
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
break;
case 'p': // %p = pid
dfn += snprintf(dfn, PATH_MAX, "%d", pid);
break;
default: // Echo everything we don't understand
*dfn++ = '%';
*dfn++ = ncclDebugFileEnv[c-1];
break;
}
}
*dfn = '\0';
if (debugFn[0] != '\0') {
FILE *file = fopen(debugFn, "w");
if (file != nullptr) {
setbuf(file, nullptr); // disable buffering
ncclDebugFile = file;
}
}
}
ncclEpoch = std::chrono::steady_clock::now();
__atomic_store_n(&ncclDebugLevel, tempNcclDebugLevel, __ATOMIC_RELEASE);
pthread_mutex_unlock(&ncclDebugLock);
}
/* Common logging function used by the INFO, WARN and TRACE macros
* Also exported to the dynamically loadable Net transport modules so
* they can share the debugging mechanisms and output files
*/
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
if (__atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE) == -1) ncclDebugInit();
if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; }
// Save the last error (WARN) as a human readable string
if (level == NCCL_LOG_WARN) {
pthread_mutex_lock(&ncclDebugLock);
va_list vargs;
va_start(vargs, fmt);
(void) vsnprintf(ncclLastError, sizeof(ncclLastError), fmt, vargs);
va_end(vargs);
pthread_mutex_unlock(&ncclDebugLock);
}
if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) return;
if (tid == -1) {
tid = syscall(SYS_gettid);
}
int cudaDev;
if (!(level == NCCL_LOG_TRACE && flags == NCCL_CALL)) {
cudaGetDevice(&cudaDev);
}
char buffer[1024];
size_t len = 0;
if (level == NCCL_LOG_WARN) {
len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d NCCL WARN ",
hostname, pid, tid, cudaDev, filefunc, line);
} else if (level == NCCL_LOG_INFO) {
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
} else if (level == NCCL_LOG_TRACE && flags == NCCL_CALL) {
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d NCCL CALL ", hostname, pid, tid);
} else if (level == NCCL_LOG_TRACE) {
auto delta = std::chrono::steady_clock::now() - ncclEpoch;
double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d NCCL TRACE ",
hostname, pid, tid, cudaDev, timestamp, filefunc, line);
}
if (len) {
va_list vargs;
va_start(vargs, fmt);
len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
va_end(vargs);
buffer[len++] = '\n';
fwrite(buffer, 1, len, ncclDebugFile);
}
}
NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
void ncclSetThreadName(pthread_t thread, const char *fmt, ...) {
  // pthread_setname_np is a nonstandard GNU extension and
  // needs the following feature test macro
#ifdef _GNU_SOURCE
if (ncclParamSetThreadName() != 1) return;
char threadName[NCCL_THREAD_NAMELEN];
va_list vargs;
va_start(vargs, fmt);
vsnprintf(threadName, NCCL_THREAD_NAMELEN, fmt, vargs);
va_end(vargs);
pthread_setname_np(thread, threadName);
#endif
}
/*************************************************************************
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
/* Define weak symbols used to allow libnccl_static.a to work with older libcudart_static.a */
enum cudaError_t { cudaErrorStubLibrary = 34 };
extern "C" {
cudaError_t cudaStreamGetCaptureInfo_v2(...) __attribute__((visibility("hidden"))) __attribute((weak));
cudaError_t cudaStreamGetCaptureInfo_v2(...) { return cudaErrorStubLibrary; }
cudaError_t cudaUserObjectCreate(...) __attribute__((visibility("hidden"))) __attribute((weak));
cudaError_t cudaUserObjectCreate(...) { return cudaErrorStubLibrary; }
cudaError_t cudaGraphRetainUserObject(...) __attribute__((visibility("hidden"))) __attribute((weak));
cudaError_t cudaGraphRetainUserObject(...) { return cudaErrorStubLibrary; }
cudaError_t cudaStreamUpdateCaptureDependencies(...) __attribute__((visibility("hidden"))) __attribute((weak));
cudaError_t cudaStreamUpdateCaptureDependencies(...) { return cudaErrorStubLibrary; }
cudaError_t cudaGetDriverEntryPoint(...) __attribute__((visibility("hidden"))) __attribute((weak));
cudaError_t cudaGetDriverEntryPoint(...) { return cudaErrorStubLibrary; }
}