/*************************************************************************
 * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/

#ifndef NCCL_COMMON_KERNEL_H_
#define NCCL_COMMON_KERNEL_H_

#include <cstdio>
#include <cstdint>

// Define min for ssize_t
static __device__ int min(int a, ssize_t b) { return (a < b) ? a : b; }

typedef uint64_t PackType;

// Generic case: apply FUNC to the pack as a single value. Element-wise
// handling is provided by the per-type specializations (e.g. float, below).
template<class FUNC, typename T>
struct MULTI {
  __device__ PackType operator()(const PackType x, const PackType y) const {
    return FUNC()(x, y);
  }
};

// Non-temporal load/store: hint that the data will not be reused soon, so it
// may bypass the cache.
template<typename T>
inline __device__ T vFetch(const volatile T* ptr) {
  return __builtin_nontemporal_load(ptr);
}

template<typename T>
inline __device__ void vStore(volatile T* ptr, const T val) {
  __builtin_nontemporal_store(val, ptr);
}

// Pass-through: returns the first operand (used for pure copies).
template<typename T>
struct FuncPassA {
  __device__ T operator()(const T x, const T y) const { return x; }
};

template<typename T>
struct FuncSum {
  __device__ T operator()(const T x, const T y) const { return x + y; }
};

// Element-wise reduction of the two floats packed into one PackType.
template<class FUNC>
struct MULTI<FUNC, float> {
  static_assert(sizeof(PackType) == 2 * sizeof(float),
      "PackType must be twice the size of float.");
  union converter {
    PackType storage;
    struct { float a, b; };
  };
  __device__ PackType operator()(const PackType x, const PackType y) const {
    converter cx, cy, cr;
    cx.storage = x;
    cy.storage = y;
    cr.a = FUNC()(cx.a, cy.a);
    cr.b = FUNC()(cx.b, cy.b);
    return cr.storage;
  }
};

typedef ulong2 Pack128;

template<class FUNC, typename T>
struct MULTI128 {
  __device__ void operator()(Pack128& x, Pack128& y) {
    x.x = MULTI<FUNC, T>()(x.x, y.x);
    x.y = MULTI<FUNC, T>()(x.y, y.y);
  }
};

inline __device__ void Fetch128(Pack128& v, const Pack128* p) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
  v.x = __builtin_nontemporal_load(&p->x);
  v.y = __builtin_nontemporal_load(&p->y);
#else
  asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v.x), "=l"(v.y) : "l"(p) : "memory");
#endif
}

inline __device__ void Store128(Pack128* p, Pack128& v) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
  __builtin_nontemporal_store(v.x, &p->x);
  __builtin_nontemporal_store(v.y, &p->y);
#else
  asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" :: "l"(p), "l"(v.x), "l"(v.y) : "memory");
#endif
}

#define WARP_SIZE 64

// Element-wise reduce/copy: each warp strides through the buffers in chunks
// of UNROLL*WARP_SIZE elements.
template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ __forceinline__ void ReduceCopyMulti(const int w, const int nw, const int t,
    int nsrcs, const T** s, int ndsts, T** d,
    const int elemOffset, const int Nelem) {
  const int inc = nw * UNROLL * WARP_SIZE;
  int offset = w * UNROLL * WARP_SIZE + t;

  const T* srcs[MAXSRCS];
  for (int i=0; i<MAXSRCS; i++) srcs[i] = s[i]+elemOffset+offset;
  T* dsts[MAXDSTS];
  for (int i=0; i<MAXDSTS; i++) dsts[i] = d[i]+elemOffset+offset;

  while (offset < Nelem) {
    T vals[UNROLL];
    // Load and reduce
    for (int u = 0; u < UNROLL; ++u) vals[u] = vFetch(srcs[0]+u*WARP_SIZE);

    for (int i=1; i<MINSRCS; i++) {
      T vals2[UNROLL];
      for (int u = 0; u < UNROLL; ++u) vals2[u] = vFetch(srcs[i]+u*WARP_SIZE);
      for (int u = 0; u < UNROLL; ++u) vals[u] = FUNC()(vals[u], vals2[u]);
    }
    for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) {
      T vals2[UNROLL];
      for (int u = 0; u < UNROLL; ++u) vals2[u] = vFetch(srcs[i]+u*WARP_SIZE);
      for (int u = 0; u < UNROLL; ++u) vals[u] = FUNC()(vals[u], vals2[u]);
    }

    // Store
    for (int i = 0; i < MINDSTS; i++) {
      for (int u = 0; u < UNROLL; ++u) vStore(dsts[i]+u*WARP_SIZE, vals[u]);
    }
    for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) {
      for (int u = 0; u < UNROLL; ++u) vStore(dsts[i]+u*WARP_SIZE, vals[u]);
    }
    for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
    for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
    offset += inc;
  }
}

// 128-bit reduce/copy: same striding scheme, but moving Pack128 (16B) values.
template<class FUNC, typename T, int UNROLL, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ void ReduceCopy128bMulti(const int w, const int nw, const int t,
    int nsrcs, const T** s, int ndsts, T** d,
    const int elemOffset, const int Npack) {
  const int inc = nw * UNROLL * WARP_SIZE;
  int offset = w * UNROLL * WARP_SIZE + t;

  const Pack128* srcs[MAXSRCS];
  for (int i=0; i<MAXSRCS; i++) srcs[i] = ((const Pack128*)(s[i]+elemOffset))+offset;
  Pack128* dsts[MAXDSTS];
  for (int i=0; i<MAXDSTS; i++) dsts[i] = ((Pack128*)(d[i]+elemOffset))+offset;

  while (offset < Npack) {
    Pack128 vals[UNROLL];
    // Load and reduce
    for (int u = 0; u < UNROLL; ++u) Fetch128(vals[u], srcs[0]+u*WARP_SIZE);

    for (int i=1; i<MINSRCS; i++) {
      Pack128 vals2[UNROLL];
      for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
      for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
    }
    for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) {
      Pack128 vals2[UNROLL];
      for (int u = 0; u < UNROLL; ++u) Fetch128(vals2[u], srcs[i]+u*WARP_SIZE);
      for (int u = 0; u < UNROLL; ++u) MULTI128<FUNC, T>()(vals[u], vals2[u]);
    }

    // Store
    for (int i = 0; i < MINDSTS; i++) {
      for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
    }
    for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) {
      for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]);
    }
    for (int i=0; i<MAXSRCS; i++) srcs[i] += inc;
    for (int i=0; i<MAXDSTS; i++) dsts[i] += inc;
    offset += inc;
  }
}

// Returns 0 iff ptr is 16B (Pack128) aligned.
template<typename T>
__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }

#define PACKELEMS (sizeof(Pack128) / sizeof(T))

#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
// Multiply UNROLL by 2 if single source/single destination
#define AUTOUNROLL (UNROLL*((MINSRCS==1 && MINDSTS==1) ? 2 : 1))
#else
#define AUTOUNROLL UNROLL
#endif
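// Worked example of the pack math above (illustrative numbers, not from the
// original source): for T=float, PACKELEMS = sizeof(Pack128)/sizeof(float) =
// 16/4 = 4 elements per pack. With UNROLL=4 and a single source and single
// destination on HIP, AUTOUNROLL = 4*2 = 8, so each 64-lane warp handles
// 8*64 = 512 packs (2048 floats) per iteration of the aligned 128-bit loop.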
template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ __forceinline__ void ReduceOrCopyMulti(const int tid, const int nthreads,
    int nsrcs, const T** srcs, int ndsts, T** dsts,
    int N) {
  int Nrem = N;
  if (Nrem <= 0) return;

  int w = tid / WARP_SIZE;       // Warp number
  int nw = nthreads / WARP_SIZE; // Number of warps
  int t = tid % WARP_SIZE;       // Thread (inside the warp)

  // Check that all is 16B aligned. If not don't use 16B load/stores.
  int align = 0;
  #pragma unroll
  for (int i=0; i<MINSRCS; i++) align |= ptrAlign128(srcs[i]);
  for (int i=MINSRCS; i<MAXSRCS && i<nsrcs; i++) align |= ptrAlign128(srcs[i]);
  #pragma unroll
  for (int i=0; i<MINDSTS; i++) align |= ptrAlign128(dsts[i]);
  for (int i=MINDSTS; i<MAXDSTS && i<ndsts; i++) align |= ptrAlign128(dsts[i]);

  int offset = 0;
  if (align == 0) {
    // fast path: use 128b loads/stores to do the bulk of the work,
    // assuming the pointers we have are all 128-bit aligned.

    // main loop
    int Npack = (Nrem / (PACKELEMS*AUTOUNROLL*WARP_SIZE)) * (AUTOUNROLL*WARP_SIZE); // round down
    int Nelem = Npack * PACKELEMS;

    ReduceCopy128bMulti<FUNC, T, AUTOUNROLL, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack);

    Nrem -= Nelem;
    if (Nrem == 0) return;
    offset += Nelem;

    // slightly less optimized section for when we don't have full unrolling
    Npack = Nrem / PACKELEMS;
    Nelem = Npack * PACKELEMS;

    ReduceCopy128bMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack);

    Nrem -= Nelem;
    if (Nrem == 0) return;
    offset += Nelem;
  }

  // unrolled, by-type (mostly for unaligned buffers)
  int Nelem = (Nrem / (UNROLL*PACKELEMS/2*WARP_SIZE)) * (UNROLL*PACKELEMS/2*WARP_SIZE); // round down

  ReduceCopyMulti<FUNC, T, UNROLL*PACKELEMS/2, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Nelem);

  Nrem -= Nelem;
  if (Nrem == 0) return;
  offset += Nelem;

  // no unroll, by type. Should finish what's remaining.
  ReduceCopyMulti<FUNC, T, 1, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Nrem);
}

#endif // NCCL_COMMON_KERNEL_H_
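// ---------------------------------------------------------------------------
// Usage sketch (hypothetical, for illustration; not part of the original
// header). It shows how a kernel could call ReduceOrCopyMulti to sum two
// float buffers into one output. The kernel name `exampleSumKernel` and all
// launch parameters are assumptions; real RCCL callers pass per-channel
// source/destination lists from the collective primitives. Define
// RCCL_COMMON_KERNEL_EXAMPLE to compile it, and launch with a block size
// that is a multiple of WARP_SIZE (64) so that nthreads/WARP_SIZE >= 1.
// ---------------------------------------------------------------------------
#ifdef RCCL_COMMON_KERNEL_EXAMPLE
__global__ void exampleSumKernel(const float* a, const float* b, float* out, int n) {
  const float* srcs[2] = { a, b }; // exactly two sources: MINSRCS == MAXSRCS == 2
  float* dsts[1] = { out };        // one destination:     MINDSTS == MAXDSTS == 1
  // UNROLL=4; FuncSum<float> reduces element-wise. All threads of the block
  // cooperate, so tid/nthreads are the block-local thread id and block size.
  ReduceOrCopyMulti<4, FuncSum<float>, float, 2, 2, 1, 1>(
      threadIdx.x, blockDim.x, 2, srcs, 1, dsts, n);
}
// Launched as a single block that covers the whole range, e.g.:
//   exampleSumKernel<<<1, 256>>>(a, b, out, n);
#endif // RCCL_COMMON_KERNEL_EXAMPLE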