Commit 7dc4e964 authored by wanghan

Initial commit: RCCL auto-tuning project
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef OP128_H_
#define OP128_H_
#include <type_traits>
inline __device__ void load128(const uint64_t* ptr, uint64_t &v0, uint64_t &v1) {
v0 = __builtin_nontemporal_load(ptr);
v1 = __builtin_nontemporal_load(ptr+1);
}
inline __device__ void store128(uint64_t* ptr, uint64_t v0, uint64_t v1) {
__builtin_nontemporal_store(v0, ptr);
__builtin_nontemporal_store(v1, ptr+1);
}
inline __device__ uint64_t* shmemCvtPtr(volatile uint64_t* shmemGenericPtr) {
return (uint64_t*)shmemGenericPtr;
}
inline __device__ void loadShmem128(uint64_t* shmemAsmPtr, uint64_t &v0, uint64_t &v1) {
v0 = *(shmemAsmPtr);
v1 = *(shmemAsmPtr+1);
}
inline __device__ void storeShmem128(uint64_t* shmemAsmPtr, uint64_t v0, uint64_t v1) {
*(shmemAsmPtr) = v0;
*(shmemAsmPtr+1) = v1;
}
template<typename T>
inline __device__ void loadShmemMisaligned128(T *ptr, uint64_t &v0, uint64_t &v1) {
union {
uint32_t tmp4[4];
uint64_t tmp8[2];
};
if(sizeof(T) < 4) {
uint32_t *ptr4 = reinterpret_cast<uint32_t*>(reinterpret_cast<uintptr_t>(ptr) & -uintptr_t(4));
#pragma unroll
for(int e=0; e < 4; e++) {
// Produce 4 bytes of the sub-register type by reading two 4-byte-aligned
// values and funnel-shifting them into place.
uint32_t lo, hi;
lo = __builtin_nontemporal_load(ptr4+e+0);
hi = __builtin_nontemporal_load(ptr4+e+1);
tmp4[e] = __funnelshift_r(lo, hi, 8*(int(reinterpret_cast<uintptr_t>(ptr))%4));
}
}
else if(sizeof(T) == 4) {
#pragma unroll
for(int e=0; e < 4; e++)
tmp4[e] = __builtin_nontemporal_load(ptr+e);
}
else /*sizeof(T)==8*/ {
#pragma unroll
for(int e=0; e < 2; e++)
tmp8[e] = __builtin_nontemporal_load(ptr+e);
}
v0 = tmp8[0];
v1 = tmp8[1];
}
template<typename T>
__device__ __forceinline__ uint32_t cvta_to_shared(T* ptr) {
return (uint32_t)(uint64_t)(ptr);
}
template<typename T>
__device__ __forceinline__ uintptr_t cvta_to_global(T* ptr) {
return (uintptr_t)(ptr);
}
template<typename T>
__device__ __forceinline__ T* cvta_from_shared(uint32_t shptr) {
return (T*)shptr;
}
template<typename T>
__device__ __forceinline__ T* cvta_from_global(uintptr_t gptr) {
return (T*)gptr;
}
////////////////////////////////////////////////////////////////////////////////
// BytePack<Size>: struct of bytes.
template<int Size>
union BytePack;
template<>
union BytePack<0> {};
template<>
union BytePack<1> {
uint8_t u8, native;
};
template<>
union BytePack<2> {
BytePack<1> half[2];
uint8_t u8[2];
uint16_t u16, native;
};
template<>
union BytePack<4> {
BytePack<2> half[2];
uint8_t u8[4];
uint16_t u16[2];
uint32_t u32, native;
};
template<>
union BytePack<8> {
BytePack<4> half[2];
uint8_t u8[8];
uint16_t u16[4];
uint32_t u32[2];
uint64_t u64, native;
};
template<>
union alignas(16) BytePack<16> {
BytePack<8> half[2];
uint8_t u8[16];
uint16_t u16[8];
uint32_t u32[4];
uint64_t u64[2];
ulong2 ul2, native;
#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
inline __device__ BytePack<16>& operator=(BytePack<16> other) {
u64[0] = other.u64[0];
u64[1] = other.u64[1];
return *this;
}
#endif
};
template<typename T>
struct BytePackOf {
static constexpr int Size = sizeof(T);
using Pack = BytePack<Size>;
};
template<>
struct BytePackOf<BytePack<0>> {
static constexpr int Size = 0;
using Pack = BytePack<0>;
};
template<typename T>
__device__ __forceinline__ typename BytePackOf<T>::Pack toPack(T value) {
union { typename BytePackOf<T>::Pack p; T v; };
v = value;
return p;
}
template<typename T>
__device__ __forceinline__ T fromPack(typename BytePackOf<T>::Pack pack) {
union { typename BytePackOf<T>::Pack p; T v; };
p = pack;
return v;
}
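// Usage sketch (illustrative helper, not referenced elsewhere): toPack/fromPack
// reinterpret a value's bytes without changing them, so a round trip through the
// pack representation reconstructs the original value.
template<typename T>
__device__ __forceinline__ T packRoundTrip(T x) {
  typename BytePackOf<T>::Pack p = toPack<T>(x);  // p holds the raw bytes of x
  return fromPack<T>(p);                          // rebuilds the original value from those bytes
}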
////////////////////////////////////////////////////////////////////////////////
// Load/store of BytePack<?> using integral addresses.
template<int Size> __device__ BytePack<Size> ld_global(uintptr_t addr);
template<int Size> __device__ BytePack<Size> ld_volatile_global(uintptr_t addr);
//template<int Size> __device__ BytePack<Size> ld_shared(uint32_t addr);
//template<int Size> __device__ BytePack<Size> ld_volatile_shared(uint32_t addr);
template<int Size> __device__ void st_global(uintptr_t addr, BytePack<Size> value);
//template<int Size> __device__ void st_shared(uint32_t addr, BytePack<Size> value);
template<> __device__ __forceinline__ BytePack<0> ld_global<0>(uintptr_t addr) { return {}; }
template<> __device__ __forceinline__ BytePack<0> ld_volatile_global<0>(uintptr_t addr) { return {}; }
//template<> __device__ __forceinline__ BytePack<0> ld_shared<0>(uint32_t addr) { return {}; }
//template<> __device__ __forceinline__ BytePack<0> ld_volatile_shared<0>(uint32_t addr) { return {}; }
template<> __device__ __forceinline__ void st_global<0>(uintptr_t addr, BytePack<0> value) {}
//template<> __device__ __forceinline__ void st_shared<0>(uint32_t addr, BytePack<0> value) {}
// Used to define implementations for above prototypes.
#define DEFINE_ld_st(bytes, data_cxx_ty, data_ptx_ty, data_reg_ty, space, addr_cxx_ty, addr_reg_ty) \
template<> \
__device__ __forceinline__ BytePack<bytes> ld_##space<bytes>(addr_cxx_ty addr) { \
data_cxx_ty tmp; \
tmp = *((data_cxx_ty *)addr); \
BytePack<bytes> ans; \
ans.native = tmp; \
return ans; \
} \
template<> \
__device__ __forceinline__ BytePack<bytes> ld_volatile_##space<bytes>(addr_cxx_ty addr) { \
data_cxx_ty tmp; \
tmp = __builtin_nontemporal_load((data_cxx_ty *)addr); \
BytePack<bytes> ans; \
ans.native = tmp; \
return ans; \
} \
template<> \
__device__ __forceinline__ void st_##space<bytes>(addr_cxx_ty addr, BytePack<bytes> value) { \
data_cxx_ty tmp = value.native; \
*((data_cxx_ty *)addr) = tmp; \
}
// Single-byte types use 4-byte registers since there is no 1-byte register
// character for asm blocks. See https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html#constraints
DEFINE_ld_st(1, uint8_t, b8, r, global, uintptr_t, l)
//DEFINE_ld_st(1, uint32_t, b8, r, shared, uint32_t, r)
DEFINE_ld_st(2, uint16_t, b16, h, global, uintptr_t, l)
//DEFINE_ld_st(2, uint16_t, b16, h, shared, uint32_t, r)
DEFINE_ld_st(4, uint32_t, b32, r, global, uintptr_t, l)
//DEFINE_ld_st(4, uint32_t, b32, r, shared, uint32_t, r)
DEFINE_ld_st(8, uint64_t, b64, l, global, uintptr_t, l)
//DEFINE_ld_st(8, uint64_t, b64, l, shared, uint32_t, r)
#undef DEFINE_ld_st
#define DEFINE_ld_st_16(space, addr_cxx_ty, addr_reg_ty) \
template<> \
__device__ __forceinline__ BytePack<16> ld_##space<16>(addr_cxx_ty addr) { \
BytePack<16> ans; \
ans.u64[0] = *((uint64_t*)addr); \
ans.u64[1] = *((uint64_t*)addr+1); \
return ans; \
} \
template<> \
__device__ __forceinline__ BytePack<16> ld_volatile_##space<16>(addr_cxx_ty addr) { \
BytePack<16> ans; \
ans.u64[0] = __builtin_nontemporal_load((uint64_t*)addr); \
ans.u64[1] = __builtin_nontemporal_load((uint64_t*)addr+1); \
return ans; \
} \
template<> \
__device__ __forceinline__ void st_##space<16>(addr_cxx_ty addr, BytePack<16> value) { \
*((uint64_t*)addr) = value.u64[0]; \
*((uint64_t*)addr+1) = value.u64[1]; \
}
DEFINE_ld_st_16(global, uintptr_t, l)
//DEFINE_ld_st_16(shared, uint32_t, r)
#undef DEFINE_ld_st_16
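// Usage sketch (illustrative helper): the integral-address load/store functions are
// meant to be fed addresses produced by cvta_to_global, e.g. a 16-byte global copy:
template<typename T>
__device__ __forceinline__ void copy16Global(T *dst, T const *src) {
  BytePack<16> tmp = ld_global<16>(cvta_to_global(src));  // load 16 bytes from src
  st_global<16>(cvta_to_global(dst), tmp);                // store them to dst
}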
////////////////////////////////////////////////////////////////////////////////
// Atomic load/store using c++ pointers.
__device__ __forceinline__ uint64_t ld_volatile_global(uint64_t *ptr) {
uint64_t ans;
ans = __builtin_nontemporal_load(ptr);
return ans;
}
__device__ __forceinline__ uint64_t ld_relaxed_sys_global(uint64_t *ptr) {
uint64_t ans;
ans = __builtin_nontemporal_load(ptr);
return ans;
}
__device__ __forceinline__ uint64_t ld_acquire_sys_global(uint64_t *ptr) {
uint64_t ans;
ans = __atomic_load_n(ptr, __ATOMIC_SEQ_CST);
return ans;
}
__device__ __forceinline__ void st_volatile_global(uint64_t *ptr, uint64_t val) {
__builtin_nontemporal_store(val, ptr);
}
__device__ __forceinline__ void st_relaxed_sys_global(uint64_t *ptr, uint64_t val) {
__builtin_nontemporal_store(val, ptr);
}
__device__ __forceinline__ void st_release_sys_global(uint64_t *ptr, uint64_t val) {
__atomic_store_n(ptr, val, __ATOMIC_SEQ_CST);
}
__device__ __forceinline__ void fence_acq_rel_sys() {
//asm volatile("membar.sys;" ::: "memory");
}
__device__ __forceinline__ void fence_acq_rel_gpu() {
//asm volatile("membar.gl;" ::: "memory");
}
////////////////////////////////////////////////////////////////////////////////
// Multimem stores of BytePack<?>.
template<int Size>
__device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack<Size> val);
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
template<>
__device__ __forceinline__ void multimem_st_global<0>(uintptr_t addr, BytePack<0> val) {
// nop
}
template<>
__device__ __forceinline__ void multimem_st_global<1>(uintptr_t addr, BytePack<1> val) {
asm volatile("st.global.b8 [%0], %1;" :: "l"(addr), "r"((uint32_t)val.u8) : "memory");
}
template<>
__device__ __forceinline__ void multimem_st_global<2>(uintptr_t addr, BytePack<2> val) {
asm volatile("st.global.b16 [%0], %1;" :: "l"(addr), "h"(val.u16) : "memory");
}
template<>
__device__ __forceinline__ void multimem_st_global<4>(uintptr_t addr, BytePack<4> val) {
asm volatile("multimem.st.global.b32 [%0], %1;" :: "l"(addr), "r"(val.u32) : "memory");
}
template<>
__device__ __forceinline__ void multimem_st_global<8>(uintptr_t addr, BytePack<8> val) {
asm volatile("multimem.st.global.b64 [%0], %1;" :: "l"(addr), "l"(val.u64) : "memory");
}
template<>
__device__ __forceinline__ void multimem_st_global<16>(uintptr_t addr, BytePack<16> val) {
asm volatile("multimem.st.global.v4.f32 [%0], {%1,%2,%3,%4};"
:: "l"(addr), "r"(val.u32[0]), "r"(val.u32[1]), "r"(val.u32[2]), "r"(val.u32[3])
: "memory");
}
#else
template<int Size>
__device__ __forceinline__ void multimem_st_global(uintptr_t addr, BytePack<Size> val) {
// nop
}
#endif
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
// Warp-uniform memory copy from shared address (not generic) to global memory.
// The number of bytes copied is `min(MaxBytes, nBytesAhead)`; a negative value
// is interpreted as zero. EltSize is the guaranteed alignment of the addresses and sizes.
template<int EltSize, int MaxBytes, bool Multimem, typename IntBytes>
__device__ __forceinline__ void copyGlobalShared_WarpUnrolled(
int lane, uintptr_t dstAddr, uint32_t srcAddr, IntBytes nBytesAhead
) {
static_assert(std::is_signed<IntBytes>::value, "`IntBytes` must be a signed integral type.");
int nBytes = min(nBytesAhead, (IntBytes)MaxBytes);
int nFrontBytes = min(nBytes, (16 - int(dstAddr%16))%16);
int nMiddleBytes = (nBytes-nFrontBytes) & -16;
int nBackBytes = (nBytes-nFrontBytes) % 16;
{ int backLane = WARP_SIZE-1 - lane;
bool hasFront = lane*EltSize < nFrontBytes;
bool hasBack = backLane*EltSize < nBackBytes;
int offset = hasFront ? lane*EltSize : (nBytes - (backLane+1)*EltSize);
if (hasFront | hasBack) {
BytePack<EltSize> tmp = ld_shared<EltSize>(srcAddr+offset);
// Can't use multimem_st since it doesn't support EltSize==2
st_global<EltSize>(dstAddr+offset, tmp);
}
}
srcAddr += nFrontBytes;
int srcMisalign = EltSize < 4 ? (srcAddr%4) : 0;
srcAddr += -srcMisalign + lane*16;
dstAddr += nFrontBytes + lane*16;
nMiddleBytes -= lane*16;
#pragma unroll
for (int u=0; u < divUp(MaxBytes, WARP_SIZE*16); u++) {
if (nMiddleBytes <= 0) break;
union {
BytePack<4> b4[4];
BytePack<16> b16;
};
b4[0] = ld_shared<4>(srcAddr + 0*4);
b4[1] = ld_shared<4>(srcAddr + 1*4);
b4[2] = ld_shared<4>(srcAddr + 2*4);
b4[3] = ld_shared<4>(srcAddr + 3*4);
if (srcMisalign != 0) {
BytePack<4> b4_4 = ld_shared<4>(srcAddr + 4*4);
b4[0].u32 = __funnelshift_r(b4[0].u32, b4[1].u32, srcMisalign*8);
b4[1].u32 = __funnelshift_r(b4[1].u32, b4[2].u32, srcMisalign*8);
b4[2].u32 = __funnelshift_r(b4[2].u32, b4[3].u32, srcMisalign*8);
b4[3].u32 = __funnelshift_r(b4[3].u32, b4_4.u32, srcMisalign*8);
}
if (Multimem) multimem_st_global<16>(dstAddr, b16);
else st_global<16>(dstAddr, b16);
srcAddr += WARP_SIZE*16;
dstAddr += WARP_SIZE*16;
nMiddleBytes -= WARP_SIZE*16;
}
}
#else
template<int EltSize, int MaxBytes, bool Multimem, typename IntBytes>
__device__ __forceinline__ void copyGlobalShared_WarpUnrolled(
int lane, uintptr_t dstAddr, uint32_t srcAddr, IntBytes nBytesAhead
) {
// nop
}
#endif
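// Worked example for the CUDA path above (hypothetical numbers): with EltSize=4,
// MaxBytes >= 100, dstAddr%16 == 12 and nBytesAhead == 100, the copy splits into
//   nFrontBytes  = min(100, (16-12)%16) = 4    (brings dstAddr up to 16-byte alignment)
//   nMiddleBytes = (100-4) & -16        = 96   (moved as 16-byte packs, one per lane per trip)
//   nBackBytes   = (100-4) % 16         = 0
// Front and back bytes are moved one EltSize element per lane; the middle uses 16-byte
// packs, via multimem stores when Multimem is set.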
#endif
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_PRIMITIVES_H_
#define NCCL_PRIMITIVES_H_
#include <type_traits>
#include "reduce_kernel.h" // for reduction funcs
#include "common_kernel.h"
#include "common.h"
#define NCCL_SPINS_BEFORE_CHECK_ABORT 1000000
#define barrier_by_group() do { \
if (nthreads == NCCL_MAX_NTHREADS) { \
__asm__ __volatile__("s_waitcnt vmcnt(0) lgkmcnt(0)\ns_barrier\ns_waitcnt lgkmcnt(0)"); \
} else { \
const int w = threadIdx.x/WARP_SIZE; \
const int wid = threadIdx.x%WARP_SIZE; \
__threadfence(); \
if (wid == 0) { \
barrier_next[w] += nthreads/WARP_SIZE; \
atomicAdd((unsigned long long *)barriers, 1); \
while (atomicAdd((unsigned long long *)barriers, 0) < barrier_next[w]) __builtin_amdgcn_s_sleep(1); \
__asm__ __volatile__("s_wakeup"); \
} \
} \
} while (0)
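// barrier_by_group() has two paths: when the whole block participates
// (nthreads == NCCL_MAX_NTHREADS) a single hardware s_barrier suffices; otherwise a
// software barrier is used in which lane 0 of each wavefront bumps the shared
// `barriers` counter and spins (with s_sleep) until all participating wavefronts of
// the group have arrived.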
/* Protocol classes: ProtoSimple, ProtoLL, ProtoLL128
* We use these as template args to the Primitives class instead of integral
* enums (e.g. NCCL_PROTO_LL) because for SIMPLE we need to carry a few extra
* numbers. These types also hold methods which expose, through a consistent
* interface, the numbers that matter to how each protocol operates, so the
* algorithm code can remain protocol-parametric.
*/
template<int SlicePerChunk_1, int StepPerSlice_1, int Unroll_1 = COLL_UNROLL, int MultimemSrcs_1 = 0, int MultimemDsts_1 = 0>
struct ProtoSimple {
static constexpr int Id = NCCL_PROTO_SIMPLE;
static constexpr int SlicePerChunk = SlicePerChunk_1;
static constexpr int StepPerSlice = StepPerSlice_1;
static constexpr int Unroll = Unroll_1;
static constexpr int MultimemSrcs = MultimemSrcs_1;
static constexpr int MultimemDsts = MultimemDsts_1;
// Data bytes (no flags etc) in one step of the fifo queue.
__device__ static int calcBytePerStep() {
return ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS;
}
// Granularity of data bytes transferred per thread.
__device__ static int calcBytePerGrain() {
return sizeof(uint64_t); // Bogus value? Nobody queries this metric for simple.
}
// Group width is how many consecutive group values a subchannel occupies.
static constexpr int MaxGroupWidth = 1;
};
struct ProtoLL {
static constexpr int Id = NCCL_PROTO_LL;
// Data bytes (no flags etc) in one step of the fifo queue.
__device__ static int calcBytePerStep() {
return ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/2; // Half is data
}
// Granularity of data bytes transferred per thread.
__device__ static int calcBytePerGrain() {
return sizeof(uint64_t); // One 16-byte line has 8 bytes of data
}
// Group width is how many consecutive group values a subchannel occupies.
static constexpr int MaxGroupWidth = 1;
};
struct ProtoLL128 {
static constexpr int Id = NCCL_PROTO_LL128;
// Data bytes (no flags etc) in one step of the fifo queue.
__device__ static int calcBytePerStep() {
return (ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS)*NCCL_LL128_DATAELEMS/NCCL_LL128_LINEELEMS;
}
// Granularity of data bytes transferred per thread.
__device__ static int calcBytePerGrain() {
return NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_DATAELEMS*sizeof(uint64_t)/NCCL_LL128_LINEELEMS;
}
// Group width is how many consecutive group values a subchannel occupies.
static constexpr int MaxGroupWidth = 1;
};
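// Usage sketch (hypothetical helper): algorithm code stays protocol-parametric by
// querying these classes through a template parameter; the same helper works for
// ProtoSimple<...>, ProtoLL and ProtoLL128 alike.
template<typename Proto>
__device__ __forceinline__ int protoStepBytes() {
  return Proto::calcBytePerStep();  // data bytes per fifo step for this protocol
}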
/* Fan (as in fan-in & fan-out) classes hold recv and send counts. The template
* arguments are static bounds on the maximum values. Asymmetric counts are
* independent. Symmetric is a static guarantee that nrecv==nsend, so it only
* stores one value at runtime. This optimization saves a 32-bit register but,
* more importantly, uses fewer predicate registers when unrolling loops.
*/
template<int MaxRecv_, int MaxSend_>
struct FanAsymmetric {
static constexpr int MaxRecv = MaxRecv_, MaxSend = MaxSend_;
int nr, ns;
FanAsymmetric() = default;
__device__ FanAsymmetric(int nrecv, int nsend): nr(nrecv), ns(nsend) {
// assert(nrecv <= MaxRecv && nsend <= MaxSend);
}
__device__ int nrecv() const { return MaxRecv ? nr : 0; }
__device__ int nsend() const { return MaxSend ? ns : 0; }
};
template<int MaxArity>
struct FanSymmetric {
static constexpr int MaxRecv = MaxArity, MaxSend = MaxArity;
int n;
FanSymmetric() = default;
__device__ FanSymmetric(int nrecv, int nsend): n(nrecv) {
// assert(nrecv == nsend && nrecv <= MaxArity);
}
__device__ int nrecv() const { return n; }
__device__ int nsend() const { return n; }
};
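// Usage note (illustrative): a ring algorithm always has exactly one receive peer and
// one send peer, so FanSymmetric<1> suffices; a tree with several children and one
// parent would use FanAsymmetric<MaxChildren, 1>, with MaxChildren being whatever
// static bound the algorithm guarantees.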
// The primitives class. Specialized per protocol in the other headers.
template<typename T, typename RedOp, typename Fan, int Direct, typename Proto, int P2p>
class Primitives;
// Used by LL & LL128 to implement direct members in the naive way.
template<typename RealPrimitives>
struct PrimitivesWithoutDirect {
__device__ void directSend(intptr_t inpIx, intptr_t outIx, int eltN) {
static_cast<RealPrimitives*>(this)->send(inpIx, eltN);
}
__device__ void directSendFromOutput(intptr_t outIx, int eltN) {
static_cast<RealPrimitives*>(this)->sendFromOutput(outIx, eltN);
}
__device__ void directRecv(intptr_t outIx, int eltN) {
static_cast<RealPrimitives*>(this)->recv(outIx, eltN, /*postOp=*/false);
}
__device__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
static_cast<RealPrimitives*>(this)->copySend(inpIx, outIx, eltN, postOp);
}
__device__ void directRecvCopySend(intptr_t outIx, int eltN) {
static_cast<RealPrimitives*>(this)->recvCopySend(outIx, eltN, /*postOp=*/false);
}
__device__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
// Direct is only for the send part
static_cast<RealPrimitives*>(this)->recvReduceCopySend(inpIx, outIx, eltN, postOp);
}
};
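// The direct* wrappers above rely on CRTP: each protocol specialization inherits as
//   class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>
//     : public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>>
// so e.g. directSend() simply forwards to the derived class's send().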
#include "prims_simple.h"
#include "prims_ll.h"
#include "prims_ll128.h"
#endif
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#if defined(ENABLE_NPKIT)
#include "npkit/npkit.h"
#endif
#ifdef __GFX11__
#define LL_STORE(SRC, DST) \
__atomic_store_n((DST), (SRC), __ATOMIC_RELAXED)
#define LL_LOAD(SRC) \
__atomic_load_n(SRC, __ATOMIC_RELAXED)
#else
#define LL_STORE(SRC, DST) \
__builtin_nontemporal_store((SRC), (DST))
#define LL_LOAD(SRC) \
__builtin_nontemporal_load(SRC)
#endif
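// LL_LOAD/LL_STORE move one 8-byte half of an ncclLLFifoLine at a time: on gfx11 they
// expand to relaxed C++ atomics, elsewhere to nontemporal load/store builtins.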
template<typename T, typename RedOp, typename Fan, int Direct, int P2p>
class Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>:
public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL, P2p>> {
// When Fan::MaxRecv == 0 we still need to force MaxRecv to 1 for this to compile,
// because the recv buffers below are declared with MaxRecv entries even in send-only cases.
static constexpr int MaxRecv = Fan::MaxRecv > 1 ? Fan::MaxRecv : 1;
static constexpr int MaxSend = Fan::MaxSend;
static constexpr int Input=0, Output=1;
RedOp redOp;
const int tid;
const int nthreads;
const int wid;
const int group;
const int stepLines;
Fan fan;
T *userBufs[2];
struct ncclConnInfo* recvConn = NULL;
volatile uint64_t* recvConnHeadPtr = NULL;
uint64_t recvConnHead;
struct ncclConnInfo* sendConn = NULL;
volatile int* sendConnFifoPtr = NULL;
volatile uint64_t* sendConnHeadPtr = NULL;
uint64_t sendConnHead;
uint64_t sendConnHeadCache; // Cache last seen value
uint64_t recvStep[MaxRecv];
uint64_t sendStep[MaxSend];
union ncclLLFifoLine* recvBuff[MaxRecv];
union ncclLLFifoLine* sendBuff[MaxSend];
#ifdef HYGON_SDMA_FEATURE
public:
uint32_t ringIx;
uint32_t useSdmaCopy;
uint32_t sdmaMinCopySize;
uint32_t sdmaCountEnabe;
uint32_t sdmaCopyCount;
uint32_t allCopyCount;
private:
#endif
#if defined(ENABLE_NPKIT)
public:
int npKitCtxIdx = 0;
uint64_t npKitDataProcessEntryTime = 0;
uint64_t npKitDataProcessExitTime = 0;
uint64_t npKitDataProcessTotalTime = 0;
private:
#endif
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
uint64_t npKitWaitRecvDataProcessSize = 0;
uint64_t npKitWaitRecvEntryTime = 0;
uint64_t npKitWaitRecvExitTime = 0;
uint64_t npKitWaitRecvTotalTime = 0;
#endif
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepLines; }
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepLines; }
inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
inline __device__ uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); }
inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); }
uint64_t* barriers;
uint64_t* barrier_next;
inline __device__ void barrier() {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
if (nthreads != WARP_SIZE)
barrier_by_group();
#else
asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group));
#endif
}
uint32_t abort = 0;
inline __device__ int checkAbort(int &spins, int send) {
spins++;
if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
abort = __atomic_load_n((ncclShmem.comm.abortFlag), __ATOMIC_SEQ_CST);
spins = 0;
}
return abort;
}
inline __device__ void waitSend(int nbytes) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY, nbytes, 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
if (sendConnHeadPtr) {
int spins = 0;
while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) {
__builtin_amdgcn_s_sleep(1);
sendConnHeadCache = atomicAdd((unsigned long long *)sendConnHeadPtr, 0);
if (checkAbort(spins, 1)) break;
}
__asm__ __volatile__("s_wakeup");
if (sendConnFifoPtr) {
int size = ((sendConnHead & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? stepLines*sizeof(union ncclLLFifoLine) : nbytes;
__atomic_store_n(sendConnFifoPtr+sendConnHead%NCCL_STEPS, (size), __ATOMIC_SEQ_CST);
}
sendConnHead += 1;
}
barrier();
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT, nbytes, 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
inline __device__ void incRecv(int i) {
recvStep[i] += 1;
}
inline __device__ void postRecv() {
barrier();
if (recvConnHeadPtr) STORE(recvConnHeadPtr, recvConnHead += 1);
}
inline __device__ void incSend(int i, int offset) {
// LL cleanup: write all flags in the slice to make sure we don't get
// data corruption when the flag wraps around.
if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) {
for (int o = offset; o<stepLines; o+=nthreads) storeLL(sendPtr(i)+o, 0, sendFlag(i));
}
sendStep[i]++;
}
__device__ uint64_t readLL(int offset, int i) {
union ncclLLFifoLine* src = recvPtr(i) + offset;
uint32_t flag = recvFlag(i);
uint32_t data1, flag1, data2, flag2;
int spins = 0;
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
int npkitWaitRecvSpins = 0;
if (tid == 0) {
npKitWaitRecvEntryTime = NPKIT_GET_GPU_TIMESTAMP();
}
#endif
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
union ncclLLFifoLine i4;
do {
i4.v[0] = LL_LOAD(src->v);
i4.v[1] = LL_LOAD(src->v+1);
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
npkitWaitRecvSpins++;
#endif
if (checkAbort(spins, 0)) break;
} while ((i4.flag1 != flag) || (i4.flag2 != flag));
uint64_t val64 = (uint64_t)(i4.data1) + (((uint64_t)i4.data2) << 32);
#else
do {
asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4));
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
npkitWaitRecvSpins++;
#endif
if (checkAbort(spins, 0)) break;
} while ((flag1 != flag) || (flag2 != flag));
uint64_t val64 = data1 + (((uint64_t)data2) << 32);
#endif
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
if (tid == 0) {
npKitWaitRecvExitTime = NPKIT_GET_GPU_TIMESTAMP();
npKitWaitRecvTotalTime += (npKitWaitRecvExitTime - npKitWaitRecvEntryTime) * (npkitWaitRecvSpins - 1) / npkitWaitRecvSpins;
}
#endif
return val64;
}
template<int BeginIx>
__device__ void readLLBeginAll(int offset, ncclLLFifoLine(&line)[MaxRecv]) {
#pragma unroll
for (int i=BeginIx; i < MaxRecv; i++) {
if (i < fan.nrecv()) {
union ncclLLFifoLine* src = recvPtr(i) + offset;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
line[i].v[0] = LL_LOAD(src->v);
line[i].v[1] = LL_LOAD(src->v+1);
#else
asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4));
#endif
}
}
}
__device__ uint64_t readLLFinish(int offset, ncclLLFifoLine(&line)[MaxRecv], int i) {
union ncclLLFifoLine* src = recvPtr(i) + offset;
uint32_t flag = recvFlag(i);
int spins = 0;
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
int npkitWaitRecvSpins = 0;
if (tid == 0) {
npKitWaitRecvEntryTime = NPKIT_GET_GPU_TIMESTAMP();
}
#endif
do {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
line[i].v[0] = LL_LOAD(src->v);
line[i].v[1] = LL_LOAD(src->v+1);
#else
asm("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(line[i].data1), "=r"(line[i].flag1), "=r"(line[i].data2), "=r"(line[i].flag2) : "l"(&src->i4));
#endif
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
npkitWaitRecvSpins++;
#endif
if (checkAbort(spins, 0)) break;
} while(line[i].flag1 != flag || line[i].flag2 != flag);
uint64_t val64 = line[i].data1 + (((uint64_t)line[i].data2) << 32);
#if defined(ENABLE_NPKIT) && (defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT) || defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME))
if (tid == 0) {
npKitWaitRecvExitTime = NPKIT_GET_GPU_TIMESTAMP();
npKitWaitRecvTotalTime += (npKitWaitRecvExitTime - npKitWaitRecvEntryTime) * (npkitWaitRecvSpins - 1) / npkitWaitRecvSpins;
}
#endif
return val64;
}
__device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
union ncclLLFifoLine i4;
i4.data1 = val & 0xffffffff;
i4.flag1 = flag;
i4.data2 = (val >> 32);
i4.flag2 = flag;
LL_STORE(i4.v[0], dst->v);
LL_STORE(i4.v[1], dst->v+1);
#else
asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag));
#endif
}
static constexpr int EltPerLine = sizeof(uint64_t)/sizeof(T);
template<typename U>
__device__ static U load(U *src) {
union {
U elt;
uint8_t u1;
uint16_t u2;
uint32_t u4;
uint64_t u8;
};
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
if(sizeof(U) == 1)
u1 = LL_LOAD((uint8_t*)src);
else if(sizeof(U) == 2)
u2 = LL_LOAD((uint16_t*)src);
else if(sizeof(U) == 4)
u4 = LL_LOAD((uint32_t*)src);
else
u8 = LL_LOAD((uint64_t*)src);
#else
if(sizeof(U) == 1)
asm("ld.volatile.global.b8 %0,[%1];" : "=r"(u4) : "l"(src));
else if(sizeof(U) == 2)
asm("ld.volatile.global.b16 %0,[%1];" : "=h"(u2) : "l"(src));
else if(sizeof(U) == 4)
asm("ld.volatile.global.b32 %0,[%1];" : "=r"(u4) : "l"(src));
else
asm("ld.volatile.global.b64 %0,[%1];" : "=l"(u8) : "l"(src));
#endif
return elt;
}
template<typename U>
__device__ static void store(U *dst, U val) {
union {
U elt;
uint8_t u1;
uint16_t u2;
uint32_t u4;
uint64_t u8;
};
elt = val;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
if(sizeof(U) == 1)
LL_STORE(u1, (uint8_t*)dst);
else if(sizeof(U) == 2)
LL_STORE(u2, (uint16_t*)dst);
else if(sizeof(U) == 4)
LL_STORE(u4, (uint32_t*)dst);
else
LL_STORE(u8, (uint64_t*)dst);
#else
if(sizeof(U) == 1)
asm("st.volatile.global.b8 [%0],%1;" :: "l"(dst), "r"(u4));
else if(sizeof(U) == 2)
asm("st.volatile.global.b16 [%0],%1;" :: "l"(dst), "h"(u2));
else if(sizeof(U) == 4)
asm("st.volatile.global.b32 [%0],%1;" :: "l"(dst), "r"(u4));
else
asm("st.volatile.global.b64 [%0],%1;" :: "l"(dst), "l"(u8));
#endif
}
struct DataLoader {
int misalign;
union {
uint32_t u4[sizeof(T) <= 2 ? 3 : 2];
uint64_t u8;
T elt[EltPerLine];
};
__device__ void loadBegin(T *src, int eltN) {
if (sizeof(T) <= 2) {
misalign = reinterpret_cast<uintptr_t>(src)%4;
uint32_t *p = reinterpret_cast<uint32_t*>(reinterpret_cast<uintptr_t>(src) & -uintptr_t(4));
u4[0] = load(p+0);
u4[1] = misalign + eltN*sizeof(T) > 4 ? load(p+1) : 0;
// u4[2] would be simpler, but that throws warnings on some compilers
u4[sizeof(T) <= 2 ? 2 : 0] = misalign + eltN*sizeof(T) > 8 ? load(p+2) : 0;
}
else {
#pragma unroll
for(int i=0; i < EltPerLine; i++) {
if(i==0 || i < eltN)
elt[i] = load(src + i);
}
}
}
__device__ uint64_t loadFinish() {
if (sizeof(T) <= 2) {
u4[0] = __funnelshift_r(u4[0], u4[1], 8*misalign);
// u4[2] would be simpler, but that throws warnings on some compilers
u4[1] = __funnelshift_r(u4[1], u4[sizeof(T) <= 2 ? 2 : 0], 8*misalign);
}
return u8;
}
};
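// Example of the sub-4-byte path above (illustrative numbers): for T = uint16_t with
// src misaligned by 2 bytes and eltN = 4, loadBegin() reads the three 4-byte-aligned
// words covering src[0..3], and loadFinish() funnel-shifts each adjacent pair right by
// 8*misalign = 16 bits, leaving u8 holding exactly the 8 data bytes starting at src.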
__device__ void storeData(T *dst, uint64_t val, int eltN) {
union {
uint64_t u8;
T elt[EltPerLine];
};
u8 = val;
#pragma unroll
for(int i=0; i < EltPerLine; i++) {
if (i==0 || i < eltN)
//store(dst+i, elt[i]);
dst[i] = elt[i];
}
}
__device__ void mscclStoreData(T *dst, uint64_t val, int eltN) {
union {
uint64_t u8;
T elt[EltPerLine];
};
u8 = val;
#pragma unroll
for(int i=0; i < EltPerLine; i++) {
if (i==0 || i < eltN)
store(dst+i, elt[i]);
// dst[i] = elt[i];
}
}
template <int RECV, int SEND, int SrcBuf, int DstBuf>
__device__ void LLGenericOp(intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp) {
constexpr int SRC = SrcBuf != -1 ? 1 : 0;
constexpr int DST = DstBuf != -1 ? 1 : 0;
T *srcElts = SrcBuf == -1 ? nullptr : userBufs[SrcBuf] + srcIx;
T *dstElts = DstBuf == -1 ? nullptr : userBufs[DstBuf] + dstIx;
// Always waitSend in case of cleanup
nelem = nelem < 0 ? 0 : nelem;
if (SEND) waitSend(divUp(nelem, EltPerLine)*sizeof(ncclLLFifoLine));
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT)
if (tid == 0) {
npKitWaitRecvTotalTime = 0;
npKitWaitRecvDataProcessSize = nelem*sizeof(T);
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY,
npKitWaitRecvDataProcessSize, 0, NPKIT_GET_GPU_TIMESTAMP(), ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
if (tid == 0) {
npKitWaitRecvTotalTime = 0;
npKitDataProcessEntryTime = NPKIT_GET_GPU_TIMESTAMP();
}
#endif
nelem -= tid*EltPerLine;
srcElts += tid*EltPerLine;
dstElts += tid*EltPerLine;
int offset = tid;
int eltPerTrip = nthreads*EltPerLine;
while (nelem > 0) {
int eltInLine = EltPerLine < nelem ? EltPerLine : nelem;
DataLoader dl;
ncclLLFifoLine line[MaxRecv];
uint64_t data, peerData;
if (SRC) {
dl.loadBegin(srcElts, eltInLine);
srcElts += eltPerTrip;
}
if (RECV) {
readLLBeginAll<1>(offset, line);
peerData = readLL(offset, 0);
}
if (SRC) {
data = dl.loadFinish();
if (SrcBuf == Input) data = applyPreOp(redOp, data);
}
if (RECV) {
data = !SRC ? peerData : applyReduce(redOp, peerData, data);
#pragma unroll MaxRecv
for (int i=1; i < MaxRecv && i < fan.nrecv(); i++) {
peerData = readLLFinish(offset, line, i);
data = applyReduce(redOp, peerData, data);
}
}
if (postOp) data = applyPostOp(redOp, data);
// Send: inter-node, then intra-node, then local
if (SEND) {
for (int i=1; i < MaxSend && i < fan.nsend(); i++)
storeLL(sendPtr(i)+offset, data, sendFlag(i));
storeLL(sendPtr(0)+offset, data, sendFlag(0));
}
if (DST) {
storeData(dstElts, data, eltInLine);
dstElts += eltPerTrip;
}
nelem -= eltPerTrip;
offset += nthreads;
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
if (tid == 0) {
npKitDataProcessExitTime = NPKIT_GET_GPU_TIMESTAMP();
npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime - npKitWaitRecvTotalTime;
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY) && defined(ENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT,
npKitWaitRecvDataProcessSize, npKitWaitRecvTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
if (RECV) {
for (int i=0; i < MaxRecv; i++) incRecv(i);
postRecv();
}
if (SEND) {
for (int i=1; i < MaxSend && i < fan.nsend(); i++)
incSend(i, offset);
incSend(0, offset);
}
}
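// The public primitives further below are thin wrappers that pick the
// RECV/SEND/SrcBuf/DstBuf template flags, e.g. send() -> LLGenericOp<0, 1, Input, -1>,
// recv() -> LLGenericOp<1, 0, -1, Output>, and
// recvReduceCopySend() -> LLGenericOp<1, 1, Input, Output>.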
template <int REDUCE, int COPY, int MULTISRCS, int MULTIDSTS>
__device__ __forceinline__ void mscclGenericOp(T** srcs, int nsrcs, T** dsts, int ndsts, int nelem) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_GENERIC_OP_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_GENERIC_OP_ENTRY, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
nelem = nelem < 0 ? 0 : nelem;
T *srcElts = srcs[0];
T *dstElts = dsts[0];
nelem -= tid*EltPerLine;
srcElts += tid*EltPerLine;
dstElts += tid*EltPerLine;
if (MULTISRCS){
for (int i = 1; i < nsrcs; i++){
srcs[i] += tid*EltPerLine;
}
}
if (MULTIDSTS){
for (int i = 1; i < ndsts; i++){
dsts[i] += tid*EltPerLine;
}
}
int offset = tid;
int eltPerTrip = nthreads*EltPerLine;
while (nelem > 0) {
int eltInLine = EltPerLine < nelem ? EltPerLine : nelem;
DataLoader dl;
uint64_t data;
dl.loadBegin(srcElts, eltInLine);
srcElts += eltPerTrip;
data = dl.loadFinish();
if (REDUCE) {
uint64_t dataD;
dl.loadBegin(dstElts, eltInLine);
dataD = dl.loadFinish();
dataD = applyReduce(redOp, dataD, data);
if (MULTISRCS){
for (int i = 1; i < nsrcs; i++){
dl.loadBegin(srcs[i], eltInLine);
srcs[i] += eltPerTrip;
data = dl.loadFinish();
dataD = applyReduce(redOp, dataD, data);
}
}
mscclStoreData(dstElts, dataD, eltInLine);
dstElts += eltPerTrip;
}
if (COPY){
mscclStoreData(dstElts, data, eltInLine);
dstElts += eltPerTrip;
if (MULTIDSTS){
for (int i = 1; i < ndsts; i++){
dl.loadBegin(srcs[i], eltInLine);
srcs[i] += eltPerTrip;
data = dl.loadFinish();
mscclStoreData(dsts[i], data, eltInLine);
dsts[i] += eltPerTrip;
}
}
}
nelem -= eltPerTrip;
offset += nthreads;
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_GENERIC_OP_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_GENERIC_OP_EXIT, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
barrier();
}
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
recvBuff[i] = (union ncclLLFifoLine*)conn->buffs[NCCL_PROTO_LL];
recvStep[i] = conn->step;
if (wid == i) recvConn = conn;
}
__device__ __forceinline__ void loadRecvSync() {
if (tid >= nthreads-WARP_SIZE && wid < fan.nrecv()) {
recvConnHeadPtr = recvConn->head;
recvConnHead = recvConn->step;
}
}
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
sendBuff[i] = (union ncclLLFifoLine*)conn->buffs[NCCL_PROTO_LL];
sendStep[i] = conn->step;
if (wid == i) sendConn = conn;
}
__device__ __forceinline__ void loadSendSync() {
if (tid < fan.nsend()) {
sendConnHeadPtr = sendConn->head;
sendConnHeadCache = *sendConnHeadPtr;
sendConnHead = sendConn->step;
sendConnFifoPtr = sendConn->sizesFifo;
}
}
public:
__device__ Primitives(
const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv=0, uint8_t connIndexSend=0
):
redOp(redOpArg),
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), group(group),
stepLines(ncclShmem.comm.buffSizes[NCCL_PROTO_LL]/NCCL_STEPS/sizeof(ncclLLFifoLine)) {
auto *channel = &ncclShmem.channel;
barriers = &ncclShmem.groups[group].barrier;
barrier_next = ncclShmem.groups[group].barrier_next;
// If we are going to support oneshot collNet + LL, then we would need to add connector index here
int nrecv=0, nsend=0;
// We compare with Fan::MaxRecv here because this->MaxRecv is always at least 1
while (nrecv < Fan::MaxRecv && recvPeers[nrecv] >= 0) {
loadRecvConn(&channel->peers[recvPeers[nrecv]]->recv[connIndexRecv], nrecv);
nrecv++;
}
while (nsend < MaxSend && sendPeers[nsend] >= 0) {
loadSendConn(&channel->peers[sendPeers[nsend]]->send[connIndexSend], nsend);
nsend++;
}
this->fan = Fan(nrecv, nsend);
loadRecvSync();
loadSendSync();
setDataPtrs(inputBuf, outputBuf);
}
__device__ ~Primitives() {
// Save steps for the next operation
if (tid >= nthreads-WARP_SIZE && wid < fan.nrecv())
recvConn->step = recvConnHead;
if (tid < fan.nsend())
sendConn->step = sendConnHead;
// Ensure all steps written back
barrier();
}
__device__ void setDataPtrs(void const *inputBuf, void *outputBuf) {
userBufs[Input] = (T*)inputBuf;
userBufs[Output] = (T*)outputBuf;
}
__device__ void moveDataPtrs(intptr_t delta) {
userBufs[Input] += delta;
userBufs[Output] += delta;
}
__device__ void send(intptr_t inpIx, int eltN) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<0, 1, Input, -1>(inpIx, -1, eltN, false);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
__device__ void sendFromOutput(intptr_t outIx, int eltN) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<0, 1, Output, -1>(outIx, -1, eltN, false);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
__device__ void recv(intptr_t outIx, int eltN, bool postOp=false) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<1, 0, -1, Output>(-1, outIx, eltN, postOp);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
__device__ void recvReduceSend(intptr_t inpIx, int eltN) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<1, 1, Input, -1>(inpIx, -1, eltN, false);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_SEND_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
__device__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<1, 0, Input, Output>(inpIx, outIx, eltN, postOp);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
__device__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<0, 1, Input, Output>(inpIx, outIx, eltN, postOp);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_COPY_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_COPY_SEND_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
__device__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<1, 1, -1, Output>(-1, outIx, eltN, postOp);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_COPY_SEND_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
__device__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
LLGenericOp<1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT, eltN*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
__device__ void recvSend(int eltN) {
return LLGenericOp<1, 1, -1, -1>(-1, -1, eltN, false);
}
// MSCCL primitives
__device__ void sendWithBarrier(intptr_t inpIx, int eltN) {
send(inpIx, eltN);
// This is the only primitive/instruction without a barrier at the end, so add one here
barrier();
}
__device__ void localCopy(T* srcs, T* dsts, int eltN) {
return mscclGenericOp<0,1,0,0>(&srcs, 1, &dsts, 1, eltN);
}
__device__ void reduce(T** srcs, int nsrcs, T** dsts, int ndsts, int eltN) {
if (nsrcs == 1) {
return mscclGenericOp<1,0,0,0>(srcs, 1, dsts, 1, eltN);
} else {
return mscclGenericOp<1,0,1,0>(srcs, nsrcs, dsts, 1, eltN);
}
}
};
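// Construction sketch (hypothetical call site): a collective kernel typically
// instantiates this specialization along the lines of
//   Primitives<T, RedOp, FanSymmetric<1>, 0, ProtoLL, 0>
//     prims(tid, nthreads, &recvPeer, &sendPeer, inputBuf, outputBuf, redOpArg);
// and then drives it one step at a time with send()/recvReduceSend()/recvCopySend()/recv().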
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#if defined(ENABLE_NPKIT)
#include "npkit/npkit.h"
#endif
#define NCCL_LL128_FLAGTHREAD (NCCL_LL128_LINEELEMS-1)
#ifndef RCCL_USE_WBINVL1_VOL
#if defined(__GFX8__) || defined(__GFX9__)
#define RCCL_USE_WBINVL1_VOL 1
#else
#define RCCL_USE_WBINVL1_VOL 0
#endif
#endif
template<typename T, typename RedOp, typename Fan, int Direct, int P2p>
class Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>:
public PrimitivesWithoutDirect<Primitives<T, RedOp, Fan, Direct, ProtoLL128, P2p>> {
static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
static constexpr int Input=0, Output=1;
RedOp redOp;
const int tid;
const int nthreads;
const int wid;
const int stepSize;
const int warp;
const int warpInBlock; // warp index in thread block
const bool flagThread;
const int group;
Fan fan;
T *userBufs[2];
struct ncclConnInfo* recvConn = NULL;
volatile uint64_t* recvConnHeadPtr = NULL;
uint64_t recvConnHead;
struct ncclConnInfo* sendConn = NULL;
volatile int* sendConnFifoPtr = NULL;
volatile uint64_t* sendConnTailPtr = NULL;
uint64_t sendConnTail;
volatile uint64_t* sendConnHeadPtr = NULL;
uint64_t sendConnHead;
uint64_t sendConnHeadCache; // Cache last seen value
uint64_t recvStep[MaxRecv];
uint64_t sendStep[MaxSend];
uint64_t* recvBuff[MaxRecv];
uint64_t* sendBuff[MaxSend];
inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; }
inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; }
inline __device__ uint64_t* recvPtr(int i) { return recvBuff[i]+recvOffset(i); }
inline __device__ uint64_t* sendPtr(int i) { return sendBuff[i]+sendOffset(i); }
inline __device__ uint64_t recvFlag(int i) { return recvStep[i]+1; }
inline __device__ uint64_t sendFlag(int i) { return sendStep[i]+1; }
uint64_t* barriers;
uint64_t* barrier_next;
#ifdef HYGON_SDMA_FEATURE
public:
uint32_t ringIx;
uint32_t useSdmaCopy;
uint32_t sdmaMinCopySize;
uint32_t sdmaCountEnabe;
uint32_t sdmaCopyCount;
uint32_t allCopyCount;
private:
#endif
#if defined(ENABLE_NPKIT)
public:
int npKitCtxIdx = 0;
uint64_t npKitDataProcessEntryTime = 0;
uint64_t npKitDataProcessExitTime = 0;
uint64_t npKitDataProcessTotalTime = 0;
private:
#endif
inline __device__ void barrier() {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
if (nthreads != WARP_SIZE)
barrier_by_group();
#else
asm volatile ("bar.sync %1, %0;" :: "r"(nthreads), "r"(15-group));
#endif
}
uint32_t abort = 0;
uint32_t* sync;
inline __device__ int checkAbort(int &spins, int i, int send) {
spins++;
if (abort == 0 && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
abort = __atomic_load_n(ncclShmem.comm.abortFlag, __ATOMIC_SEQ_CST);
spins = 0;
}
return abort;
}
inline __device__ void waitSend(int nbytes) {
if (sendConnHeadPtr) {
int spins = 0;
while (sendConnHeadCache + NCCL_STEPS < sendConnHead + 1) {
__builtin_amdgcn_s_sleep(1);
sendConnHeadCache = atomicAdd_system((unsigned long long *)sendConnHeadPtr, 0);
if (checkAbort(spins, wid, 1)) break;
}
__asm__ __volatile__("s_wakeup");
if (sendConnFifoPtr) {
__atomic_store_n(sendConnFifoPtr+sendStep[wid]%NCCL_STEPS, nbytes, __ATOMIC_SEQ_CST);
}
sendConnHead += 1;
}
}
inline __device__ void postRecv() {
if (recvConnHeadPtr) STORE(recvConnHeadPtr, recvConnHead += 1);
}
inline __device__ void postSend() {
if (sendConnTailPtr) { __threadfence(); STORE((unsigned long long *)sendConnTailPtr, sendConnTail += 1); }
}
template<int WordPerThread>
__device__ __forceinline__ void loadRegsBegin(uint64_t(&regs)[WordPerThread], T const *src, int eltN) {
constexpr int EltPer16B = 16/sizeof(T);
/* We are aligned to 16 bytes, so load directly to registers, no shmem needed.
* Flag threads load half as much data, which gets shuffled to the even
* registers during Finish. The point of splitting into two phases is to
* defer that shuffle, which incurs a dependency stall, until after other
* memops have been launched by the caller.
*/
#pragma unroll
for(int g=0; g < WordPerThread/2; g++) {
int ix = g*WARP_SIZE - 16*(g/2) + wid - (g%2)*(wid/4);
if(!flagThread || g%2==0) {
if(ix*EltPer16B < eltN) {
if(reinterpret_cast<uintptr_t>(src)%4 == 0) {
regs[2*g+0] = __builtin_nontemporal_load((uint64_t*)(src + ix*EltPer16B));
regs[2*g+1] = __builtin_nontemporal_load((uint64_t*)(src + ix*EltPer16B)+1);
} else {
union {
uint64_t regs64[WordPerThread];
uint32_t regs32[WordPerThread*2];
uint16_t regs16[WordPerThread*4];
uint8_t regs8[WordPerThread*8];
};
if (sizeof(T) == 8) {
uint64_t *src64 = (uint64_t*)(src+ix*EltPer16B);
for (int i=0; i < 2; i++)
regs64[2*g+i] = __builtin_nontemporal_load(src64+i);
} else if (sizeof(T) == 4) {
uint32_t *src32 = (uint32_t*)(src+ix*EltPer16B);
for (int i=0; i < 2*sizeof(uint64_t)/sizeof(T); i++)
regs32[2*g+i] = __builtin_nontemporal_load(src32+i);
} else if (sizeof(T) == 2) {
uint16_t *src16 = (uint16_t*)(src+ix*EltPer16B);
for (int i=0; i < 2*sizeof(uint64_t)/sizeof(T); i++)
regs16[2*g+i] = __builtin_nontemporal_load(src16+i);
} else if (sizeof(T) == 1) {
uint8_t *src8 = (uint8_t*)(src+ix*EltPer16B);
for (int i=0; i < 2*sizeof(uint64_t)/sizeof(T); i++)
regs8[2*g+i] = __builtin_nontemporal_load(src8+i);
}
regs[2*g+0] = regs64[2*g+0];
regs[2*g+1] = regs64[2*g+1];
}
}
}
}
}
template<int WordPerThread>
__device__ __forceinline__ void loadRegsFinish(uint64_t(&regs)[WordPerThread]) {
// Move data out of flag registers into the vacant registers.
#pragma unroll
for (int g=1; g < WordPerThread/2; g+=2) {
if (flagThread) regs[2*g] = regs[2*g-1];
}
}
template<int WordPerThread>
__device__ __forceinline__ void storeRegs(T *dst, uint64_t(&regs)[WordPerThread], int eltN) {
constexpr int EltPer16B = 16/sizeof(T);
// Reverse the Finish() register permutation.
#pragma unroll
for (int g=1; g < WordPerThread/2; g+=2) {
if (flagThread) regs[2*g-1] = regs[2*g];
}
// Write 16 bytes at a time when dst is 4-byte aligned and a full 16B line remains; otherwise fall back to element-wise stores.
int misalignment = reinterpret_cast<uintptr_t>(dst)%4;
#pragma unroll
for(int g=0; g < WordPerThread/2; g++) {
int ix = g*WARP_SIZE - 16*(g/2) + wid - (g%2)*(wid/4);
if (!flagThread || g%2==0) {
if(misalignment == 0 && (ix+1)*EltPer16B <= eltN) {
__builtin_nontemporal_store(regs[2*g+0], (uint64_t*)(dst + ix*EltPer16B));
__builtin_nontemporal_store(regs[2*g+1], (uint64_t*)(dst + ix*EltPer16B)+1);
} else {
union {
uint64_t regs64[WordPerThread];
uint32_t regs32[WordPerThread*2];
uint16_t regs16[WordPerThread*4];
uint8_t regs8[WordPerThread*8];
};
regs64[2*g+0] = regs[2*g+0];
regs64[2*g+1] = regs[2*g+1];
int remaining = eltN - ix*EltPer16B;
if (sizeof(T) == 8) {
uint64_t *dst64 = (uint64_t*)(dst+ix*EltPer16B);
for (int i=0; i < 2 && i < remaining; i++)
__builtin_nontemporal_store(regs64[2*g+i], dst64+i);
} else if (sizeof(T) == 4) {
uint32_t *dst32 = (uint32_t*)(dst+ix*EltPer16B);
for (int i=0; i < 2*sizeof(uint64_t)/sizeof(T) && i < remaining; i++)
__builtin_nontemporal_store(regs32[2*g+i], dst32+i);
} else if (sizeof(T) == 2) {
uint16_t *dst16 = (uint16_t*)(dst+ix*EltPer16B);
for (int i=0; i < 2*sizeof(uint64_t)/sizeof(T) && i < remaining; i++)
__builtin_nontemporal_store(regs16[2*g+i], dst16+i);
} else if (sizeof(T) == 1) {
uint8_t *dst8 = (uint8_t*)(dst+ix*EltPer16B);
for (int i=0; i < 2*sizeof(uint64_t)/sizeof(T) && i < remaining; i++)
__builtin_nontemporal_store(regs8[2*g+i], dst8+i);
}
}
}
}
}
#define WARP_MASK 0xffffffff
template <int ELEMS_PER_THREAD, int RECV, int SEND, int SrcBuf, int DstBuf>
__device__ __forceinline__ void recvReduceSendCopy(uint64_t(&v)[ELEMS_PER_THREAD], int ll128Offset, bool postOp) {
constexpr int SRC = SrcBuf != -1 ? 1 : 0;
uint64_t vr[ELEMS_PER_THREAD];
__syncwarp();
/************************ Wait first recv ********************/
if (RECV) {
uint64_t* ptr = recvPtr(0)+ll128Offset;
uint64_t flag = recvFlag(0);
bool needReload;
int spins = 0;
do {
needReload = false;
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
vr[u] = __builtin_nontemporal_load(ptr+u*WARP_SIZE);
vr[u+1] = __builtin_nontemporal_load(ptr+u*WARP_SIZE+1);
needReload |= flagThread && (vr[u+1] != flag);
}
needReload &= (0 == checkAbort(spins, 0, 0));
} while (__any(needReload));
}
/************* Finish register load **************/
if (SRC) {
// By deferring register shuffle here we've overlapped spinning on first
// peer's data with memory loads of src data.
loadRegsFinish(v);
if (SrcBuf == Input) {
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
v[u] = applyPreOp(redOp, v[u]);
if (!flagThread)
v[u+1] = applyPreOp(redOp, v[u+1]);
}
}
}
/************************ Recv rest *********************/
if (RECV) {
{ // Consume data from first recv
uint64_t* ptr = recvPtr(0)+ll128Offset;
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
v[u] = SRC ? applyReduce(redOp, vr[u], v[u]) : vr[u];
v[u+1] = SRC ? applyReduce(redOp, vr[u+1], v[u+1]) : vr[u+1];
}
}
for (int i=1; i<MaxRecv && i<fan.nrecv(); i++) {
uint64_t flag = recvFlag(i);
uint64_t* ptr = recvPtr(i)+ll128Offset;
bool needReload;
int spins = 0;
do {
needReload = false;
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
vr[u] = __builtin_nontemporal_load(ptr+u*WARP_SIZE);
vr[u+1] = __builtin_nontemporal_load(ptr+u*WARP_SIZE+1);
needReload |= flagThread && (vr[u+1] != flag);
}
needReload &= (0 == checkAbort(spins, i, 0));
} while (__any(needReload));
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
v[u] = applyReduce(redOp, vr[u], v[u]);
v[u+1] = applyReduce(redOp, vr[u+1], v[u+1]);
}
}
}
/********************** End Recv ************************/
if (postOp) {
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
v[u] = applyPostOp(redOp, v[u]);
v[u+1] = applyPostOp(redOp, v[u+1]);
}
}
#if RCCL_USE_WBINVL1_VOL
if (tid == 0) __builtin_amdgcn_buffer_wbinvl1();
#endif
/************************ Send **************************/
if (SEND) {
for (int i=1; i<MaxSend && i<fan.nsend(); i++) {
uint64_t flag = sendFlag(i);
uint64_t* ptr = sendPtr(i)+ll128Offset;
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
__builtin_nontemporal_store(v[u], ptr+u*WARP_SIZE);
__builtin_nontemporal_store(flagThread ? flag : v[u+1], ptr+u*WARP_SIZE+1);
}
}
uint64_t flag = sendFlag(0);
uint64_t* ptr = sendPtr(0)+ll128Offset;
#pragma unroll
for (int u=0; u<ELEMS_PER_THREAD; u+=2) {
__builtin_nontemporal_store(v[u], ptr+u*WARP_SIZE);
__builtin_nontemporal_store(flagThread ? flag : v[u+1], ptr+u*WARP_SIZE+1);
}
}
/********************** End Send ************************/
}
static constexpr int WireWordPerSlice = WARP_SIZE*NCCL_LL128_SHMEM_ELEMS_PER_THREAD;
static constexpr int DataEltPerSlice = (WireWordPerSlice - WireWordPerSlice/NCCL_LL128_LINEELEMS)*(sizeof(uint64_t)/sizeof(T));
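// Worked example (constants assumed for illustration): with WARP_SIZE = 64,
// NCCL_LL128_SHMEM_ELEMS_PER_THREAD = 8 and NCCL_LL128_LINEELEMS = 16,
// WireWordPerSlice = 64*8 = 512 wire words, of which 512/16 = 32 are flag words,
// leaving 480 data words per slice; for T = float that gives
// DataEltPerSlice = 480 * (8/4) = 960 elements per slice.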
template <int RECV, int SEND, int SrcBuf, int DstBuf>
__device__ __forceinline__ void GenericOp(intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp) {
constexpr int SRC = SrcBuf != -1 ? 1 : 0;
constexpr int DST = DstBuf != -1 ? 1 : 0;
T const *srcPtr = SrcBuf == -1 ? nullptr : userBufs[SrcBuf] + srcIx;
T *dstPtr = DstBuf == -1 ? nullptr : userBufs[DstBuf] + dstIx;
int wireOffset = WireWordPerSlice*warp + 2*wid;
const int nwarps = nthreads/WARP_SIZE;
nelem = nelem < 0 ? 0 : nelem;
if (SEND) waitSend(divUp(nelem, DataEltPerSlice)*WireWordPerSlice*sizeof(uint64_t));
barrier();
nelem -= DataEltPerSlice*warp;
srcPtr += DataEltPerSlice*warp;
dstPtr += DataEltPerSlice*warp;
while (nelem > 0) {
const int eltInSlice = min(nelem, DataEltPerSlice);
uint64_t regs[NCCL_LL128_SHMEM_ELEMS_PER_THREAD];
if (SRC) loadRegsBegin(regs, srcPtr, eltInSlice);
recvReduceSendCopy<NCCL_LL128_SHMEM_ELEMS_PER_THREAD, RECV, SEND, SrcBuf, DstBuf>(regs, wireOffset, postOp);
if (DST) storeRegs(dstPtr, regs, eltInSlice);
wireOffset += WireWordPerSlice*nwarps;
srcPtr += DataEltPerSlice*nwarps;
dstPtr += DataEltPerSlice*nwarps;
nelem -= DataEltPerSlice*nwarps;
}
barrier();
if (SEND) for (int i=0; i < MaxSend; i++) sendStep[i] += 1;
if (SEND) postSend();
if (RECV) for (int i=0; i < MaxRecv; i++) recvStep[i] += 1;
if (RECV) postRecv();
}
template <int REDUCE, int COPY, int MULTISRCS, int MULTIDSTS>
__device__ __forceinline__ void mscclGenericOp(T** srcs, int nsrcs, T** dsts, int ndsts, int nelem) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_GENERIC_OP_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_GENERIC_OP_ENTRY, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
T const *srcPtr = srcs[0];
T *dstPtr = dsts[0];
int wireOffset = WireWordPerSlice*warp + 2*wid;
const int nwarps = nthreads/WARP_SIZE;
nelem = nelem < 0 ? 0 : nelem;
nelem -= DataEltPerSlice*warp;
srcPtr += DataEltPerSlice*warp;
dstPtr += DataEltPerSlice*warp;
if (MULTISRCS){
for (int i = 1; i < nsrcs; i++){
srcs[i] += DataEltPerSlice*warp;
}
}
if (MULTIDSTS){
for (int i = 1; i < ndsts; i++){
dsts[i] += DataEltPerSlice*warp;
}
}
while (nelem > 0) {
const int eltInSlice = min(nelem, DataEltPerSlice);
uint64_t regs[NCCL_LL128_SHMEM_ELEMS_PER_THREAD];
loadRegsBegin(regs, srcPtr, eltInSlice);
loadRegsFinish(regs);
if (REDUCE){
uint64_t regsD[NCCL_LL128_SHMEM_ELEMS_PER_THREAD];
loadRegsBegin(regsD, dstPtr, eltInSlice);
loadRegsFinish(regsD);
#pragma unroll
for (int u=0; u<NCCL_LL128_SHMEM_ELEMS_PER_THREAD; u+=2) {
regsD[u] = applyReduce(redOp, regs[u], regsD[u]);
if (!flagThread)
regsD[u+1] = applyReduce(redOp, regs[u+1], regsD[u+1]);
}
if (MULTISRCS){
for (int i = 1; i < nsrcs; i++){
loadRegsBegin(regs, srcs[i], eltInSlice);
loadRegsFinish(regs);
for (int u=0; u<NCCL_LL128_SHMEM_ELEMS_PER_THREAD; u+=2) {
regsD[u] = applyReduce(redOp, regs[u], regsD[u]);
if (!flagThread)
regsD[u+1] = applyReduce(redOp, regs[u+1], regsD[u+1]);
}
}
}
storeRegs(dstPtr, regsD, eltInSlice);
}
if (COPY){
storeRegs(dstPtr, regs, eltInSlice);
if (MULTIDSTS){
for (int i = 1; i < nsrcs; i++){
loadRegsBegin(regs, srcs[i], eltInSlice);
loadRegsFinish(regs);
storeRegs(dsts[i], regs, eltInSlice);
}
}
}
wireOffset += WireWordPerSlice*nwarps;
srcPtr += DataEltPerSlice*nwarps;
dstPtr += DataEltPerSlice*nwarps;
if (MULTISRCS){
for (int i = 1; i < nsrcs; i++){
srcs[i] += DataEltPerSlice*nwarps;
}
}
if (MULTIDSTS){
for (int i = 1; i < ndsts; i++){
dsts[i] += DataEltPerSlice*nwarps;
}
}
nelem -= DataEltPerSlice*nwarps;
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_GENERIC_OP_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_GENERIC_OP_EXIT, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
barrier();
}
__device__ __forceinline__ void loadRecvConn(struct ncclConnInfo* conn, int i) {
recvBuff[i] = (uint64_t*)conn->buffs[NCCL_PROTO_LL128];
recvStep[i] = conn->step;
if (wid == i) recvConn = conn;
}
__device__ __forceinline__ void loadRecvSync() {
if (tid >= nthreads-WARP_SIZE && wid < fan.nrecv()) {
recvConnHeadPtr = recvConn->head;
recvConnHead = recvConn->step;
}
}
__device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) {
sendBuff[i] = (uint64_t*)conn->buffs[NCCL_PROTO_LL128];
sendStep[i] = conn->step;
if (wid == i) sendConn = conn;
}
__device__ __forceinline__ void loadSendSync() {
if (tid < fan.nsend()) {
sendConnHeadPtr = sendConn->head;
sendConnHeadCache = *sendConnHeadPtr;
sendConnHead = sendConn->step;
sendConnFifoPtr = sendConn->sizesFifo;
}
if (tid >= nthreads-WARP_SIZE && wid<fan.nsend()) {
if (sendConn->sizesFifo) {
sendConnTailPtr = sendConn->tail;
sendConnTail = sendConn->step;
}
}
}
public:
__device__ Primitives(
const int tid, const int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv=0, uint8_t connIndexSend=0
):
redOp(redOpArg),
tid(tid), nthreads(nthreads), wid(tid%WARP_SIZE), warp(tid/WARP_SIZE),
warpInBlock(threadIdx.x/WARP_SIZE),
flagThread((tid%4)==3), group(group),
stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_LL128]/NCCL_STEPS/sizeof(uint64_t)) {
auto *channel = &ncclShmem.channel;
barriers = &ncclShmem.groups[group].barrier;
barrier_next = ncclShmem.groups[group].barrier_next;
int nrecv=0, nsend=0;
while (nrecv < MaxRecv && recvPeers[nrecv] >= 0) {
loadRecvConn(&channel->peers[recvPeers[nrecv]]->recv[connIndexRecv], nrecv);
nrecv++;
}
while (nsend < MaxSend && sendPeers[nsend] >= 0) {
loadSendConn(&channel->peers[sendPeers[nsend]]->send[connIndexSend], nsend);
nsend++;
}
this->fan = Fan(nrecv, nsend);
loadRecvSync();
loadSendSync();
setDataPtrs(inputBuf, outputBuf);
}
__device__ ~Primitives() {
// Save steps for the next operation
if (tid >= nthreads-WARP_SIZE && wid < fan.nrecv())
recvConn->step = recvConnHead;
if (tid < fan.nsend())
sendConn->step = sendConnHead;
// Ensure all steps written back
barrier();
}
__device__ void setDataPtrs(void const *inputBuf, void *outputBuf) {
userBufs[Input] = (T*)inputBuf;
userBufs[Output] = (T*)outputBuf;
}
__device__ void moveDataPtrs(intptr_t delta) {
userBufs[Input] += delta;
userBufs[Output] += delta;
}
__device__ void send(intptr_t inpIx, int eltN) {
return GenericOp<0, 1, Input, -1>(inpIx, -1, eltN, false);
}
__device__ void sendFromOutput(intptr_t outIx, int eltN) {
return GenericOp<0, 1, Output, -1>(outIx, -1, eltN, false);
}
__device__ void recv(intptr_t outIx, int eltN, bool postOp=false) {
return GenericOp<1, 0, -1, Output>(-1, outIx, eltN, postOp);
}
__device__ void recvReduceSend(intptr_t inpIx, int eltN) {
return GenericOp<1, 1, Input, -1>(inpIx, -1, eltN, false);
}
__device__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
return GenericOp<1, 0, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
return GenericOp<0, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) {
return GenericOp<1, 1, -1, Output>(-1, outIx, eltN, postOp);
}
__device__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
return GenericOp<1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ void recvSend(int eltN) {
return GenericOp<1, 1, -1, -1>(-1, -1, eltN, false);
}
// MSCCL primitives
__device__ void sendWithBarrier(intptr_t inpIx, int eltN) {
send(inpIx, eltN);
}
__device__ void localCopy(T* srcs, T* dsts, int eltN) {
return mscclGenericOp<0,1,0,0>(&srcs, 1, &dsts, 1, eltN);
}
__device__ void reduce(T** srcs, int nsrcs, T** dsts, int ndsts, int eltN) {
if (nsrcs == 1) {
return mscclGenericOp<1,0,0,0>(srcs, 1, dsts, 1, eltN);
} else {
return mscclGenericOp<1,0,1,0>(srcs, nsrcs, dsts, 1, eltN);
}
}
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#if defined(ENABLE_NPKIT)
#include "npkit/npkit.h"
#endif
#include "msccl/msccl_struct.h"
#ifdef HYGON_SDMA_FEATURE
#include "op128.h"
//#define GC_COPY_DATA
typedef enum {
SDMA_TRANS_MODE_NON = 0,
SDMA_TRANS_MODE_REDUCE_SEND = 1,
SDMA_TRANS_MODE_SRC_SEND = 2,
SDMA_TRANS_MODE_RECV_SEND = 3
} sdma_trans_mode_t;
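// Mode selection (see genericOp below): SDMA offload is only attempted when sending to a single
// peer (Send == 1 && fan.nsend() == 1). REDUCE_SEND is chosen when both a local source and a
// receive are present, SRC_SEND when only a local source is present, and RECV_SEND when only
// received data is forwarded.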
template<typename T, int BytePerPack>
__device__ __forceinline__ void memsetData(int ringIx, uint64_t srcAddr, uint32_t dataSize, int val)
{
BytePack<BytePerPack> setVal;
if (srcAddr == 0 || dataSize == 0) {
PRINT_ERR("memsetData error input, ringIx:%d bid:%d copy data srcAddr:0x%lx len %d\n", ringIx, (int)blockIdx.x, srcAddr, dataSize);
return;
}
setVal.native = val;
for (int i = 0; i < dataSize; i++) {
st_global<BytePerPack>(srcAddr, setVal);
srcAddr += BytePerPack;
}
return;
}
template<typename T, int BytePerPack>
__device__ __forceinline__ int compareData(int ringIx, uint64_t srcAddr, uint64_t dstAddr, uint32_t dataSize)
{
BytePack<BytePerPack> srcVal;
BytePack<BytePerPack> dstVal;
int miscompare = 0;
if (srcAddr == 0 || dstAddr == 0 || dataSize == 0) {
PRINT_ERR("compareData error input, ringIx:%d bid:%d copy data srcAddr:0x%lx dstAddr:0x%lx len %d\n", ringIx, (int)blockIdx.x, srcAddr, dstAddr, dataSize);
return 0;
}
for (int i = 0; i < dataSize; i++) {
srcVal = ld_volatile_global<BytePerPack>(srcAddr);
dstVal = ld_volatile_global<BytePerPack>(dstAddr);
if (srcVal.native != dstVal.native) {
PRINT_INFO("compareData, ringIx:%d bid:%d miscompare index:%d srcVal[0x%lx]:%d dstVal[0x%lx]:%d\n",
ringIx, (int)blockIdx.x, i, srcAddr, srcVal.native, dstAddr, dstVal.native);
miscompare++;
}
srcAddr += BytePerPack;
dstAddr += BytePerPack;
}
if (miscompare) {
PRINT_INFO("compareData end error, ringIx:%d bid:%d miscompare count:%d dataSize:%d \n", ringIx, (int)blockIdx.x, miscompare, dataSize);
} else {
PRINT_INFO("compareData end ok, ringIx:%d bid:%d same data, last:%d\n", ringIx, (int)blockIdx.x, srcVal.native);
}
return miscompare;
}
template<typename T, int BytePerPack>
__device__ __forceinline__ int startSdmaTask(struct sdmaQueueInfo *sdmaQueue, int ringIx, uint64_t srcAddr, uint64_t dstAddr, uint32_t dataLen)
{
if (srcAddr == 0 || dstAddr == 0) {
PRINT_ERR("startSdma error input, ringIx:%d bid:%d srcAddr:%p dstAddr:%p len:%d\n", ringIx, (int)blockIdx.x, srcAddr, dstAddr, dataLen);
return -1;
}
#ifdef GC_COPY_DATA
BytePack<BytePerPack> val;
for (int i = 0; i < dataLen / BytePerPack; i++) {
val = ld_volatile_global<BytePerPack>(srcAddr);
st_global<BytePerPack>(dstAddr, val);
srcAddr += BytePerPack;
dstAddr += BytePerPack;
}
return 0;
#endif
uint32_t sdmaIndex = atomicAdd(sdmaQueue->pkgIndex, 1) % sdmaQueue->sdmaDepth;
volatile hsa_sdma_info_t *sdmaInfo = &sdmaQueue->sdmaInfo[sdmaIndex];
if (*sdmaInfo->wptr == *sdmaInfo->rptr) {
PRINT_ERR("ringIx:%d bid:%d sdma pkg is empty\n", ringIx, (int)blockIdx.x);
}
sdmaInfo->completion_signal = 1;
sdmaInfo->src_addr = srcAddr;
sdmaInfo->dst_addr = dstAddr;
sdmaInfo->data_size = dataLen;
//sdmaInfo->flag = NPKIT_GET_GPU_TIMESTAMP();
sdmaInfo->dep_signal = 1;
return sdmaIndex;
}
inline __device__ uint64_t waitSdmaTaskComplete(struct sdmaQueueInfo *sdmaQueue, int sdmaIndex)
{
if (sdmaIndex < 0) {
PRINT_ERR("waitSdmaTaskComplete bid:%d sdmaIndex:%d invalid sdma index \n", (int)blockIdx.x, sdmaIndex);
return 0;
}
#ifdef GC_COPY_DATA
return 0;
#endif
volatile hsa_sdma_info_t *sdmaInfo = &sdmaQueue->sdmaInfo[sdmaIndex];
while (sdmaInfo->completion_signal) {
__builtin_amdgcn_s_sleep(1);
}
__asm__ __volatile__("s_wakeup");
return sdmaInfo->end_ts - sdmaInfo->start_ts;
}
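// Typical usage from thread 0 of a block (as in genericOp below): submit the copy, overlap other
// work, then wait for completion and read back the SDMA duration in timestamp ticks.
//   int idx = startSdmaTask<T, sizeof(T)>(&ncclShmem.channel.sdmaQueue, ringIx, srcAddr, dstAddr, bytes);
//   /* ... overlap reduceCopy or other work ... */
//   uint64_t sdmaTicks = waitSdmaTaskComplete(&ncclShmem.channel.sdmaQueue, idx);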
#endif
template<typename T, typename RedOp, typename Fan, int Direct,
int SlicePerChunk, int StepPerSlice, int Unroll, int P2p, int MultimemSrcs, int MultimemDsts>
class Primitives<
T, RedOp, Fan, Direct, ProtoSimple<SlicePerChunk, StepPerSlice, Unroll, MultimemSrcs, MultimemDsts>, P2p
> {
static constexpr int MaxRecv = Fan::MaxRecv, MaxSend = Fan::MaxSend;
static constexpr int Input=0, Output=1;
static constexpr int RoleInput = 0x01,
RoleOutput = 0x02,
RoleWaitRecv = 0x04,
RoleWaitSend = 0x08,
RolePostSend = 0x10,
RolePostRecv = 0x20,
Aborted = 0x40,
OffsFifoEnabled = 0x80,
SizesFifoEnabled = 0x100,
DirectWrite = 0x200,
DirectRead = 0x400,
ThreadsSynced = 0x800,
NvlsMinPolling = 0x1000;
const int tid, tidInBlock;
const int nthreads;
int nworkers;
const int stepSize;
Fan fan;
int index; // Peer index I'm responsible for
int flags;
int group;
uint64_t step;
int *connOffsFifoPtr; // (flags & OffsFifoEnabled)
union {
T *userBuff; // (flags & (RoleInput|RoleOutput))
T *connEltsFifo; // !(flags & (RoleInput|RoleOutput))
};
union {
int volatile *connSizesFifoPtr; // (flags & SizesFifoEnabled)
T *directBuff; // !(flags & SizesFifoEnabled)
};
uint64_t *connStepPtr;
uint64_t connStepCache; // Cache last seen value of (*connStepPtr)
uint64_t* barriers;
uint64_t* barrier_next;
uint32_t* next_hdp_reg;
#ifdef HYGON_SDMA_FEATURE
public:
uint32_t ringIx;
uint32_t useSdmaCopy;
uint32_t sdmaMinCopySize;
uint32_t sdmaCountEnabe;
uint32_t sdmaCopyCount;
uint32_t allCopyCount;
private:
#endif
#if defined(ENABLE_NPKIT)
public:
int npKitCtxIdx = 0;
uint64_t npKitDataProcessEntryTime = 0;
uint64_t npKitDataProcessExitTime = 0;
uint64_t npKitDataProcessTotalTime = 0;
private:
#endif
// Don't use barrier 0 as it's used by the final sync
inline __device__ void barrier() {
flags |= ThreadsSynced;
if (nthreads == WARP_SIZE)
__syncwarp();
else
barrier_by_group();
}
inline __device__ void subBarrier() {
barrier();
}
inline __device__ bool checkAbort(int &spins) {
spins++;
if (!(flags & Aborted) && spins == NCCL_SPINS_BEFORE_CHECK_ABORT) {
if (atomicAdd_system((unsigned int *)ncclShmem.comm.abortFlag, 0)) {
flags |= Aborted;
ncclShmem.aborted = 1;
}
spins = 0;
}
return flags & Aborted;
}
inline __device__ uint64_t loadStepValue(uint64_t* ptr) {
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
if (flags & NvlsMinPolling) {
uint64_t ans;
asm("multimem.ld_reduce.acquire.sys.global.min.u64 %0, [%1];" : "=l"(ans) : "l"(cvta_to_global(ptr)));
return ans;
}
#endif
// volatile is faster than acquire but not as correct. Make sure reduceCopy
// loads data using volatile so it doesn't see stale data in L1.
#ifdef __GFX9__
return atomicAdd((unsigned long long *)ptr, 0);
#else
return __atomic_load_n(ptr, __ATOMIC_SEQ_CST);
#endif
}
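// waitPeer: threads holding a WaitRecv/WaitSend role spin on the peer's step counter until the
// next StepPerSlice steps are available (data arrived for recv, credits returned for send), then
// publish this slice's src/dst pointer into ncclShmem.groups[group] for the workers and advance
// the local step.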
template <int DirectRecv, int DirectSend, int Recv, int Send, int Src, int Dst>
__device__ __forceinline__ void waitPeer(intptr_t srcIx, intptr_t dstIx, int offset, int nelts) {
const bool isSendNotRecv = (Send && Recv) ? (flags & RoleWaitSend) : Send;
const bool noRecvWait = DirectRecv && Src && (flags & DirectRead); // no wait when directly reading from remote input
const bool noSendWait = DirectSend && (flags & (DirectRead|DirectWrite)); // no wait in empty send (e.g. directScatter) or direct remote write
if (((flags & (Recv*RoleWaitRecv)) && !noRecvWait) ||
((flags & (Send*RoleWaitSend)) && !noSendWait)) {
int spins = 0;
while (connStepCache + (isSendNotRecv ? NCCL_STEPS : 0) < step + StepPerSlice) {
__builtin_amdgcn_s_sleep(1);
connStepCache = loadStepValue(connStepPtr);
if (checkAbort(spins)) break;
//if (spins == 0) printf("r=%d b=%d t=%d SPUN OUT got=%d want=%d\n", ncclShmem.comm.rank, blockIdx.x, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
if (spins == 0) traceData(__LINE__, threadIdx.x, int(connStepCache + (isSendNotRecv ? NCCL_STEPS : 0)), int(step+StepPerSlice));
}
__asm__ __volatile__("s_wakeup");
}
if (flags & (Recv*RoleWaitRecv | Send*RoleWaitSend)) {
if (isSendNotRecv && (flags & SizesFifoEnabled))
__atomic_store_n(connSizesFifoPtr+step%NCCL_STEPS, nelts*sizeof(T), __ATOMIC_SEQ_CST);
void **ptrs = isSendNotRecv ? (ncclShmem.groups[group].dsts + Dst)
: (ncclShmem.groups[group].srcs + Src);
if (flags & OffsFifoEnabled)
ptrs[index] = connEltsFifo + loadInt(connOffsFifoPtr + (step%NCCL_STEPS))/sizeof(T);
else if (isSendNotRecv && DirectSend) {
if (flags & DirectWrite) {
ptrs[index] = directBuff + dstIx + offset;
} else if (flags & DirectRead) { // empty send
ptrs[index] = nullptr;
} else {
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
}
} else if (!isSendNotRecv && DirectRecv) {
if (flags & DirectRead) {
ptrs[index] = directBuff + srcIx + offset;
} else if (flags & DirectWrite) {
ptrs[index] = directBuff + dstIx + offset; // send to next from my output buffer
} else {
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
}
}
else {
ptrs[index] = connEltsFifo + (step%NCCL_STEPS)*stepSize;
}
step += StepPerSlice;
}
}
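// postPeer: once a slice has been produced/consumed, order the data ahead of the step update
// (cache writeback/invalidate on gfx9, a system fence otherwise), optionally flush the peer's
// HDP register, then advance and publish the step counter so the peer can make progress.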
template<int Recv, int Send>
inline __device__ void postPeer(bool dataStored) {
if (Send && (flags & RolePostSend) && dataStored)
#ifdef __GFX9__
__builtin_amdgcn_buffer_wbinvl1();
#else
__threadfence_system();
#endif
if ((flags & Send*RolePostSend) && next_hdp_reg)
STORE((unsigned int *)next_hdp_reg, 0x1);
if (flags & (Recv*RolePostRecv | Send*RolePostSend)) {
step += StepPerSlice;
STORE(connStepPtr, step);
}
}
template <int DirectRecv1, int DirectSend1, int Recv, int Send, int SrcBuf, int DstBuf>
__device__ __forceinline__ void genericOp(
intptr_t srcIx, intptr_t dstIx, int nelem, bool postOp
) {
constexpr int DirectRecv = 1 && Direct && DirectRecv1;
constexpr int DirectSend = 1 && Direct && DirectSend1;
constexpr int Src = SrcBuf != -1;
constexpr int Dst = DstBuf != -1;
nelem = nelem < 0 ? 0 : nelem;
int sliceSize = stepSize*StepPerSlice;
sliceSize = max(divUp(nelem, 16*SlicePerChunk)*16, sliceSize/32);
int slice = 0;
int offset = 0;
#ifdef HYGON_SDMA_FEATURE
uint64_t srcSdmaAddr = 0;
uint64_t dstSdmaAddr = 0;
int sendToNextRankMode = SDMA_TRANS_MODE_NON;
int needSdmaCopy = 0;
if (useSdmaCopy) {
if (Send == 1 && fan.nsend() == 1) {
if (Src && Recv) {
sendToNextRankMode = SDMA_TRANS_MODE_REDUCE_SEND; // 1
} else if (Src && !Recv) {
sendToNextRankMode = SDMA_TRANS_MODE_SRC_SEND; // 2
} else if (!Src && Recv) {
sendToNextRankMode = SDMA_TRANS_MODE_RECV_SEND; // 3
}
if (sendToNextRankMode) needSdmaCopy = 1;
}
}
#endif
PRINT_DEBUG("genericOp-1- ringIx:%d bid:%d sliceSize:%d nelem:%d SlicePerChunk:%d stepSize:%d StepPerSlice:%d slicesize0:%d max(val1:%d val2:%d) mode:%d send:%d %d sizeofT:%d\n",
ringIx, (int)blockIdx.x, sliceSize*sizeof(T), nelem*sizeof(T), SlicePerChunk, stepSize*sizeof(T), StepPerSlice, stepSize*StepPerSlice*sizeof(T),
divUp(nelem, 16*SlicePerChunk)*16*sizeof(T), sizeof(T)*stepSize*StepPerSlice/32, sendToNextRankMode, Send, MaxSend, sizeof(T));
if (tid < nworkers && offset < nelem) {
// Worker-only loop for non-empty slices. Non-workers and empty slices are
// processed in the loop following this if block. The benefit of splitting
// the loop like this is we pull two branches out of the critical path.
// Using "number of branch insns (taken or not) encountered dynamically"
// as the performance metric, then:
// perf_orig = 2*numslices
// perf_new = 2+numslices
// So the new code and old code behave the same for numslices=2, and for
// numslices>2 the new code is superior. And note that in the case
// numslices=1, the loop is trivially unrollable (single iteration) so we
// don't incur that tail branch and we still have perf_new=2.
//
// ORIGINAL CODE:
// unrolled for(slices) {
// if(worker) { // This branch removed
// wait();
// subBarrier();
// if(slice not empty) // This branch removed
// ReduceCopyMulti();
// }
// barrier();
// post();
// } // Since we no longer unroll, new branch added here
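// For example, with numslices=4 the original code executes 2*4 = 8 dynamic branches per chunk
// while this split executes 2+4 = 6.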
#pragma unroll 1
do {
sliceSize = sliceSize < nelem-offset ? sliceSize : nelem-offset;
if (Src && (flags & (SrcBuf==Input ? RoleInput : RoleOutput)))
ncclShmem.groups[group].srcs[0] = userBuff + srcIx + offset;
if (Dst && (flags & (DstBuf==Input ? RoleInput : RoleOutput)))
ncclShmem.groups[group].dsts[0] = userBuff + dstIx + offset;
waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(srcIx, dstIx, offset, sliceSize);
subBarrier();
/* If the user aborts the kernel, we don't need to actually perform the copy/reduce; just
 * set the size to 0 to avoid unnecessary work. */
int workSize = ncclShmem.aborted ? 0 : sliceSize;
#ifdef HYGON_SDMA_FEATURE
if (tid == 0 && sdmaCountEnabe) allCopyCount++;
if (needSdmaCopy && workSize*sizeof(T) < sdmaMinCopySize) {
sendToNextRankMode = 0;
needSdmaCopy = 0;
PRINT_DEBUG("genericOp-sdma- ringIx:%d bid:%d workSize:%d minCopySize:%d\n",
ringIx, (int)blockIdx.x, workSize*sizeof(T), sdmaMinCopySize);
}
if (tid == 0 && sendToNextRankMode && useSdmaCopy) {
// The SDMA copy source is the Src or Recv address: with only Src, Src occupies srcs[0]; with only Recv, Recv occupies srcs[0]; with both, Src occupies srcs[0] and Recv occupies srcs[1].
srcSdmaAddr = (uint64_t)ncclShmem.groups[group].srcs[Src*Recv];
// The SDMA copy destination is the Send address: with only Dst, Dst occupies dsts[0]; with only Send, Send occupies dsts[0]; with both, Dst occupies dsts[0] and Send occupies dsts[1].
dstSdmaAddr = (uint64_t)ncclShmem.groups[group].dsts[Dst];
if (sendToNextRankMode == SDMA_TRANS_MODE_REDUCE_SEND) {
// With both Src and Recv, redirect the reduce result to srcs[1], which then serves as the source address for the SDMA copy.
ncclShmem.groups[group].dsts[Dst] = ncclShmem.groups[group].srcs[Src*Recv];
}
}
if (useSdmaCopy) subBarrier();
#endif
if (DirectRecv && ncclShmem.groups[group].srcs[0] == ncclShmem.groups[group].dsts[0]) {
// We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy
if (Send) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
if (tid == 0) {
npKitDataProcessEntryTime = NPKIT_GET_GPU_TIMESTAMP();
}
#endif
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, MaxSend, /*PreOpSrcs*/0>
(tid, nworkers, /*redArg*/0, /*preOpArgs*/nullptr, /*postOp*/false,
1, ncclShmem.groups[group].srcs,
fan.nsend(), ncclShmem.groups[group].dsts+1,
workSize);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
if (tid == 0) {
npKitDataProcessExitTime = NPKIT_GET_GPU_TIMESTAMP();
npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime;
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
} else if (DirectSend && !DirectRecv && SrcBuf != Input && ncclShmem.groups[group].dsts[Dst] == nullptr) {
// For broadcast in CollNet to do empty send
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
if (tid == 0) {
npKitDataProcessEntryTime = NPKIT_GET_GPU_TIMESTAMP();
}
#endif
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, 1, /*PreOpSrcs*/0>
(tid, nworkers, ncclShmem.redOpArgs[0], nullptr, postOp,
Recv, ncclShmem.groups[group].srcs,
Dst, ncclShmem.groups[group].dsts,
workSize);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
if (tid == 0) {
npKitDataProcessExitTime = NPKIT_GET_GPU_TIMESTAMP();
npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime;
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
} else {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, sliceSize*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
if (tid == 0) {
npKitDataProcessEntryTime = NPKIT_GET_GPU_TIMESTAMP();
}
#endif
constexpr int PreOpSrcs = SrcBuf != Input ? 0 :
DirectRecv*MaxRecv == NCCL_MAX_DIRECT_ARITY ? (1+NCCL_MAX_DIRECT_ARITY) : 1;
PRINT_DEBUG("genericOp-4-ringIx:%d bid:%d MaxRecv:%d MaxSend:%d PreOpSrcs:%d nworkers:%d Recv:%d fan.nrecv:%d Src:%d Send:%d fan.nsend:%d Dst:%d "
"workSize:%d group:%d Unroll:%d mode:%d sdma:%d sq:%p src:0x%lx dst:0x%lx mins:%d\n",
ringIx, (int)blockIdx.x, MaxRecv, MaxSend, PreOpSrcs, nworkers, Recv, fan.nrecv(), Src, Send, fan.nsend(), Dst, workSize*sizeof(T), group, Unroll,
sendToNextRankMode, useSdmaCopy, ncclShmem.channel.sdmaQueue.sdmaInfo, srcSdmaAddr, dstSdmaAddr, sdmaMinCopySize);
#ifdef HYGON_SDMA_FEATURE
if (sendToNextRankMode <= SDMA_TRANS_MODE_REDUCE_SEND) {
reduceCopy<Unroll, RedOp, T,
MultimemSrcs, Recv+Src, Recv*MaxRecv+Src,
MultimemDsts, Send+Dst, Send*MaxSend+Dst, PreOpSrcs>
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp,
Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs,
Send*fan.nsend()+Dst, ncclShmem.groups[group].dsts,
workSize);
} else if (Dst) {
uint32_t sdmaIndex;
uint64_t delta_ts;
if (tid == 0) {
NPKIT_SET_GPU_EVENT(NPKIT_EVENT_PRIM_SIMPLE_SDMA_COPY_PAL_ENTRY, workSize*sizeof(T), 0);
sdmaIndex = startSdmaTask<T, sizeof(T)>(&ncclShmem.channel.sdmaQueue, ringIx, srcSdmaAddr, dstSdmaAddr, workSize * sizeof(T));
}
reduceCopy<Unroll, RedOp, T,
MultimemSrcs, Recv+Src, Recv*MaxRecv+Src,
MultimemDsts, SDMA_SPEC_DST, Dst, PreOpSrcs>
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp,
Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs,
Dst, ncclShmem.groups[group].dsts,
workSize);
needSdmaCopy = 0;
if (tid == 0) {
delta_ts = waitSdmaTaskComplete(&ncclShmem.channel.sdmaQueue, sdmaIndex);
#if defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_SDMA_COST)
NPKIT_SET_GPU_EVENT_TM(NPKIT_EVENT_PRIM_SIMPLE_SDMA_COST_ENTRY, workSize * sizeof(T), 0, ncclShmem.channel.sdmaQueue.sdmaInfo[sdmaIndex].start_ts);
NPKIT_SET_GPU_EVENT_TM(NPKIT_EVENT_PRIM_SIMPLE_SDMA_COST_EXIT, workSize * sizeof(T), 0, ncclShmem.channel.sdmaQueue.sdmaInfo[sdmaIndex].end_ts);
#endif
NPKIT_SET_GPU_EVENT(NPKIT_EVENT_PRIM_SIMPLE_SDMA_COPY_PAL_EXIT, workSize*sizeof(T), delta_ts);
if (sdmaCountEnabe) sdmaCopyCount++;
}
}
#else
reduceCopy<Unroll, RedOp, T,
MultimemSrcs, Recv+Src, Recv*MaxRecv+Src,
MultimemDsts, Send+Dst, Send*MaxSend+Dst, PreOpSrcs>
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp,
Recv*fan.nrecv()+Src, ncclShmem.groups[group].srcs,
Send*fan.nsend()+Dst, ncclShmem.groups[group].dsts,
workSize);
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
if (tid == 0) {
npKitDataProcessExitTime = NPKIT_GET_GPU_TIMESTAMP();
npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime;
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, sliceSize*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
barrier(); // This barrier has a counterpart in following loop
#ifdef HYGON_SDMA_FEATURE
if (tid == 0 && sendToNextRankMode && useSdmaCopy && needSdmaCopy) {
PRINT_DEBUG("genericOp-5-ringIx:%d bid:%d MaxRecv:%d MaxSend:%d Recv:%d fan.nrecv:%d Src:%d Send:%d fan.nsend:%d Dst:%d "
"workSize:%d mode:%d sdma:%d need:%d sq:%p src:0x%lx dst:0x%lx\n",
ringIx, (int)blockIdx.x, MaxRecv, MaxSend, Recv, fan.nrecv(), Src, Send, fan.nsend(), Dst, workSize*sizeof(T),
sendToNextRankMode, useSdmaCopy, needSdmaCopy, ncclShmem.channel.sdmaQueue.sdmaInfo, srcSdmaAddr, dstSdmaAddr);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
npKitDataProcessEntryTime = NPKIT_GET_GPU_TIMESTAMP();
#endif
NPKIT_SET_GPU_EVENT(NPKIT_EVENT_PRIM_SIMPLE_SDMA_COPY_ENTRY, workSize*sizeof(T), 0);
uint32_t sdmaIndex = startSdmaTask<T, sizeof(T)>(&ncclShmem.channel.sdmaQueue, ringIx, srcSdmaAddr, dstSdmaAddr, workSize * sizeof(T));
uint64_t delta_ts = waitSdmaTaskComplete(&ncclShmem.channel.sdmaQueue, sdmaIndex);
#if defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_SDMA_COST)
NPKIT_SET_GPU_EVENT_TM(NPKIT_EVENT_PRIM_SIMPLE_SDMA_COST_ENTRY, workSize * sizeof(T), 0, ncclShmem.channel.sdmaQueue.sdmaInfo[sdmaIndex].start_ts);
NPKIT_SET_GPU_EVENT_TM(NPKIT_EVENT_PRIM_SIMPLE_SDMA_COST_EXIT, workSize * sizeof(T), 0, ncclShmem.channel.sdmaQueue.sdmaInfo[sdmaIndex].end_ts);
#endif
NPKIT_SET_GPU_EVENT(NPKIT_EVENT_PRIM_SIMPLE_SDMA_COPY_EXIT, workSize*sizeof(T), delta_ts);
if (sdmaCountEnabe) sdmaCopyCount++;
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME)
npKitDataProcessExitTime = NPKIT_GET_GPU_TIMESTAMP();
npKitDataProcessTotalTime += npKitDataProcessExitTime - npKitDataProcessEntryTime;
#endif
}
if (useSdmaCopy) barrier();
#endif
postPeer<Recv, Send>(0 < sliceSize);
offset += sliceSize;
slice += 1;
} while (slice < SlicePerChunk && offset < nelem);
}
// Non-workers come straight here. Workers too but only once the remaining
// slices are all empty. Since empty slices are the uncommon case, and
// worker perf is the limiter, perf-wise this loop is effectively unentered,
// hence just a single branch insn.
#pragma unroll 1
while (slice < SlicePerChunk) {
sliceSize = sliceSize < nelem-offset ? sliceSize : nelem-offset;
{ // Only workers could have Wait roles so we know the slice must be empty
// since we've exited the loop above.
waitPeer<DirectRecv, DirectSend, Recv, Send, Src, Dst>(0, 0, 0, 0);
}
barrier(); // Has counterpart in preceding worker-only loop.
postPeer<Recv, Send>(0 < sliceSize);
offset += sliceSize;
slice += 1;
}
}
template <int REDUCE, int COPY, int MULTISRCS, int MULTIDSTS>
__device__ __forceinline__ void mscclGenericOp(T** srcs, int nsrcs, T** dsts, int ndsts, int nelem) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_GENERIC_OP_ENTRY)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_GENERIC_OP_ENTRY, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
nelem = nelem < 0 ? 0 : nelem;
if (tid < nworkers) {
if (REDUCE){
srcs[nsrcs] = dsts[0];
nsrcs++;
if (MULTISRCS){
reduceCopy<Unroll, RedOp, T, 0, 3, MSCCL_MAX_REDUCE_FUSION, 0, 1, 1, 0>
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, nsrcs, (void **)srcs, 1, (void **)dsts, nelem);
} else {
reduceCopy<Unroll, RedOp, T, 0, 2, 2, 0, 1, 1, 0>
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 2, (void **)srcs, 1, (void **)dsts, nelem);
}
}
if (COPY){
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, 1, 0>
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, (void **)srcs, 1, (void **)dsts, nelem);
if (MULTISRCS) {
for (int i = 1; i < nsrcs; i++){
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, 1, 0>
(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, (void **)&srcs[i], 1, (void **)&dsts[i], nelem);
}
}
}
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_MSCCL_GENERIC_OP_EXIT)
if (tid == 0) {
NpKit::CollectGpuEvent(NPKIT_EVENT_MSCCL_GENERIC_OP_EXIT, nelem*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
barrier();
}
// Scatter/Gather generic op
// skip: my own rank order in the buffer chunks
// shift: peer offset to avoid all ranks sending to or receiving from same peer
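// For example, in the send loop below, peer i's chunk starts at i*peerOffset in my buffer; peers
// at or after 'skip' are offset by one extra peerElem because the chunk at position 'skip' is the
// caller's own and is not transferred, and shift (e.g. derived from the rank) rotates which peer
// each rank touches first.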
template <int DirectRecv1, int DirectSend1, int Recv, int Send>
__device__ __forceinline__ void
ScatterGatherOp(intptr_t inpIx, intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift, bool postOp) {
constexpr int DirectRecv = 1 && Direct && DirectRecv1;
constexpr int DirectSend = 1 && Direct && DirectSend1;
int offset = 0; // slice offset
int sliceSize = stepSize*StepPerSlice;
int dataSize = max(DIVUP(peerElem, 16*SlicePerChunk)*16, sliceSize/32); // per-peer slice size
#pragma unroll 1
for (int slice=0; slice<SlicePerChunk; ++slice) {
int realSize = max(0, min(dataSize, peerElem-offset));
bool fenceNeeded = false;
if (tid < nworkers) {
if (Send) {
// Scatter pre-scales data of input buffer only in non-Direct case
constexpr int PreOpSrcs = DirectSend ? 0 : 1;
if (flags & RoleInput) ncclShmem.groups[group].srcs[0] = userBuff + inpIx + offset;
// realSize is not accurate here; but intra-node does not rely on sizes FIFO
waitPeer<0, DirectSend, 0, 1, 1, 0>(0, inpIx, offset, realSize);
subBarrier();
#pragma unroll 1
// Loop over peers
for (int j=0; j<fan.nsend(); j++) {
int i = (j+shift)%fan.nsend();
int pOffset = i*peerOffset;
// Skip the data I am responsible for reducing myself
if (skip >= 0 && i >= skip) pOffset += peerElem;
void* src0 = (T*)ncclShmem.groups[group].srcs[0] + pOffset;
int realPeerSize = min(realSize, totalElem-pOffset);
if (realPeerSize > 0 && ncclShmem.groups[group].dsts[i] != nullptr) {
reduceCopy<Unroll, RedOp, T, 0, 1, 1, 0, 1, 1, PreOpSrcs>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, false, 1, &src0, 1, ncclShmem.groups[group].dsts+i, realPeerSize);
// Mark for threadfence at the end
fenceNeeded |= true;
}
}
} else if (Recv) {
if (flags & RoleOutput) ncclShmem.groups[group].dsts[0] = userBuff + outIx + offset;
int pOffset = index*peerOffset;
if (skip >= 0 && index >= skip) pOffset += peerElem;
// Adjust remote index with peer offset in case we are directly pulling from peer's output buffer
waitPeer<DirectRecv, 0, 1, 0, 0, 1>(outIx, outIx+pOffset, offset, realSize);
subBarrier();
#pragma unroll 1
for (int j=0; j<fan.nrecv(); j++) {
int i = (j+shift)%fan.nrecv();
pOffset = i*peerOffset;
if (skip >= 0 && i >= skip) pOffset += peerElem;
void* dst0 = (T*)ncclShmem.groups[group].dsts[0] + pOffset;
int realPeerSize = min(realSize, totalElem-pOffset);
if (DirectRecv && ncclShmem.groups[group].srcs[i] == dst0) realPeerSize = 0;
if (realPeerSize > 0) reduceCopy<Unroll, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/0>(tid, nworkers, ncclShmem.redOpArgs[0], ncclShmem.redOpArgs, postOp, 1, ncclShmem.groups[group].srcs+i, 1, &dst0, realPeerSize);
}
}
}
fenceNeeded = __any(fenceNeeded);
postPeer<Recv, Send>(fenceNeeded);
offset += realSize;
}
}
__device__ __forceinline__ void loadRecvConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) {
if (flags & (RoleWaitRecv|RolePostRecv)) {
auto *conn = &peer->recv[connIndex];
step = conn->step;
step = roundUp(step, SlicePerChunk*StepPerSlice);
if (flags & RolePostRecv) {
connStepPtr = conn->head;
STORE(connStepPtr, step); // Return credits in case we rounded up.
}
if (flags & RoleWaitRecv) {
ncclShmem.groups[group].recvConns[index] = conn; // WaitRecv role saves since that's who needs it in setDataPtrs()
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
connStepPtr = conn->tail;
connStepCache = loadStepValue(connStepPtr);
flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
if (Direct) {
// User buffers have been registered
if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce uses direct pull
} else {
flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
(e->direct & NCCL_DIRECT_READ) ? DirectRead : 0;
}
} else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce uses direct pull
} else {
// direct read not allowed in non-register case
// otherwise, in one-to-multi send, we could mix empty send and intermediate send
flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
}
}
}
if (flags & OffsFifoEnabled)
connOffsFifoPtr = conn->offsFifo;
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
}
}
}
__device__ __forceinline__ void loadSendConn(ncclDevChannelPeer *peer, int connIndex, struct ncclWorkElem* e) {
if (flags & (RoleWaitSend|RolePostSend)) {
auto *conn = &peer->send[connIndex];
step = conn->step;
step = roundUp(step, SlicePerChunk*StepPerSlice);
if (flags & RolePostSend) {
connStepPtr = conn->tail;
next_hdp_reg = conn->next_hdp_reg;
}
if (flags & RoleWaitSend) {
ncclShmem.groups[group].sendConns[index] = conn; // WaitSend role saves since that's who needs it in setDataPtrs()
flags |= (conn->flags & NCCL_NVLS_MIN_POLL) ? NvlsMinPolling : 0;
connStepPtr = conn->head;
connStepCache = loadStepValue(connStepPtr);
flags |= (conn->offsFifo != nullptr) ? OffsFifoEnabled : 0;
if (flags & OffsFifoEnabled)
connOffsFifoPtr = conn->offsFifo;
connEltsFifo = (T*)conn->buffs[NCCL_PROTO_SIMPLE];
if (conn->sizesFifo != nullptr) {
flags |= SizesFifoEnabled;
connSizesFifoPtr = conn->sizesFifo;
} else if (Direct) {
// User buffers have been registered
if ((conn->flags & (NCCL_IPC_READ|NCCL_IPC_WRITE)) && e != nullptr && e->regUsed) {
if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce uses direct pull
} else {
flags |= (e->direct & NCCL_DIRECT_WRITE) ? DirectWrite :
(e->direct & NCCL_DIRECT_READ) ? DirectRead : 0;
}
} else if (conn->flags & (NCCL_DIRECT_WRITE|NCCL_DIRECT_READ)) {
if (connIndex == 1 && P2p == 0) {
flags |= DirectRead; // scatter-reduce uses direct pull
} else {
// direct read not allowed in non-register case
// otherwise, in one-to-multi send, we could mix empty send and intermediate send
flags |= (conn->flags & NCCL_DIRECT_WRITE) ? DirectWrite : 0;
}
}
}
}
}
}
public:
__forceinline__ __device__ Primitives(
int tid, int nthreads, int const *recvPeers, int const *sendPeers,
void const *inputBuf, void *outputBuf, uint64_t redOpArg, uint8_t group=0,
uint8_t connIndexRecv = 0, uint8_t connIndexSend = 0, struct ncclWorkElem* e = nullptr
):
tid(tid), nthreads(nthreads), tidInBlock(threadIdx.x), group(group),
stepSize(ncclShmem.comm.buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/sizeof(T)) {
// For send operations, we need an extra warp to overlap the threadfence and the copy
barriers = &ncclShmem.groups[group].barrier;
barrier_next = ncclShmem.groups[group].barrier_next;
this->nworkers = nthreads;
int nrecv=0, nsend=0;
while (nrecv < MaxRecv && recvPeers[nrecv] != -1) nrecv++;
while (nsend < MaxSend && sendPeers[nsend] != -1) nsend++;
this->fan = Fan(nrecv, nsend);
constexpr int ThreadPerSync = 8;
static_assert(MaxSend <= ThreadPerSync && MaxRecv <= ThreadPerSync, "Not enough threads to cover all peers");
int g = tid / ThreadPerSync;
int ng = nthreads / ThreadPerSync;
index = tid % ThreadPerSync;
flags = 0;
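// Roles are assigned per sync-group of ThreadPerSync threads: group 0 holds the WaitRecv roles
// (with thread 'nrecv' owning the Input pointer), group 1 the WaitSend roles (with thread 'nsend'
// owning the Output pointer), and the last two groups the PostRecv and PostSend roles.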
if (g == 0) {
if (index < nrecv) flags |= RoleWaitRecv;
if (index == nrecv) flags |= RoleInput;
} else if (g == 1) {
if (index < nsend) flags |= RoleWaitSend;
if (index == nsend) flags |= RoleOutput;
} else if (g == ng - 2) {
if (index < nrecv) flags |= RolePostRecv;
} else if (g == ng - 1) {
if (index < nsend) flags |= RolePostSend;
}
int peer = 0;
if (flags & (RoleWaitRecv|RolePostRecv)) peer = recvPeers[index];
if (flags & (RoleWaitSend|RolePostSend)) peer = sendPeers[index];
loadRecvConn(ncclShmem.channel.peers[peer], connIndexRecv, e);
loadSendConn(ncclShmem.channel.peers[peer], connIndexSend, e);
setDataPtrs(inputBuf, outputBuf, redOpArg, (struct ncclWorkElemReg*)e);
}
__forceinline__ __device__ ~Primitives() {
// Ensure ncclShmem.groups[].send/recvConns are available
if (!(flags & ThreadsSynced))
barrier();
// Save steps for the next operation
if (flags & (RolePostSend|RolePostRecv)) {
auto *conns = (flags & RolePostSend) ? ncclShmem.groups[group].sendConns : ncclShmem.groups[group].recvConns;
conns[index]->step = step;
}
// Make sure all threads are done writing back conn->step and done using
// ncclShmem.groups[group]
barrier();
}
__device__ void setDataPtrs(void const *inputBuf, void *outputBuf, uint64_t redOpArg, struct ncclWorkElemReg* e) {
if (flags & RoleInput) {
userBuff = (T*)inputBuf;
ncclShmem.redOpArgs[0] = redOpArg; // scaler for local input
}
if (flags & RoleOutput) userBuff = (T*)outputBuf;
bool recvProvider = flags == (flags|RoleWaitRecv|DirectWrite);
bool sendAcceptor = flags == (flags|RoleWaitSend|DirectWrite);
bool sendProvider = flags == (flags|RoleWaitSend|DirectRead); // sender provides direct buffer (to be fetched)
bool recvAcceptor = flags == (flags|RoleWaitRecv|DirectRead); // receiver accepts direct buffer
int regUsed = e != nullptr ? e->elem.regUsed : 0;
if (Direct && recvProvider) {
int spins = 0;
void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
// Wait for consumer to consume previous value before trampling it.
while ((void *)atomicAdd((unsigned long long *) slot,0) != nullptr && !checkAbort(spins));
directBuff = (T*)outputBuf;
// Encode pointer by XOR'ing against some address they definitely wouldn't send
// since we want to allow them sending us nullptr while not colliding with
// the empty slot value.
*slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
}
if (Direct && sendAcceptor) {
int spins = 0;
void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
void *ptr;
while (true) {
ptr = (void *)atomicAdd((unsigned long long *) slot,0);
if (ptr != nullptr || checkAbort(spins)) break;
}
directBuff = regUsed ? (T*)(e->dnOutputs[index]) :
reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(ptr) ^ reinterpret_cast<uintptr_t>(slot));
*slot = nullptr;
}
if (Direct && sendProvider) {
int spins = 0;
void *volatile *slot = ncclShmem.groups[group].sendConns[index]->ptrExchange;
volatile uint64_t* argSlot0 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange;
volatile uint64_t* argSlot1 = ncclShmem.groups[group].sendConns[index]->redOpArgExchange+1;
// Wait for consumer to consume previous value before trampling it.
while (((void *)atomicAdd((unsigned long long *) slot,0) != nullptr || *argSlot0 != 0 || *argSlot1 !=0) && !checkAbort(spins));
// If there is no recv, then we are directly pulling from input buffer (e.g. directScatter)
// Otherwise, we are pulling from output buffer (e.g. recvCopyDirectSend)
directBuff = MaxRecv == 0 ? (T*)inputBuf : (T*)outputBuf;
// Exchange pre-scalers for use in direct pull
*argSlot0 = (uint64_t(1)<<32) | (uint32_t)redOpArg;
*argSlot1 = (uint64_t(1)<<32) | (uint32_t)(redOpArg>>32);
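// Each 32-bit half of redOpArg is tagged with a nonzero marker in the upper word so the peer can
// tell a written slot from the 0 reset value; the receiver (recvAcceptor path) reassembles the
// original 64-bit argument from the two low words.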
// Encode pointer by XOR'ing against some address they definitely wouldn't send
// since we want to allow them sending us nullptr while not colliding with
// the empty slot value.
*slot = reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(directBuff) ^ reinterpret_cast<uintptr_t>(slot));
}
if (Direct && recvAcceptor) {
int spins = 0;
void *volatile *slot = ncclShmem.groups[group].recvConns[index]->ptrExchange;
volatile uint64_t* argSlot0 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange;
volatile uint64_t* argSlot1 = ncclShmem.groups[group].recvConns[index]->redOpArgExchange+1;
void *ptr;
while (true) {
ptr = (void *)atomicAdd((unsigned long long *) slot,0);
if (ptr != nullptr || checkAbort(spins)) break;
}
directBuff = regUsed ? (T*)(MaxSend == 0 ? e->upOutputs[index] : e->dnInputs[index]) :
reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(ptr) ^ reinterpret_cast<uintptr_t>(slot));
if (MaxSend != 0) { // reduce group rather than gather group
// Store scalers for remote inputs
uint64_t arg0, arg1;
while (true) {
arg0 = *argSlot0;
arg1 = *argSlot1;
if ((arg0 != 0 && arg1 != 0) || checkAbort(spins)) break;
}
ncclShmem.redOpArgs[1+index] = ((arg1 & 0xffffffff)<<32) | (arg0 & 0xffffffff);
}
*argSlot0 = 0; *argSlot1 = 0;
*slot = nullptr;
}
}
__device__ void moveDataPtrs(intptr_t delta) {
if (flags & (RoleInput|RoleOutput))
userBuff += delta;
}
// Set MSCCL data pointers
__device__ __forceinline__ void setDataPtrs(void const *inputBuf, void *outputBuf) {
if (flags & RoleInput) userBuff = (T*)inputBuf;
if (flags & RoleOutput) userBuff = (T*)outputBuf;
}
__device__ __forceinline__ void send(intptr_t inpIx, int eltN) {
genericOp<0, 0, 0, 1, Input, -1>(inpIx, -1, eltN, false);
}
__device__ __forceinline__ void sendFromOutput(intptr_t outIx, int eltN) {
genericOp<0, 0, 0, 1, Output, -1>(outIx, -1, eltN, false);
}
__device__ __forceinline__ void directSend(intptr_t inpIx, intptr_t outIx, int eltN) {
genericOp<0, 1, 0, 1, Input, -1>(inpIx, outIx, eltN, false);
}
__device__ __forceinline__ void directSendFromOutput(intptr_t outIx, int eltN) {
genericOp<0, 1, 0, 1, Output, -1>(outIx, outIx, eltN, false);
}
__device__ __forceinline__ void recv(intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 0, -1, Output>(-1, outIx, eltN, postOp);
}
__device__ __forceinline__ void directRecv(intptr_t outIx, int eltN) {
genericOp<1, 0, 1, 0, -1, Output>(-1, outIx, eltN, /*postOp=*/false);
}
__device__ __forceinline__ void copySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 0, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void directCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 1, 0, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void recvSend(int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, -1, -1>(-1, -1, eltN, postOp);
}
__device__ __forceinline__ void recvCopySend(intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
}
__device__ __forceinline__ void directRecvCopySend(intptr_t outIx, int eltN) {
genericOp<1, 1, 1, 1, -1, Output>(-1, outIx, eltN, false);
}
__device__ __forceinline__ void recvCopyDirectSend(intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 1, 1, 1, -1, Output>(-1, outIx, eltN, postOp);
}
__device__ __forceinline__ void recvReduceCopy(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 0, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void recvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp);
}
__device__ __forceinline__ void directRecvReduceSend(intptr_t inpIx, int eltN, bool postOp=false) {
genericOp<1, 0, 1, 1, Input, -1>(inpIx, -1, eltN, postOp);
}
__device__ __forceinline__ void recvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
genericOp<0, 0, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void directRecvReduceCopySend(intptr_t inpIx, intptr_t outIx, int eltN, bool postOp=false) {
// Direct is only for the send part
genericOp<0, 1, 1, 1, Input, Output>(inpIx, outIx, eltN, postOp);
}
__device__ __forceinline__ void
scatter(intptr_t inpIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
ScatterGatherOp<0, 0, 0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
}
__device__ __forceinline__ void
directScatter(intptr_t inpIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
ScatterGatherOp<0, 1, 0, 1>(inpIx, -1, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
}
__device__ __forceinline__ void
gather(intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift, bool postOp=false) {
ScatterGatherOp<0, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, postOp);
}
__device__ __forceinline__ void
directGather(intptr_t outIx, int totalElem, int peerElem, int peerOffset, int skip, int shift) {
ScatterGatherOp<1, 0, 1, 0>(-1, outIx, totalElem, peerElem, peerOffset, skip, shift, /*postOp=*/false);
}
// MSCCL primitives
__device__ __forceinline__ void sendWithBarrier(intptr_t inpIx, int eltN) {
send(inpIx, eltN);
}
__device__ __forceinline__ void localCopy(T* srcs, T* dsts, int eltN) {
return mscclGenericOp<0,1,0,0>(&srcs, 1, &dsts, 1, eltN);
}
__device__ __forceinline__ void reduce(T** srcs, int nsrcs, T** dsts, int ndsts, int eltN) {
if (nsrcs == 1) {
return mscclGenericOp<1,0,0,0>(srcs, 1, dsts, 1, eltN);
} else {
return mscclGenericOp<1,0,1,0>(srcs, nsrcs, dsts, 1, eltN);
}
}
};
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
/*This file is now generated in CMake*/
// #include "reduce.h"
// #include "common.h"
// #include "collectives.h"
// IMPL_COLL_R(Reduce);
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "devcomm.h"
#include "collectives.h"
#include "primitives.h"
namespace {
template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runRing(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
#endif
const int tid = threadIdx.x;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem.channel.ring;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCE_CHUNKSTEPS : 1));
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T)));
const int nranks = ncclShmem.comm.nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->count;
const int rank = ncclShmem.comm.rank;
const int prevRank = ring->userRanks[nranks-1];
const int root = args->root;
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, 0, args->connIndex, args->connIndex);
#ifdef HYGON_SDMA_FEATURE
prims.ringIx = ring->index;
INIT_PRIMS_SDMA(prims, args);
#endif
auto calcChunkSize = [&]__device__(ssize_t gridOffset)->int {
int realChunkSize;
if (Proto::Id == NCCL_PROTO_SIMPLE) {
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels));
realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
}
else if (Proto::Id == NCCL_PROTO_LL)
realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
else if (Proto::Id == NCCL_PROTO_LL128)
realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize);
return realChunkSize;
};
if (prevRank == root) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = calcChunkSize(gridOffset);
ssize_t offset = gridOffset + bid*realChunkSize;
int nelem = min(realChunkSize, size-offset);
prims.send(offset, nelem);
}
}
else if (rank == root) {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = calcChunkSize(gridOffset);
ssize_t offset = gridOffset + bid*realChunkSize;
int nelem = min(realChunkSize, size-offset);
prims.recvReduceCopy(offset, offset, nelem, /*postOp=*/true);
}
}
else {
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
int realChunkSize = calcChunkSize(gridOffset);
ssize_t offset = gridOffset + bid*realChunkSize;
int nelem = min(realChunkSize, size-offset);
prims.recvReduceSend(offset, nelem);
}
}
}
}
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
using Proto = ProtoSimple<REDUCE_CHUNKSTEPS/REDUCE_SLICESTEPS, REDUCE_SLICESTEPS>;
runRing<T, RedOp, Proto>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduce, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL128>(args);
}
};
/*************************************************************************
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_REDUCE_KERNEL_H_
#define NCCL_REDUCE_KERNEL_H_
#include "op128.h"
#include <limits>
#include <type_traits>
////////////////////////////////////////////////////////////////////////////////
// The reduction function classes. All classes must:
// 1. Expose the `EltType` typedef.
// 2. Have constructor taking no arguments (default constructible).
// 3. Have constructor taking `uint64_t opArg`.
template<typename T>
struct FuncNull { using EltType = T; __device__ FuncNull(uint64_t opArg=0) {}; };
template<typename T>
struct FuncSum { using EltType = T; __device__ FuncSum(uint64_t opArg=0) {}; };
template<typename T>
struct FuncProd { using EltType = T; __device__ FuncProd(uint64_t opArg=0) {}; };
template<typename T>
struct FuncMin { using EltType = T; __device__ FuncMin(uint64_t opArg=0) {}; };
template<typename T>
struct FuncMax { using EltType = T; __device__ FuncMax(uint64_t opArg=0) {}; };
template<typename T> struct FuncPreMulSum;
template<typename T> struct FuncSumPostDiv;
////////////////////////////////////////////////////////////////////////////////
// Trait classes for reduction functions. Given a function (FuncSum, etc.)
// and a number of elements in a pack, will reduce, preOp, or postOp a pack
// of elements. These classes are intended to be specialized for specific
// combinations of reduction function and pack size.
template<typename Fn, int EltPerPack>
struct Apply_Reduce /*{
static BytePack<EltPerPack*sizeof(T)> reduce(
Fn fn, BytePack<EltPerPack*sizeof(T)> a, BytePack<EltPerPack*sizeof(T)> b
);
}*/;
template<typename Fn, int EltPerPack>
struct Apply_PreOp/*{
static constexpr bool IsIdentity;
static BytePack<EltPerPack*sizeof(T)> preOp(Fn fn, BytePack<EltPerPack*sizeof(T)> a);
}*/;
template<typename Fn, int EltPerPack>
struct Apply_PostOp/*{
static constexpr bool IsIdentity;
static BytePack<EltPerPack*sizeof(T)> postOp(Fn fn, BytePack<EltPerPack*sizeof(T)> a);
}*/;
template<typename Fn>
struct LoadMultimem_BigPackSize/*{
// If non-zero, then this and sizeof(T) are valid pack sizes for LoadMultimem,
// otherwise there are no valid pack sizes for LoadMultimem.
static constexpr int BigPackSize = 0;
}*/;
template<typename Fn, int BytePerPack>
struct Apply_LoadMultimem/*{
static BytePack<BytePerPack> load(Fn fn, uintptr_t addr);
}*/;
////////////////////////////////////////////////////////////////////////////////
// Public API for calling the trait classes. These take the data elements as a
// pack of any type, which could be a BytePack<?> or any integral type (uint64_t,
// uint32_t, etc.), and will return a new pack where each element has been
// transformed appropriately.
template<typename Fn, typename Pack>
__device__ __forceinline__ Pack applyReduce(Fn fn, Pack a, Pack b) {
return fromPack<Pack>(
Apply_Reduce<Fn, BytePackOf<Pack>::Size/sizeof(typename Fn::EltType)>
::reduce(fn, toPack(a), toPack(b))
);
}
template<typename Fn, typename Pack>
__device__ __forceinline__ Pack applyPreOp(Fn fn, Pack a) {
return fromPack<Pack>(
Apply_PreOp<Fn, BytePackOf<Pack>::Size/sizeof(typename Fn::EltType)>
::preOp(fn, toPack(a))
);
}
template<typename Fn, typename Pack>
__device__ __forceinline__ Pack applyPostOp(Fn fn, Pack a) {
return fromPack<Pack>(
Apply_PostOp<Fn, BytePackOf<Pack>::Size/sizeof(typename Fn::EltType)>
::postOp(fn, toPack(a))
);
}
template<typename Fn, int BytePerPack>
__device__ __forceinline__ BytePack<BytePerPack> applyLoadMultimem(Fn fn, uintptr_t addr) {
return Apply_LoadMultimem<Fn, BytePerPack>::load(fn, addr);
}
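// Illustrative example (not part of the API): summing four uint8_t lanes packed into a BytePack<4>
// resolves to the specialized Apply_Reduce<FuncSum<uint8_t>, 4> defined below:
//   BytePack<4> c = applyReduce(FuncSum<uint8_t>(), a, b);  // c.u8[k] = uint8_t(a.u8[k] + b.u8[k])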
////////////////////////////////////////////////////////////////////////////////
// Apply_Reduce
// Nonsensical base case
template<typename Fn>
struct Apply_Reduce<Fn, /*EltPerPack=*/0> {
__device__ static BytePack<0> reduce(Fn fn, BytePack<0> a, BytePack<0> b) {
return {};
}
};
// General recursive definition (EltPerPack > 1). This is how we iterate over
// all elements in a pack of any size, by breaking it into halves. Eventually
// we'll hit a base case (a more specific template specialization which takes
// precedence).
template<typename Fn, int EltPerPack>
struct Apply_Reduce {
template<int Size>
__device__ static BytePack<Size> reduce(Fn fn, BytePack<Size> a, BytePack<Size> b) {
a.half[0] = Apply_Reduce<Fn, EltPerPack/2>::reduce(fn, a.half[0], b.half[0]);
a.half[1] = Apply_Reduce<Fn, EltPerPack/2>::reduce(fn, a.half[1], b.half[1]);
return a;
}
};
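// For example, an 8-element pack of uint8_t is reduced as two BytePack<4> halves, each of which
// hits the specialized FuncSum<uint8_t>/EltPerPack=4 case below (or recurses down to the
// EltPerPack=1 base cases for other functions and types).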
// Base case definitions (EltPerPack == 1)
template<typename T>
struct Apply_Reduce<FuncNull<T>, /*EltPerPack=*/1> {
__device__ static BytePack<sizeof(T)> reduce(FuncNull<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
return a;
}
};
template<typename T>
struct Apply_Reduce<FuncSum<T>, /*EltPerPack=*/1> {
__device__ static BytePack<sizeof(T)> reduce(FuncSum<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
return toPack<T>(fromPack<T>(a) + fromPack<T>(b));
}
};
template<typename T>
struct Apply_Reduce<FuncProd<T>, /*EltPerPack=*/1> {
__device__ static BytePack<sizeof(T)> reduce(FuncProd<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
return toPack<T>(fromPack<T>(a) * fromPack<T>(b));
}
};
template<typename T>
struct Apply_Reduce<FuncMin<T>, /*EltPerPack=*/1> {
__device__ static BytePack<sizeof(T)> reduce(FuncMin<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
return toPack<T>(min(fromPack<T>(a), fromPack<T>(b)));
}
};
template<typename T>
struct Apply_Reduce<FuncMax<T>, /*EltPerPack=*/1> {
__device__ static BytePack<sizeof(T)> reduce(FuncMax<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
return toPack<T>(max(fromPack<T>(a), fromPack<T>(b)));
}
};
// Optimizations for specific types and element count combinations:
template<>
struct Apply_Reduce<FuncSum<uint8_t>, /*EltPerPack=*/4> {
__device__ static BytePack<4> reduce(FuncSum<uint8_t> fn, BytePack<4> a, BytePack<4> b) {
constexpr uint32_t lo = 0x00ff00ff;
constexpr uint32_t hi = ~lo;
uint32_t x = a.u32;
uint32_t y = b.u32;
a.u32 = (((x&lo) + (y&lo))&lo) + (((x&hi) + (y&hi))&hi);
return a;
}
};
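// The masks above implement a SIMD-within-a-register add: even bytes (selected by lo)
// and odd bytes (selected by hi) are summed in separate passes so that a carry out of
// one byte lands in a masked-off position instead of corrupting the neighboring byte.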
template<>
struct Apply_Reduce<FuncSum<int8_t>, /*EltPerPack=*/4> {
__device__ static BytePack<4> reduce(FuncSum<int8_t> fn, BytePack<4> a, BytePack<4> b) {
return Apply_Reduce<FuncSum<uint8_t>, 4>::reduce(FuncSum<uint8_t>(), a, b);
}
};
#if 300 <= __CUDA_ARCH__ && __CUDA_ARCH__ < 500
template<>
struct Apply_Reduce<FuncMin<uint8_t>, /*EltPerPack=*/4> {
__device__ static BytePack<4> reduce(FuncMin<uint8_t> fn, BytePack<4> a, BytePack<4> b) {
uint32_t z=0;
asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z));
return a;
}
};
template<>
struct Apply_Reduce<FuncMin<int8_t>, /*EltPerPack=*/4> {
__device__ static BytePack<4> reduce(FuncMin<int8_t> fn, BytePack<4> a, BytePack<4> b) {
int32_t z=0;
asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z));
return a;
}
};
template<>
struct Apply_Reduce<FuncMax<uint8_t>, /*EltPerPack=*/4> {
__device__ static BytePack<4> reduce(FuncMax<uint8_t> fn, BytePack<4> a, BytePack<4> b) {
uint32_t z=0;
asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z));
return a;
}
};
template<>
struct Apply_Reduce<FuncMax<int8_t>, /*EltPerPack=*/4> {
__device__ static BytePack<4> reduce(FuncMax<int8_t> fn, BytePack<4> a, BytePack<4> b) {
int32_t z=0;
asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(a.u32) : "r"(a.u32), "r"(b.u32), "r"(z));
return a;
}
};
#endif
#define SPECIALIZE_REDUCE(Fn, T, EltPerPack, Vec, expr_of_x_y) \
template<> \
struct Apply_Reduce<Fn<T>, EltPerPack> { \
__device__ __forceinline__ static BytePack<sizeof(Vec)> reduce( \
Fn<T> fn, BytePack<sizeof(Vec)> a, BytePack<sizeof(Vec)> b \
) { \
Vec x = fromPack<Vec>(a); \
Vec y = fromPack<Vec>(b); \
return toPack<Vec>(expr_of_x_y); \
} \
};
#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
SPECIALIZE_REDUCE(FuncSum, half, 1, half, __hadd(x, y))
SPECIALIZE_REDUCE(FuncSum, half, 2, half2, __hadd2(x, y))
SPECIALIZE_REDUCE(FuncProd, half, 1, half, __hmul(x, y))
SPECIALIZE_REDUCE(FuncProd, half, 2, half2, __hmul2(x, y))
#else
SPECIALIZE_REDUCE(FuncSum, half, 1, half, __float2half(__half2float(x) + __half2float(y)))
SPECIALIZE_REDUCE(FuncProd, half, 1, half, __float2half(__half2float(x) * __half2float(y)))
#endif
#if __CUDA_ARCH__ >= 800
SPECIALIZE_REDUCE(FuncMin, half, 1, half, __hmin(x, y))
SPECIALIZE_REDUCE(FuncMin, half, 2, half2, __hmin2(x, y))
SPECIALIZE_REDUCE(FuncMax, half, 1, half, __hmax(x, y))
SPECIALIZE_REDUCE(FuncMax, half, 2, half2, __hmax2(x, y))
#else
SPECIALIZE_REDUCE(FuncMin, half, 1, half, __float2half(fminf(__half2float(x), __half2float(y))))
SPECIALIZE_REDUCE(FuncMax, half, 1, half, __float2half(fmaxf(__half2float(x), __half2float(y))))
#endif
#if defined(RCCL_BFLOAT16)
#if __CUDA_ARCH__ >= 800
SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 1, __nv_bfloat16, __hadd(x, y))
SPECIALIZE_REDUCE(FuncSum, __nv_bfloat16, 2, __nv_bfloat162, __hadd2(x, y))
SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 1, __nv_bfloat16, __hmul(x, y))
SPECIALIZE_REDUCE(FuncProd, __nv_bfloat16, 2, __nv_bfloat162, __hmul2(x, y))
SPECIALIZE_REDUCE(FuncMin, __nv_bfloat16, 1, __nv_bfloat16, __hmin(x, y))
SPECIALIZE_REDUCE(FuncMin, __nv_bfloat16, 2, __nv_bfloat162, __hmin2(x, y))
SPECIALIZE_REDUCE(FuncMax, __nv_bfloat16, 1, __nv_bfloat16, __hmax(x, y))
SPECIALIZE_REDUCE(FuncMax, __nv_bfloat16, 2, __nv_bfloat162, __hmax2(x, y))
#else
SPECIALIZE_REDUCE(FuncSum, rccl_bfloat16, 1, rccl_bfloat16, (rccl_bfloat16)((float)(x) + (float)(y)))
SPECIALIZE_REDUCE(FuncProd, rccl_bfloat16, 1, rccl_bfloat16, (rccl_bfloat16)((float)(x) * (float)(y)))
SPECIALIZE_REDUCE(FuncMin, rccl_bfloat16, 1, rccl_bfloat16, (rccl_bfloat16)(fminf((float)(x), (float)(y))))
SPECIALIZE_REDUCE(FuncMax, rccl_bfloat16, 1, rccl_bfloat16, (rccl_bfloat16)(fmaxf((float)(x), (float)(y))))
#endif
#endif
#undef SPECIALIZE_REDUCE
////////////////////////////////////////////////////////////////////////////////
// Apply_PreOp
// General recursive definition (EltPerPack > 1)
template<typename Fn, int EltPerPack>
struct Apply_PreOp {
static constexpr bool IsIdentity = Apply_PreOp<Fn, EltPerPack/2>::IsIdentity;
template<int Size>
__device__ static BytePack<Size> preOp(Fn fn, BytePack<Size> a) {
#if __cpp_if_constexpr
if constexpr(!IsIdentity) {
#else
if (!IsIdentity) {
#endif
      // The `if (!IsIdentity)` condition is not strictly necessary, but it may help the
      // compiler in that it won't have to tear a register apart for no reason
      // just to put it back together again.
a.half[0] = Apply_PreOp<Fn, EltPerPack/2>::preOp(fn, a.half[0]);
a.half[1] = Apply_PreOp<Fn, EltPerPack/2>::preOp(fn, a.half[1]);
}
return a;
}
};
// Base case definition (EltPerPack == 1); the default is the identity function.
template<typename Fn>
struct Apply_PreOp<Fn, /*EltPerPack=*/1> {
static constexpr bool IsIdentity = true;
template<int Size>
__device__ static BytePack<Size> preOp(Fn fn, BytePack<Size> a) {
return a;
}
};
// Nonsensical base case definition (EltPerPack == 0).
template<typename Fn>
struct Apply_PreOp<Fn, /*EltPerPack=*/0> {
static constexpr bool IsIdentity = true;
__device__ static BytePack<0> preOp(Fn fn, BytePack<0> a) {
return {};
}
};
////////////////////////////////////////////////////////////////////////////////
// Apply_PostOp
// General recursive definition (EltPerPack > 1)
template<typename Fn, int EltPerPack>
struct Apply_PostOp {
static constexpr bool IsIdentity = Apply_PostOp<Fn, EltPerPack/2>::IsIdentity;
template<int Size>
__device__ static BytePack<Size> postOp(Fn fn, BytePack<Size> a) {
#if __cpp_if_constexpr
if constexpr(!IsIdentity) {
#else
if (!IsIdentity) {
#endif
      // The `if (!IsIdentity)` condition is not strictly necessary, but it may help the
      // compiler in that it won't have to tear a register apart for no reason
      // just to put it back together again.
a.half[0] = Apply_PostOp<Fn, EltPerPack/2>::postOp(fn, a.half[0]);
a.half[1] = Apply_PostOp<Fn, EltPerPack/2>::postOp(fn, a.half[1]);
}
return a;
}
};
// Base case definition (EltPerPack == 1); the default is the identity function.
template<typename Fn>
struct Apply_PostOp<Fn, /*EltPerPack=*/1> {
static constexpr bool IsIdentity = true;
template<int Size>
__device__ static BytePack<Size> postOp(Fn fn, BytePack<Size> a) {
return a;
}
};
// Nonsensical base case definition (EltPerPack == 0).
template<typename Fn>
struct Apply_PostOp<Fn, /*EltPerPack=*/0> {
static constexpr bool IsIdentity = true;
__device__ static BytePack<0> postOp(Fn fn, BytePack<0> a) {
return {};
}
};
////////////////////////////////////////////////////////////////////////////////
// FuncPreMulSum
// General definition for all integral types, float, and double.
template<typename T>
struct FuncPreMulSum {
using EltType = T;
T scalar;
__device__ FuncPreMulSum(uint64_t opArg=0) {
union { uint64_t u64; T val; };
u64 = opArg;
scalar = val;
}
};
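// opArg carries the raw bit pattern of the user-supplied scaling factor; the union
// above reinterprets its low bits as a value of type T so that the preOp defined
// further below can multiply each element by this scalar before the values are summed.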
template<>
struct FuncPreMulSum<half> {
using EltType = half;
#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
half2 scalar;
__device__ FuncPreMulSum(uint64_t opArg=0) {
union { uint64_t u64; half val; };
u64 = opArg;
scalar.x = val;
scalar.y = val;
}
#else
float scalar;
__device__ FuncPreMulSum(uint64_t opArg=0) {
union { uint64_t u64; half val; };
u64 = opArg;
scalar = __half2float(val);
}
#endif
};
#if defined(RCCL_BFLOAT16)
template<>
struct FuncPreMulSum<rccl_bfloat16> {
using EltType = rccl_bfloat16;
#if __CUDA_ARCH__ >= 800
__nv_bfloat162 scalar;
__device__ FuncPreMulSum(uint64_t opArg=0) {
union { uint64_t u64; __nv_bfloat16 val; };
u64 = opArg;
scalar.x = val;
scalar.y = val;
}
#else
float scalar;
__device__ FuncPreMulSum(uint64_t opArg=0) {
union { uint64_t u64; rccl_bfloat16 val; };
u64 = opArg;
scalar = (float)(val);
}
#endif
};
#endif
template<typename T>
struct Apply_Reduce<FuncPreMulSum<T>, /*EltPerPack=*/1> {
__device__ static BytePack<sizeof(T)> reduce(FuncPreMulSum<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
// FuncPreMulSum reduce dispatches to FuncSum.
return Apply_Reduce<FuncSum<T>, 1>::reduce(FuncSum<T>(), a, b);
}
};
// PreOp of FuncPreMulSum for integral types, float, and double.
template<typename T>
struct Apply_PreOp<FuncPreMulSum<T>, /*EltPerPack=*/1> {
static constexpr bool IsIdentity = false;
__device__ static BytePack<sizeof(T)> preOp(FuncPreMulSum<T> fn, BytePack<sizeof(T)> a) {
return toPack<T>(fromPack<T>(a) * fn.scalar);
}
};
////////////////////////////////////////////////////////////////////////////////
// Apply_PreOp of FuncPreMulSum for float16.
template<>
struct Apply_PreOp<FuncPreMulSum<half>, /*EltPerPack=*/1> {
static constexpr bool IsIdentity = false;
__device__ static BytePack<sizeof(half)> preOp(FuncPreMulSum<half> fn, BytePack<sizeof(half)> a) {
#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
return toPack<half>(__hmul(fromPack<half>(a), fn.scalar.x));
#else
return toPack<half>(__float2half(__half2float(fromPack<half>(a)) * fn.scalar));
#endif
}
};
#if __CUDA_ARCH__ >= 530 && __CUDA_ARCH__ != 610
template<>
struct Apply_PreOp<FuncPreMulSum<half>, /*EltPerPack=*/2> {
static constexpr bool IsIdentity = false;
__device__ static BytePack<sizeof(half2)> preOp(FuncPreMulSum<half> fn, BytePack<sizeof(half2)> a) {
return toPack<half2>(__hmul2(fromPack<half2>(a), fn.scalar));
}
};
#endif
////////////////////////////////////////////////////////////////////////////////
// Apply_PreOp of FuncPreMulSum for bfloat16.
#if defined(RCCL_BFLOAT16)
template<>
struct Apply_PreOp<FuncPreMulSum<rccl_bfloat16>, /*EltPerPack=*/1> {
static constexpr bool IsIdentity = false;
__device__ static BytePack<sizeof(rccl_bfloat16)> preOp(
FuncPreMulSum<rccl_bfloat16> fn, BytePack<sizeof(rccl_bfloat16)> a
) {
#if __CUDA_ARCH__ >= 800
return toPack<__nv_bfloat16>(__hmul(fromPack<__nv_bfloat16>(a), fn.scalar.x));
#else
return toPack<rccl_bfloat16>((rccl_bfloat16)((float)(fromPack<rccl_bfloat16>(a)) * fn.scalar));
#endif
}
};
#if __CUDA_ARCH__ >= 800
template<>
struct Apply_PreOp<FuncPreMulSum<rccl_bfloat16>, /*EltPerPack=*/2> {
static constexpr bool IsIdentity = false;
__device__ static BytePack<sizeof(__nv_bfloat162)> preOp(
FuncPreMulSum<__nv_bfloat16> fn, BytePack<sizeof(__nv_bfloat162)> a
) {
return toPack<__nv_bfloat162>(__hmul2(fromPack<__nv_bfloat162>(a), fn.scalar));
}
};
#endif
#endif
////////////////////////////////////////////////////////////////////////////////
// FuncSumPostDiv
template<typename T>
struct IsFloatingPoint: std::false_type {};
template<>
struct IsFloatingPoint<half>: std::true_type {};
#if defined(RCCL_BFLOAT16)
template<>
struct IsFloatingPoint<rccl_bfloat16>: std::true_type {};
#endif
template<>
struct IsFloatingPoint<float>: std::true_type {};
template<>
struct IsFloatingPoint<double>: std::true_type {};
template<typename T, bool IsFloating=IsFloatingPoint<T>::value>
struct FuncSumPostDiv_IntOnly;
template<typename T>
struct FuncSumPostDiv: FuncSumPostDiv_IntOnly<T> {
__device__ FuncSumPostDiv(uint64_t opArg=0):
FuncSumPostDiv_IntOnly<T>(opArg) {
}
};
template<typename T>
struct FuncSumPostDiv_IntOnly<T, /*IsFloating=*/false>: FuncSum<T> {
using EltType = T;
int divisor;
__device__ FuncSumPostDiv_IntOnly(uint64_t opArg=0): divisor(opArg) {}
};
template<typename T>
struct FuncSumPostDiv_IntOnly<T, /*IsFloating=*/true> {
static_assert(sizeof(T)!=sizeof(T), "FuncSumPostDiv is only for implementing ncclAvg on integral types.");
};
template<typename T>
struct Apply_Reduce<FuncSumPostDiv<T>, /*EltPerPack=*/1>:
Apply_Reduce<FuncSum<T>, 1> {
__device__ static BytePack<sizeof(T)> reduce(FuncSumPostDiv<T> fn, BytePack<sizeof(T)> a, BytePack<sizeof(T)> b) {
// FuncSumPostDiv reduce dispatches to FuncSum.
return Apply_Reduce<FuncSum<T>, 1>::reduce(FuncSum<T>(), a, b);
}
};
template<typename T>
struct Apply_PostOp<FuncSumPostDiv<T>, /*EltPerPack=*/1> {
static constexpr bool IsIdentity = false;
__device__ static BytePack<sizeof(T)> postOp(FuncSumPostDiv<T> fn, BytePack<sizeof(T)> a) {
return toPack<T>(fromPack<T>(a) / fn.divisor);
}
};
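// Together, these two specializations realize averaging for integral types: the
// reduction itself is a plain FuncSum, and only the final postOp divides by
// fn.divisor (taken from opArg, presumably the rank count when implementing ncclAvg).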
////////////////////////////////////////////////////////////////////////////////
// Apply_LoadMultimem
#define SIZEOF_BytePack_field_u16 2
#define PTX_REG_BytePack_field_u16 "h"
#define SIZEOF_BytePack_field_u32 4
#define PTX_REG_BytePack_field_u32 "r"
#define SIZEOF_BytePack_field_u64 8
#define PTX_REG_BytePack_field_u64 "l"
#define DEFINE_Apply_LoadMultimem(Fn, T, op, ptx_ty, pack_field) \
template<> \
struct Apply_LoadMultimem<Fn<T>, SIZEOF_BytePack_field_##pack_field> { \
static constexpr int PackSize = SIZEOF_BytePack_field_##pack_field; \
__device__ static BytePack<PackSize> load(Fn<T> fn, uintptr_t addr) { \
BytePack<PackSize> ans; \
asm("multimem.ld_reduce.relaxed.sys.global." #op "." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field) \
: "l"(addr)); \
return ans; \
} \
};
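// For example, DEFINE_Apply_LoadMultimem(FuncSum, uint32_t, add, u32, u32) further below
// expands to Apply_LoadMultimem<FuncSum<uint32_t>, 4>, whose load() issues
// "multimem.ld_reduce.relaxed.sys.global.add.u32" and returns the result in ans.u32.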
#define DEFINE_Apply_LoadMultimem_v4(Fn, T, op, ptx_ty, pack_field) \
template<> \
struct Apply_LoadMultimem<Fn<T>, 4*(SIZEOF_BytePack_field_##pack_field)> { \
static constexpr int PackSize = 4*(SIZEOF_BytePack_field_##pack_field); \
__device__ static BytePack<PackSize> load(Fn<T> fn, uintptr_t addr) { \
BytePack<PackSize> ans; \
asm("multimem.ld_reduce.relaxed.sys.global." #op ".v4." #ptx_ty " {%0,%1,%2,%3}, [%4];" \
: "=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[0]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[1]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[2]), \
"=" PTX_REG_BytePack_field_##pack_field(ans.pack_field[3]) \
: "l"(addr)); \
return ans; \
} \
};
#define DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(Fn, T, op, ptx_ty, pack_field) \
DEFINE_Apply_LoadMultimem_v4(Fn, T, op, ptx_ty, pack_field) \
template<> \
struct Apply_LoadMultimem<Fn<T>, sizeof(T)> { \
__device__ static BytePack<sizeof(T)> load(Fn<T> fn, uintptr_t addr) { \
BytePack<2*sizeof(T)> tmp; \
asm("multimem.ld_reduce.relaxed.sys.global." #op "." #ptx_ty " %0, [%1];" \
: "=" PTX_REG_BytePack_field_##pack_field(tmp.pack_field) \
: "l"(addr & -uintptr_t(sizeof(T)))); \
return tmp.half[(addr/sizeof(T))%2]; \
} \
};
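// The extra specialization in the "subhalf" variant handles a single sub-32-bit element:
// it performs the paired (x2) load at an element-aligned address and then returns
// whichever half of the loaded pair corresponds to the requested address.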
template<typename Fn, int BytePerPack>
struct Apply_LoadMultimem {
__device__ static BytePack<BytePerPack> load(Fn fn, uintptr_t addr) {
//__trap();
return {};
}
};
#if __CUDA_ARCH__ >= 900 && CUDART_VERSION >= 12010
template<typename Fn>
struct LoadMultimem_BigPackSize {
using T = typename Fn::EltType;
static constexpr bool IsSum = std::is_same<Fn, FuncSum<T>>::value ||
std::is_same<Fn, FuncPreMulSum<T>>::value ||
std::is_same<Fn, FuncSumPostDiv<T>>::value;
static constexpr bool IsMinOrMax = std::is_same<Fn, FuncMin<T>>::value ||
std::is_same<Fn, FuncMax<T>>::value;
static constexpr bool IsFloat = IsFloatingPoint<T>::value;
static constexpr int BigPackSize =
IsFloat && IsSum && sizeof(T) < 8 ? 16 :
IsFloat && IsSum ? 8 :
IsFloat && IsMinOrMax && sizeof(T)==2 ? 16 :
!IsFloat && (IsSum||IsMinOrMax) && sizeof(T)>=4 ? sizeof(T) :
/*multimem.ld_reduce not supported:*/ 0;
};
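// Summary of the table above: floating-point sums get the widest supported vector
// (16 bytes for f16/bf16/f32, 8 bytes for f64), fp16/bf16 min/max also get 16 bytes,
// integer add/min/max are limited to a scalar access of at least 4 bytes, and every
// other combination reports 0, which disables multimem.ld_reduce for that case.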
DEFINE_Apply_LoadMultimem(FuncSum, uint32_t, add, u32, u32)
DEFINE_Apply_LoadMultimem(FuncMin, uint32_t, min, u32, u32)
DEFINE_Apply_LoadMultimem(FuncMax, uint32_t, max, u32, u32)
DEFINE_Apply_LoadMultimem(FuncSum, int32_t, add, s32, u32)
DEFINE_Apply_LoadMultimem(FuncMin, int32_t, min, s32, u32)
DEFINE_Apply_LoadMultimem(FuncMax, int32_t, max, s32, u32)
DEFINE_Apply_LoadMultimem(FuncSum, uint64_t, add, u64, u64)
DEFINE_Apply_LoadMultimem(FuncMin, uint64_t, min, u64, u64)
DEFINE_Apply_LoadMultimem(FuncMax, uint64_t, max, u64, u64)
DEFINE_Apply_LoadMultimem(FuncSum, int64_t, add, u64, u64)
DEFINE_Apply_LoadMultimem(FuncMin, int64_t, min, s64, u64)
DEFINE_Apply_LoadMultimem(FuncMax, int64_t, max, s64, u64)
DEFINE_Apply_LoadMultimem(FuncSum, float, add, f32, u32)
DEFINE_Apply_LoadMultimem_v4(FuncSum, float, add, f32, u32)
DEFINE_Apply_LoadMultimem(FuncSum, double, add, f64, u64)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncSum, half, add, f16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMin, half, min, f16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMax, half, max, f16x2, u32)
#if defined(__CUDA_BF16_TYPES_EXIST__)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncSum, __nv_bfloat16, add, bf16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMin, __nv_bfloat16, min, bf16x2, u32)
DEFINE_Apply_LoadMultimem_v4x2_and_subhalf(FuncMax, __nv_bfloat16, max, bf16x2, u32)
#endif
#else
template<typename Fn>
struct LoadMultimem_BigPackSize {
static constexpr int BigPackSize = 0;
};
#endif
#undef DEFINE_Apply_LoadMultimem
#undef DEFINE_Apply_LoadMultimem_v4
#undef DEFINE_Apply_LoadMultimem_v4x2_and_subhalf
#undef SIZEOF_BytePack_field_u64
#undef PTX_REG_BytePack_field_u64
#undef SIZEOF_BytePack_field_u32
#undef PTX_REG_BytePack_field_u32
#undef SIZEOF_BytePack_field_u16
#undef PTX_REG_BytePack_field_u16
#endif // REDUCE_KERNEL_H_
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
/*This file is now generated in CMake*/
// #include "reduce_scatter.h"
// #include "common.h"
// #include "collectives.h"
// IMPL_COLL_R(ReduceScatter);
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "devcomm.h"
#include "collectives.h"
#include "primitives.h"
namespace {
template<typename T, typename RedOp, typename Proto>
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void runRing(ncclWorkElem *args) {
#else
__device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) {
#endif
const int tid = threadIdx.x;
const int nthreads = args->nWarps*WARP_SIZE;
const int bid = args->bid;
const int nChannels = args->nChannels;
ncclRing *ring = &ncclShmem.channel.ring;
int const *ringRanks = ring->userRanks;
const ssize_t chunkSize = int(Proto::calcBytePerStep()/sizeof(T) * (Proto::Id == NCCL_PROTO_SIMPLE ? REDUCESCATTER_CHUNKSTEPS : 1));
// We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
const ssize_t minChunkSizeLL128 = int(nthreads*(Proto::calcBytePerGrain()/sizeof(T))/2);
const int nranks = ncclShmem.comm.nRanks;
const ssize_t loopSize = nChannels*chunkSize;
const ssize_t size = args->count;
#if defined (ENABLE_TIMELINE)
int elems = 0, totalElems = 0;
uint64_t clkStamp = 0ULL;
struct ncclDevComm* comm = &ncclShmem.comm;
uint64_t entryStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_REDUCE_SCATTER_ENTRY, 0, entryStamp, comm->cpuTimestamp);
#endif
Primitives<T, RedOp, FanSymmetric<1>, 0, Proto, 0>
prims(tid, nthreads, &ring->prev, &ring->next, args->sendbuff, args->recvbuff, args->redOpArg, 0, args->connIndex, args->connIndex);
#ifdef HYGON_SDMA_FEATURE
prims.ringIx = ring->index;
INIT_PRIMS_SDMA(prims, args);
#endif
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t realChunkSize;
if (Proto::Id == NCCL_PROTO_SIMPLE) {
realChunkSize = min(chunkSize, divUp(size-gridOffset, nChannels));
realChunkSize = roundUp(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
}
else if (Proto::Id == NCCL_PROTO_LL)
realChunkSize = size-gridOffset < loopSize ? args->lastChunkSize : chunkSize;
else if (Proto::Id == NCCL_PROTO_LL128)
realChunkSize = min(divUp(size-gridOffset, nChannels*minChunkSizeLL128)*minChunkSizeLL128, chunkSize);
realChunkSize = int(realChunkSize);
ssize_t chunkOffset = gridOffset + bid*int(realChunkSize);
/////////////// begin ReduceScatter steps ///////////////
ssize_t offset;
int nelem = min(realChunkSize, size-chunkOffset);
int rankDest;
// step 0: push data to next GPU
rankDest = ringRanks[nranks-1];
offset = chunkOffset + rankDest * size;
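      // Note: the send buffer is laid out as nranks contiguous blocks of `size` elements,
      // so `rankDest * size` selects the block that is ultimately owned by rank rankDest.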
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_SEND_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.send(offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_SEND_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
// k-2 steps: reduce and copy to next GPU
for (int j=2; j<nranks; ++j) {
rankDest = ringRanks[nranks-j];
offset = chunkOffset + rankDest * size;
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_RECV_REDUCE_SEND_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.recvReduceSend(offset, nelem);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_RECV_REDUCE_SEND_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
}
// step k-1: reduce this buffer and data, which will produce the final result
rankDest = ringRanks[0];
offset = chunkOffset + rankDest * size;
#if defined (ENABLE_TIMELINE)
elems = max(0, nelem);
clkStamp = __builtin_amdgcn_s_memrealtime();
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_RECV_REDUCE_COPY_ENTRY, elems*sizeof(T), clkStamp, comm->cpuTimestamp);
#endif
prims.recvReduceCopy(offset, chunkOffset, nelem, /*postOp=*/true);
#if defined (ENABLE_TIMELINE)
totalElems += elems;
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_PRIM_RECV_REDUCE_COPY_EXIT, elems*sizeof(T), __builtin_amdgcn_s_memrealtime() - clkStamp, comm->cpuTimestamp);
#endif
}
#if defined (ENABLE_TIMELINE)
Timeline::CollectGpuPrimEvent(comm->gpuEventContext, TIMELINE_EVENT_REDUCE_SCATTER_EXIT, totalElems*sizeof(T), __builtin_amdgcn_s_memrealtime() - entryStamp, comm->cpuTimestamp);
#endif
}
}
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
using Proto = ProtoSimple<REDUCESCATTER_CHUNKSTEPS/REDUCESCATTER_SLICESTEPS, REDUCESCATTER_SLICESTEPS>;
runRing<T, RedOp, Proto>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_LL128> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
runRing<T, RedOp, ProtoLL128>(args);
}
};
template<typename T, typename RedOp>
struct RunWorkElement<ncclFuncReduceScatter, T, RedOp, NCCL_ALGO_NVLS, NCCL_PROTO_SIMPLE> {
__device__ __forceinline__ void run(ncclWorkElem *args) {
const int tid = threadIdx.x;
const int bid = args->bid;
const int nChannels = args->nChannels;
struct ncclNvls* nvls = &ncclShmem.channel.nvls;
const ssize_t chunkSize = int(args->lastChunkSize);
const ssize_t size = args->count;
const ssize_t loopSize = nChannels*chunkSize;
const int nThreadsScatter = 128 + WARP_SIZE;
const int nThreadsReduce = 384;
const int tidEndScatter = nThreadsScatter;
const int tidEndReduce = tidEndScatter + nThreadsReduce;
using Proto = ProtoSimple<1, 1>;
if (tid < tidEndScatter) {
// Scatter
Primitives<T, RedOp, FanAsymmetric<0, NCCL_MAX_NVLS_ARITY>, /*Direct=*/0, Proto, 0>
prims(tid, nThreadsScatter, NULL, nvls->up, args->sendbuff, NULL,
args->redOpArg, 0*Proto::MaxGroupWidth, 0, 0);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.scatter(offset, nvls->nHeads*size, nelem, size, -1, 0);
}
} else if (tid < tidEndReduce) {
// Reduce through NVLS
Primitives<T, RedOp, FanAsymmetric<1, 0>, /*Direct=*/0, Proto, 0>
prims(tid-tidEndScatter, nThreadsReduce, &nvls->down, NULL, NULL, args->recvbuff,
args->redOpArg, 3*Proto::MaxGroupWidth, 1, 1);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
prims.recv(offset, nelem);
}
}
}
};
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "sendrecv.h"
#include "common.h"
#include "collectives.h"
IMPL_COLL_P(SendRecv);
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "devcomm.h"
#include "collectives.h"
#include "primitives.h"
#if defined(ENABLE_NPKIT)
#include "npkit/npkit.h"
#endif
template<typename T, typename RedOp>
struct RunWork<ncclFuncSendRecv, T, RedOp, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE> {
template<typename Proto>
__device__ void runSend(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) {
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
#if defined(ENABLE_NPKIT)
bool isNpKitThread = (tid == 0);
int npKitCtxIdx = blockIdx.x * NCCL_MAX_WORK_ELEMENTS_P2P + group;
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (isNpKitThread) {
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
if (args->peer == ncclShmem.comm.rank) {
struct ncclWorkElemP2p* recvArgs = args-1;
void* recvBuff = reinterpret_cast<void*>(uintptr_t(recvArgs->buffHi32)<<32 | recvArgs->buffLo32);
if (buff != recvBuff) {
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
reduceCopy<COLL_UNROLL, RedOp, T, 0,1,1, 0,1,1, /*PreOpSrcs=*/0>
(tid, nthreads, 0, nullptr, false, 1, &buff, 1, &recvBuff, count);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
} else {
int chunkSize = args->chunkSize/sizeof(T);
if (args->proto == NCCL_PROTO_LL) chunkSize /= 2;
int const peer = args->peer;
Primitives<T, RedOp, FanAsymmetric<0, 1>, 0, Proto, 1> prims
(tid, nthreads, nullptr, &peer, buff, nullptr, /*redOpArg(ignored)=*/0, group, args->connIndex, args->connIndex);
#ifdef HYGON_SDMA_FEATURE
prims.ringIx = 0;
INIT_PRIMS_SDMA(prims, args);
#endif
#if defined(ENABLE_NPKIT)
if (isNpKitThread) {
prims.npKitCtxIdx = npKitCtxIdx;
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_SEND_ENTRY)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_SEND_ENTRY, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
size_t offset = 0;
do {
int nelem = min(size_t(chunkSize), count-offset);
prims.directSend(offset, offset, nelem);
offset += nelem;
} while(offset < count);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_SEND_EXIT)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_SEND_EXIT, count*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#ifdef HYGON_SDMA_FEATURE
if (tid == 0 && prims.useSdmaCopy && prims.sdmaCountEnabe) {
*ncclShmem.channel.sdmaQueue.ptrSdmaCopyCount += prims.sdmaCopyCount;
*ncclShmem.channel.sdmaQueue.ptrAllCopyCount += prims.allCopyCount;
PRINT_DEBUG("runSend bid:%d sdmaCopyCount:%d allCopyCount:%d sumSdma:%d sumAll:%d\n",
(int)blockIdx.x, prims.sdmaCopyCount, prims.allCopyCount,
*ncclShmem.channel.sdmaQueue.ptrSdmaCopyCount,
*ncclShmem.channel.sdmaQueue.ptrAllCopyCount);
}
#endif
}
}
template<typename Proto>
__device__ void runRecv(const int tid, const int nthreads, const uint8_t group, struct ncclWorkElemP2p* args) {
#if defined(ENABLE_NPKIT)
bool isNpKitThread = (tid == 0);
int npKitCtxIdx = blockIdx.x * NCCL_MAX_WORK_ELEMENTS_P2P + group;
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
if (isNpKitThread) {
uint64_t* cpuTimestamp = ncclShmem.comm.cpuTimestamp;
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
if (args->peer != ncclShmem.comm.rank) {
void* buff = reinterpret_cast<void*>(uintptr_t(args->buffHi32)<<32 | args->buffLo32);
ssize_t count = reinterpret_cast<size_t>(size_t(args->countHi32)<<32 | args->countLo32);
int chunkSize = args->chunkSize/sizeof(T);
if (args->proto == NCCL_PROTO_LL) chunkSize /= 2; // This is to account for chunkEffectiveSize
int const peer = args->peer;
Primitives<T, RedOp, FanAsymmetric<1, 0>, 0, Proto, 1> prims
(tid, nthreads, &peer, nullptr, nullptr, buff, /*redOpArg(ignored)=*/0, group, args->connIndex, args->connIndex);
#ifdef HYGON_SDMA_FEATURE
prims.ringIx = 0;
prims.useSdmaCopy = 0;
#endif
#if defined(ENABLE_NPKIT)
if (isNpKitThread) {
prims.npKitCtxIdx = npKitCtxIdx;
}
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_RECV_ENTRY)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_RECV_ENTRY, count*sizeof(T), 0, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
prims.npKitDataProcessTotalTime = 0;
}
#endif
size_t offset = 0;
do {
int nelem = min(size_t(chunkSize), count-offset);
prims.directRecv(offset, nelem);
offset += nelem;
} while(offset < count);
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_SEND_RECV_RECV_EXIT)
if (isNpKitThread) {
NpKit::CollectGpuEvent(NPKIT_EVENT_SEND_RECV_RECV_EXIT, count*sizeof(T), prims.npKitDataProcessTotalTime, NPKIT_GET_GPU_TIMESTAMP(),
ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
}
#endif
}
}
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
__device__ void run(ncclWork *work) {
#else
__device__ __attribute__((noinline)) void run(ncclWork *work) {
#endif
struct ncclWorkElemP2p* args = work->p2pElems;
int ngroups = args->ngroups;
int tid = threadIdx.x;
int wid = tid / WARP_SIZE;
    // This has to work even for groups of 2.5 warps (which is 8 groups, and means 3
    // warps for send, 2 warps for recv).
    // warpStarts were rounded down by integer division, but for the group number we need
    // to round the other way around, so we mirror wid and then mirror the group back.
#define NWARPS (NCCL_MAX_NTHREADS/WARP_SIZE)
uint8_t group = ngroups-1- (NWARPS-1-wid) * ngroups / NWARPS;
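    // Worked example (sketch): with NWARPS == 20 and ngroups == 8 this maps wid 0-1 to
    // group 0, wid 2-4 to group 1, wid 5-6 to group 2, wid 7-9 to group 3, and so on,
    // giving each even (recv) group 2 warps and each odd (send) group 3 warps.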
args += group;
tid -= args->warpStart * WARP_SIZE;
int nthreads = args->nWarps * WARP_SIZE;
if (args->p2pType == ncclWorkP2pTypeUnused) return;
if (tid >= nthreads || args->peer == -1) return;
// Select Proto here
// This is to allow the same kernel to run multiple primitives on different warps (thread groups)
if ((group%2) == 0) {
if (args->proto == NCCL_PROTO_LL) {
runRecv<ProtoLL>(tid, nthreads, group, args);
} else {
#if defined(__gfx90a__)
runRecv<ProtoSimple<1,1,8>>(tid, nthreads, group, args);
#elif defined(__gfx908__)
runRecv<ProtoSimple<1,1,4>>(tid, nthreads, group, args);
#else
runRecv<ProtoSimple<1,1>>(tid, nthreads, group, args);
#endif
}
} else {
if (args->proto == NCCL_PROTO_LL) {
runSend<ProtoLL>(tid, nthreads, group, args);
} else {
#if defined(__gfx90a__)
runSend<ProtoSimple<1,1,8>>(tid, nthreads, group, args);
#elif defined(__gfx908__)
runSend<ProtoSimple<1,1,4>>(tid, nthreads, group, args);
#else
runSend<ProtoSimple<1,1>>(tid, nthreads, group, args);
#endif
}
}
}
};
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
#include "msccl/msccl_lifecycle.h"
NCCL_API(ncclResult_t, ncclGather, const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream) {
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
sendcount, datatype, root, 0, ncclSum, mscclFuncGather, comm, stream);
}
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
size_t rankOffset = sendcount * ncclTypeSize(datatype);
if (sendcount == 0) return ncclSuccess;
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
NCCLCHECK(ncclGroupStart());
if (rank == root) {
for (int r=0; r<nRanks; r++)
NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, sendcount, datatype, r, comm, stream));
}
NCCLCHECK(ncclSend(sendbuff, sendcount, datatype, root, comm, stream));
NCCLCHECK(ncclGroupEnd());
return ncclSuccess;
}
/*************************************************************************
* Copyright (c) Microsoft Corporation.
* Licensed under the MIT License.
************************************************************************/
#include "enqueue.h"
#include "msccl/msccl_parser.h"
#include "msccl/msccl_setup.h"
#include "msccl/msccl_status.h"
#include <cstdio>
#include <cstdlib>
NCCL_API(ncclResult_t, mscclLoadAlgo, const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank);
ncclResult_t mscclLoadAlgo(const char *mscclAlgoFilePath, mscclAlgoHandle_t *mscclAlgoHandle, int rank) {
mscclStatus& status = mscclGetStatus();
if (status.freeAlgoHandles.size() == 0) {
WARN("MSCCL: MSCCL_MAX_NUM_ALGOS (%d) limit reached", MSCCL_MAX_NUM_ALGOS);
return ncclInvalidUsage;
}
*mscclAlgoHandle = *status.freeAlgoHandles.rbegin();
status.freeAlgoHandles.pop_back();
struct mscclAlgo* hostAlgo;
NCCLCHECK(ncclCalloc(&hostAlgo, 1));
NCCLCHECK(mscclGetAlgoFromXmlFile(mscclAlgoFilePath, hostAlgo, rank));
status.hostAlgos[*mscclAlgoHandle] = hostAlgo;
struct mscclAlgo* devAlgo;
NCCLCHECK(ncclCudaMalloc(&devAlgo, 1));
CUDACHECK(hipMemcpy(devAlgo, hostAlgo, sizeof(struct mscclAlgo), hipMemcpyHostToDevice));
status.devAlgos[*mscclAlgoHandle] = devAlgo;
return ncclSuccess;
}
NCCL_API(ncclResult_t, mscclRunAlgo,
const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream);
ncclResult_t mscclRunAlgo(
const void* sendBuff, const size_t sendCounts[], const size_t sDisPls[],
void* recvBuff, const size_t recvCounts[], const size_t rDisPls[],
size_t count, ncclDataType_t dataType, int root, int peer, ncclRedOp_t op,
mscclAlgoHandle_t mscclAlgoHandle, ncclComm_t comm, hipStream_t stream) {
mscclStatus& status = mscclGetStatus();
struct mscclAlgo* hostAlgo = status.hostAlgos[mscclAlgoHandle];
struct mscclAlgo* devAlgo = status.devAlgos[mscclAlgoHandle];
NCCLCHECK(mscclGetCaptureStatus(stream));
NCCLCHECK(mscclSetupCount(hostAlgo, comm, count, dataType));
NCCLCHECK(mscclSetupScratch(hostAlgo, stream));
NCCLCHECK(mscclSetupSyncFlags(stream));
if (status.connectedAlgos[comm].find(mscclAlgoHandle) == status.connectedAlgos[comm].end()) {
hipStreamCaptureMode mode = hipStreamCaptureModeRelaxed;
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
NCCLCHECK(mscclSetupConnections(hostAlgo, comm));
CUDACHECK(hipThreadExchangeStreamCaptureMode(&mode));
status.connectedAlgos[comm].insert(mscclAlgoHandle);
}
NCCLCHECK(mscclSetupProxy(hostAlgo, comm, stream));
NCCLCHECK(mscclSetupKernel(sendBuff, recvBuff, count, dataType, op, hostAlgo, devAlgo, comm, stream));
return ncclSuccess;
}
NCCL_API(ncclResult_t, mscclUnloadAlgo, mscclAlgoHandle_t mscclAlgoHandle);
ncclResult_t mscclUnloadAlgo(mscclAlgoHandle_t mscclAlgoHandle) {
mscclStatus& status = mscclGetStatus();
free(status.hostAlgos[mscclAlgoHandle]);
status.hostAlgos.erase(mscclAlgoHandle);
NCCLCHECK(ncclCudaFree(status.devAlgos[mscclAlgoHandle]));
status.devAlgos.erase(mscclAlgoHandle);
status.freeAlgoHandles.push_back(mscclAlgoHandle);
for (auto &s : status.connectedAlgos) {
s.second.erase(mscclAlgoHandle);
}
return ncclSuccess;
}
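// Typical MSCCL lifecycle sketch (illustrative only, error handling omitted; the file
// name is hypothetical): load an algorithm once, run it any number of times, then
// unload it to recycle the handle.
//   mscclAlgoHandle_t h;
//   mscclLoadAlgo("allreduce_ring.xml", &h, rank);
//   mscclRunAlgo(sendbuf, nullptr, nullptr, recvbuf, nullptr, nullptr,
//                count, ncclFloat, /*root=*/0, /*peer=*/0, ncclSum, h, comm, stream);
//   mscclUnloadAlgo(h);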
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
#include "nccl.h"
#include "msccl/msccl_lifecycle.h"
#include "hipprof/hip_prof_rccl_api.h"
NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) {
struct NvtxParamsReduce {
size_t bytes;
int root;
ncclRedOp_t op;
};
constexpr nvtxPayloadSchemaEntry_t ReduceSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Root", nullptr, 0, offsetof(NvtxParamsReduce, root)},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsReduce, op)}
};
NvtxParamsReduce payload{count * ncclTypeSize(datatype), root, op};
NVTX3_FUNC_WITH_PARAMS(Reduce, ReduceSchema, payload)
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
count, datatype, root, 0, op, mscclFuncReduce, comm, stream);
}
struct ncclInfo info = { ncclFuncReduce, "Reduce",
sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */
REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS };
RCCL_CB_SPAWNER_OBJECT(ncclReduce, &info);
return ncclEnqueueCheck(&info);
}
/*************************************************************************
* Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
#include "nccl.h"
#include "msccl/msccl_lifecycle.h"
#include "hipprof/hip_prof_rccl_api.h"
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream);
ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) {
struct NvtxParamsReduceScatter {
size_t bytes;
ncclRedOp_t op;
};
constexpr nvtxPayloadSchemaEntry_t ReduceScatterSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Message size [bytes]"},
{0, NVTX_PAYLOAD_ENTRY_NCCL_REDOP, "Reduction operation", nullptr, 0,
offsetof(NvtxParamsReduceScatter, op)}
};
NvtxParamsReduceScatter payload{recvcount * ncclTypeSize(datatype), op};
NVTX3_FUNC_WITH_PARAMS(ReduceScatter, ReduceScatterSchema, payload)
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
recvcount, datatype, 0, 0, op, mscclFuncReduceScatter, comm, stream);
}
struct ncclInfo info = { ncclFuncReduceScatter, "ReduceScatter",
sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */
REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS };
RCCL_CB_SPAWNER_OBJECT(ncclReduceScatter, &info);
return ncclEnqueueCheck(&info);
}
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
#include "msccl/msccl_lifecycle.h"
NCCL_API(ncclResult_t, ncclScatter, const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream);
ncclResult_t ncclScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream) {
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, recvbuff, nullptr, nullptr,
recvcount, datatype, root, 0, ncclSum, mscclFuncScatter, comm, stream);
}
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
size_t rankOffset = recvcount * ncclTypeSize(datatype);
if (recvcount == 0) return ncclSuccess;
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
NCCLCHECK(ncclGroupStart());
if (rank == root) {
for (int r=0; r<nRanks; r++)
NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, recvcount, datatype, r, comm, stream));
}
NCCLCHECK(ncclRecv(recvbuff, recvcount, datatype, root, comm, stream));
NCCLCHECK(ncclGroupEnd());
return ncclSuccess;
}
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#include "enqueue.h"
#include "collectives.h"
#include "argcheck.h" // Need some checks here since we access comm
#include "msccl/msccl_lifecycle.h"
#include "hipprof/hip_prof_rccl_api.h"
struct NvtxParamsSendRecv {
size_t bytes;
int peer;
};
constexpr const nvtxPayloadSchemaEntry_t SendRecvSchema[] = {
{0, NVTX_PAYLOAD_ENTRY_TYPE_SIZE, "Bytes"},
{0, NVTX_PAYLOAD_ENTRY_TYPE_INT, "Peer rank", nullptr, 0, offsetof(NvtxParamsSendRecv, peer)}
};
NCCL_API(ncclResult_t, ncclSend, const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
NVTX3_FUNC_WITH_PARAMS(Send, SendRecvSchema, payload)
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
sendbuff, nullptr, nullptr, nullptr, nullptr, nullptr,
count, datatype, 0, peer, ncclSum, mscclFuncSend, comm, stream);
}
struct ncclInfo info = { ncclFuncSend, "Send",
NULL, (void*)sendbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
1, 1 };
RCCL_CB_SPAWNER_OBJECT(ncclSend, &info);
ncclResult_t ret;
NCCLCHECK(ncclGroupStart());
ret = ncclEnqueueCheck(&info);
NCCLCHECK(ncclGroupEnd());
return ret;
}
NCCL_API(ncclResult_t, ncclRecv, void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream);
ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
NvtxParamsSendRecv payload{count * ncclTypeSize(datatype), peer};
NVTX3_FUNC_WITH_PARAMS(Recv, SendRecvSchema, payload)
if (mscclAvailable() && !mscclIsCaller()) {
return mscclEnqueueCheck(
nullptr, nullptr, nullptr, recvbuff, nullptr, nullptr,
count, datatype, 0, peer, ncclSum, mscclFuncRecv, comm, stream);
}
struct ncclInfo info = { ncclFuncRecv, "Recv",
NULL, recvbuff, count, datatype, ncclSum, peer, comm, stream, /* Args */
1, 1 };
RCCL_CB_SPAWNER_OBJECT(ncclRecv, &info);
ncclResult_t ret;
NCCLCHECK(ncclGroupStart());
ret = ncclEnqueueCheck(&info);
NCCLCHECK(ncclGroupEnd());
return ret;
}
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "core.h"
#include "nccl_net.h"
#include <stdlib.h>
#include <stdarg.h>
#include <sys/syscall.h>
int ncclDebugLevel = -1;
static int pid = -1;
static char hostname[1024];
thread_local int ncclDebugNoWarn = 0;
char ncclLastError[1024] = ""; // Global string for the last error in human readable form
uint64_t ncclDebugMask = NCCL_INIT|NCCL_ENV; // Default debug sub-system mask is INIT and ENV
FILE *ncclDebugFile = stdout;
pthread_mutex_t ncclDebugLock = PTHREAD_MUTEX_INITIALIZER;
std::chrono::steady_clock::time_point ncclEpoch;
static __thread int tid = -1;
void ncclDebugInit() {
pthread_mutex_lock(&ncclDebugLock);
if (ncclDebugLevel != -1) { pthread_mutex_unlock(&ncclDebugLock); return; }
const char* nccl_debug = getenv("NCCL_DEBUG");
int tempNcclDebugLevel = -1;
if (nccl_debug == NULL) {
tempNcclDebugLevel = NCCL_LOG_NONE;
} else if (strcasecmp(nccl_debug, "VERSION") == 0) {
tempNcclDebugLevel = NCCL_LOG_VERSION;
} else if (strcasecmp(nccl_debug, "WARN") == 0) {
tempNcclDebugLevel = NCCL_LOG_WARN;
} else if (strcasecmp(nccl_debug, "INFO") == 0) {
tempNcclDebugLevel = NCCL_LOG_INFO;
} else if (strcasecmp(nccl_debug, "ABORT") == 0) {
tempNcclDebugLevel = NCCL_LOG_ABORT;
} else if (strcasecmp(nccl_debug, "TRACE") == 0) {
tempNcclDebugLevel = NCCL_LOG_TRACE;
}
/* Parse the NCCL_DEBUG_SUBSYS env var
* This can be a comma separated list such as INIT,COLL
* or ^INIT,COLL etc
*/
char* ncclDebugSubsysEnv = getenv("NCCL_DEBUG_SUBSYS");
if (ncclDebugSubsysEnv != NULL) {
int invert = 0;
if (ncclDebugSubsysEnv[0] == '^') { invert = 1; ncclDebugSubsysEnv++; }
ncclDebugMask = invert ? ~0ULL : 0ULL;
char *ncclDebugSubsys = strdup(ncclDebugSubsysEnv);
char *subsys = strtok(ncclDebugSubsys, ",");
while (subsys != NULL) {
uint64_t mask = 0;
if (strcasecmp(subsys, "INIT") == 0) {
mask = NCCL_INIT;
} else if (strcasecmp(subsys, "COLL") == 0) {
mask = NCCL_COLL;
} else if (strcasecmp(subsys, "P2P") == 0) {
mask = NCCL_P2P;
} else if (strcasecmp(subsys, "SHM") == 0) {
mask = NCCL_SHM;
} else if (strcasecmp(subsys, "NET") == 0) {
mask = NCCL_NET;
} else if (strcasecmp(subsys, "GRAPH") == 0) {
mask = NCCL_GRAPH;
} else if (strcasecmp(subsys, "TUNING") == 0) {
mask = NCCL_TUNING;
} else if (strcasecmp(subsys, "ENV") == 0) {
mask = NCCL_ENV;
} else if (strcasecmp(subsys, "ALLOC") == 0) {
mask = NCCL_ALLOC;
} else if (strcasecmp(subsys, "CALL") == 0) {
mask = NCCL_CALL;
} else if (strcasecmp(subsys, "PROXY") == 0) {
mask = NCCL_PROXY;
} else if (strcasecmp(subsys, "NVLS") == 0) {
mask = NCCL_NVLS;
} else if (strcasecmp(subsys, "ALL") == 0) {
mask = NCCL_ALL;
}
if (mask) {
if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask;
}
subsys = strtok(NULL, ",");
}
free(ncclDebugSubsys);
}
// Cache pid and hostname
getHostName(hostname, 1024, '.');
pid = getpid();
/* Parse and expand the NCCL_DEBUG_FILE path and
* then create the debug file. But don't bother unless the
* NCCL_DEBUG level is > VERSION
*/
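  // For example, NCCL_DEBUG_FILE="nccl.%h.%p.log" expands %h to the hostname and %p to
  // the pid via the parsing loop below; %% emits a literal '%'.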
const char* ncclDebugFileEnv = getenv("NCCL_DEBUG_FILE");
if (tempNcclDebugLevel > NCCL_LOG_VERSION && ncclDebugFileEnv != NULL) {
int c = 0;
char debugFn[PATH_MAX+1] = "";
char *dfn = debugFn;
while (ncclDebugFileEnv[c] != '\0' && c < PATH_MAX) {
if (ncclDebugFileEnv[c++] != '%') {
*dfn++ = ncclDebugFileEnv[c-1];
continue;
}
switch (ncclDebugFileEnv[c++]) {
case '%': // Double %
*dfn++ = '%';
break;
case 'h': // %h = hostname
dfn += snprintf(dfn, PATH_MAX, "%s", hostname);
break;
case 'p': // %p = pid
dfn += snprintf(dfn, PATH_MAX, "%d", pid);
break;
default: // Echo everything we don't understand
*dfn++ = '%';
*dfn++ = ncclDebugFileEnv[c-1];
break;
}
}
*dfn = '\0';
if (debugFn[0] != '\0') {
FILE *file = fopen(debugFn, "w");
if (file != nullptr) {
setbuf(file, nullptr); // disable buffering
ncclDebugFile = file;
}
}
}
ncclEpoch = std::chrono::steady_clock::now();
__atomic_store_n(&ncclDebugLevel, tempNcclDebugLevel, __ATOMIC_RELEASE);
pthread_mutex_unlock(&ncclDebugLock);
}
/* Common logging function used by the INFO, WARN and TRACE macros
* Also exported to the dynamically loadable Net transport modules so
* they can share the debugging mechanisms and output files
*/
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) {
if (__atomic_load_n(&ncclDebugLevel, __ATOMIC_ACQUIRE) == -1) ncclDebugInit();
if (ncclDebugNoWarn != 0 && level == NCCL_LOG_WARN) { level = NCCL_LOG_INFO; flags = ncclDebugNoWarn; }
// Save the last error (WARN) as a human readable string
if (level == NCCL_LOG_WARN) {
pthread_mutex_lock(&ncclDebugLock);
va_list vargs;
va_start(vargs, fmt);
(void) vsnprintf(ncclLastError, sizeof(ncclLastError), fmt, vargs);
va_end(vargs);
pthread_mutex_unlock(&ncclDebugLock);
}
if (ncclDebugLevel < level || ((flags & ncclDebugMask) == 0)) return;
if (tid == -1) {
tid = syscall(SYS_gettid);
}
int cudaDev;
if (!(level == NCCL_LOG_TRACE && flags == NCCL_CALL)) {
cudaGetDevice(&cudaDev);
}
char buffer[1024];
size_t len = 0;
if (level == NCCL_LOG_WARN) {
len = snprintf(buffer, sizeof(buffer), "\n%s:%d:%d [%d] %s:%d NCCL WARN ",
hostname, pid, tid, cudaDev, filefunc, line);
} else if (level == NCCL_LOG_INFO) {
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] NCCL INFO ", hostname, pid, tid, cudaDev);
} else if (level == NCCL_LOG_TRACE && flags == NCCL_CALL) {
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d NCCL CALL ", hostname, pid, tid);
} else if (level == NCCL_LOG_TRACE) {
auto delta = std::chrono::steady_clock::now() - ncclEpoch;
double timestamp = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count()*1000;
len = snprintf(buffer, sizeof(buffer), "%s:%d:%d [%d] %f %s:%d NCCL TRACE ",
hostname, pid, tid, cudaDev, timestamp, filefunc, line);
}
if (len) {
va_list vargs;
va_start(vargs, fmt);
len += vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs);
va_end(vargs);
buffer[len++] = '\n';
fwrite(buffer, 1, len, ncclDebugFile);
}
}
NCCL_PARAM(SetThreadName, "SET_THREAD_NAME", 0);
void ncclSetThreadName(pthread_t thread, const char *fmt, ...) {
  // pthread_setname_np is a nonstandard GNU extension and
  // needs the following feature test macro
#ifdef _GNU_SOURCE
if (ncclParamSetThreadName() != 1) return;
char threadName[NCCL_THREAD_NAMELEN];
va_list vargs;
va_start(vargs, fmt);
vsnprintf(threadName, NCCL_THREAD_NAMELEN, fmt, vargs);
va_end(vargs);
pthread_setname_np(thread, threadName);
#endif
}
/*************************************************************************
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
/* Define weak symbols used to allow libnccl_static.a to work with older libcudart_static.a */
enum cudaError_t { cudaErrorStubLibrary = 34 };
extern "C" {
cudaError_t cudaStreamGetCaptureInfo_v2(...) __attribute__((visibility("hidden"))) __attribute((weak));
cudaError_t cudaStreamGetCaptureInfo_v2(...) { return cudaErrorStubLibrary; }
cudaError_t cudaUserObjectCreate(...) __attribute__((visibility("hidden"))) __attribute((weak));
cudaError_t cudaUserObjectCreate(...) { return cudaErrorStubLibrary; }
cudaError_t cudaGraphRetainUserObject(...) __attribute__((visibility("hidden"))) __attribute((weak));
cudaError_t cudaGraphRetainUserObject(...) { return cudaErrorStubLibrary; }
cudaError_t cudaStreamUpdateCaptureDependencies(...) __attribute__((visibility("hidden"))) __attribute((weak));
cudaError_t cudaStreamUpdateCaptureDependencies(...) { return cudaErrorStubLibrary; }
cudaError_t cudaGetDriverEntryPoint(...) __attribute__((visibility("hidden"))) __attribute((weak));
cudaError_t cudaGetDriverEntryPoint(...) { return cudaErrorStubLibrary; }
}