Commit 7dc4e964 authored by wanghan

Initial commit: RCCL auto-tuning project
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_BOOTSTRAP_H_
#define NCCL_BOOTSTRAP_H_
#include "nccl.h"
#include "comm.h"
struct ncclBootstrapHandle {
uint64_t magic;
union ncclSocketAddress addr;
};
static_assert(sizeof(struct ncclBootstrapHandle) <= sizeof(ncclUniqueId), "Bootstrap handle is too large to fit inside NCCL unique ID");
ncclResult_t bootstrapNetInit();
ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFromEnv);
ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle);
ncclResult_t bootstrapInit(struct ncclBootstrapHandle* handle, struct ncclComm* comm);
ncclResult_t bootstrapSplit(struct ncclBootstrapHandle* handle, struct ncclComm* comm, struct ncclComm* parent, int color, int key, int* parentRanks);
ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size);
ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size);
ncclResult_t bootstrapBarrier(void* commState, int *ranks, int rank, int nranks, int tag);
ncclResult_t bootstrapIntraNodeAllGather(void* commState, int *ranks, int rank, int nranks, void* allData, int size);
ncclResult_t bootstrapIntraNodeBroadcast(void* commState, int *ranks, int rank, int nranks, int root, void* bcastData, int size);
ncclResult_t bootstrapClose(void* commState);
ncclResult_t bootstrapAbort(void* commState);
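/* Illustrative call sequence (a sketch, not part of the original header; variable
 * names are placeholders): the root rank creates the bootstrap handle, every rank
 * then initializes its bootstrap state and can exchange small out-of-band messages.
 *
 *   struct ncclBootstrapHandle handle;
 *   NCCLCHECK(bootstrapNetInit());
 *   NCCLCHECK(bootstrapGetUniqueId(&handle));          // root only; distributed as ncclUniqueId
 *   NCCLCHECK(bootstrapInit(&handle, comm));           // every rank, with its ncclComm
 *   NCCLCHECK(bootstrapAllGather(comm->bootstrap, allData, size));
 *   NCCLCHECK(bootstrapClose(comm->bootstrap));
 */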
#endif
/*************************************************************************
* Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_CHANNEL_H_
#define NCCL_CHANNEL_H_
#include "comm.h"
ncclResult_t initChannel(struct ncclComm* comm, int channelid);
ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
ncclResult_t initCollnetChannel(struct ncclComm* comm, int channelId, struct ncclComm* parent, bool share);
ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks, int collnetNRanks, int nvlsNRanks);
static ncclResult_t ncclChannelComputeBase(struct ncclComm* comm, int peer, int coll, int*channelBase) {
int p2pGroupSize = NCCL_MAX_WORK_ELEMENTS_P2P/2;
int peerNode = comm->rankToNode[peer];
int peerIndex = comm->rankToLocalRank[peer];
int nsteps = comm->maxLocalRanks;
int rankIndex = comm->rankToLocalRank[comm->rank];
int step, delta;
if (coll == ncclFuncSend) {
step = (nsteps + peerIndex - rankIndex)%nsteps;
delta = (comm->nNodes + peerNode - comm->node) % comm->nNodes;
} else if (coll == ncclFuncRecv) {
step = (nsteps + rankIndex - peerIndex)%nsteps;
delta = (comm->nNodes + comm->node - peerNode) % comm->nNodes;
} else {
return ncclInternalError;
}
*channelBase = comm->nNodes > 1 ? delta+(step/p2pGroupSize) : step;
return ncclSuccess;
}
static ncclResult_t ncclChannelComputeFromBase(struct ncclComm* comm, int base, int channelInc, int*channelId) {
//*channelId = (base+comm->p2pChannels[channelInc]) % comm->p2pnChannels;
*channelId = (comm->p2pChannels[base%comm->p2pnChannels]+channelInc) % comm->p2pnChannels;
return ncclSuccess;
}
static ncclResult_t ncclChannelCompute(struct ncclComm* comm, int peer, int channelInc, int coll, int*channelId) {
int base;
NCCLCHECK(ncclChannelComputeBase(comm, peer, coll, &base));
NCCLCHECK(ncclChannelComputeFromBase(comm, base, channelInc, channelId));
return ncclSuccess;
}
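/* Worked example (illustrative only): single-node comm with nNodes == 1,
 * maxLocalRanks == 8, rank 2 sending to peer 5, channelInc == 0.
 *   ncclChannelComputeBase: step = (8 + 5 - 2) % 8 = 3; nNodes == 1, so base = step = 3.
 *   ncclChannelComputeFromBase: *channelId = (p2pChannels[3 % p2pnChannels] + 0) % p2pnChannels.
 * The matching receive on rank 5 (peer 2) computes step = (8 + 5 - 2) % 8 = 3 as well,
 * so sender and receiver agree on the same channel for this pair.
 */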
#endif
/*************************************************************************
* Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_CHECKS_H_
#define NCCL_CHECKS_H_
#include "debug.h"
// Check CUDA RT calls
#define CUDACHECK(cmd) do { \
cudaError_t err = cmd; \
if( err != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(err)); \
return ncclUnhandledCudaError; \
} \
} while(false)
#define CUDACHECKGOTO(cmd, RES, label) do { \
cudaError_t err = cmd; \
if( err != cudaSuccess ) { \
WARN("Cuda failure '%s'", cudaGetErrorString(err)); \
RES = ncclUnhandledCudaError; \
goto label; \
} \
} while(false)
// Report failure but clear error and continue
#define CUDACHECKIGNORE(cmd) do { \
cudaError_t err = cmd; \
if( err != cudaSuccess ) { \
INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, cudaGetErrorString(err)); \
(void) cudaGetLastError(); \
} \
} while(false)
#include <errno.h>
// Check system calls
#define SYSCHECK(call, name) do { \
int retval; \
SYSCHECKVAL(call, name, retval); \
} while (false)
#define SYSCHECKVAL(call, name, retval) do { \
SYSCHECKSYNC(call, name, retval); \
if (retval == -1) { \
WARN("Call to " name " failed : %s", strerror(errno)); \
return ncclSystemError; \
} \
} while (false)
#define SYSCHECKSYNC(call, name, retval) do { \
retval = call; \
if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \
} else { \
break; \
} \
} while(true)
#define SYSCHECKGOTO(statement, RES, label) do { \
if ((statement) == -1) { \
/* Print the back trace*/ \
RES = ncclSystemError; \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while (0);
#define NEQCHECK(statement, value) do { \
if ((statement) != value) { \
/* Print the back trace*/ \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \
return ncclSystemError; \
} \
} while (0);
#define NEQCHECKGOTO(statement, value, RES, label) do { \
if ((statement) != value) { \
/* Print the back trace*/ \
RES = ncclSystemError; \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while (0);
#define EQCHECK(statement, value) do { \
if ((statement) == value) { \
/* Print the back trace*/ \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, ncclSystemError, strerror(errno)); \
return ncclSystemError; \
} \
} while (0);
#define EQCHECKGOTO(statement, value, RES, label) do { \
if ((statement) == value) { \
/* Print the back trace*/ \
RES = ncclSystemError; \
INFO(NCCL_ALL,"%s:%d -> %d (%s)", __FILE__, __LINE__, RES, strerror(errno)); \
goto label; \
} \
} while (0);
// Propagate errors up
#define NCCLCHECK(call) do { \
ncclResult_t RES = call; \
if (RES != ncclSuccess && RES != ncclInProgress) { \
/* Print the back trace*/ \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
return RES; \
} \
} while (0);
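/* Illustrative usage (a sketch; someInitFunction is a hypothetical caller):
 * NCCLCHECK propagates any failure directly to the caller, while NCCLCHECKGOTO
 * routes it to a cleanup label instead.
 *
 *   ncclResult_t someInitFunction(struct ncclComm* comm) {
 *     ncclResult_t ret = ncclSuccess;
 *     NCCLCHECK(bootstrapNetInit());                  // returns early on error
 *     NCCLCHECKGOTO(ncclCommEnsureReady(comm), ret, fail);
 *     return ncclSuccess;
 *   fail:
 *     return ret;                                     // cleanup would go here
 *   }
 */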
#define NCCLCHECKGOTO(call, RES, label) do { \
RES = call; \
if (RES != ncclSuccess && RES != ncclInProgress) { \
/* Print the back trace*/ \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
goto label; \
} \
} while (0);
#define NCCLWAIT(call, cond, abortFlagPtr) do { \
volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
ncclResult_t RES = call; \
if (RES != ncclSuccess && RES != ncclInProgress) { \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
return ncclInternalError; \
} \
if (tmpAbortFlag) NEQCHECK(*tmpAbortFlag, 0); \
} while (!(cond));
#define NCCLWAITGOTO(call, cond, abortFlagPtr, RES, label) do { \
volatile uint32_t* tmpAbortFlag = (abortFlagPtr); \
RES = call; \
if (RES != ncclSuccess && RES != ncclInProgress) { \
if (ncclDebugNoWarn == 0) INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, RES); \
goto label; \
} \
if (tmpAbortFlag) NEQCHECKGOTO(*tmpAbortFlag, 0, RES, label); \
} while (!(cond));
#define NCCLCHECKTHREAD(a, args) do { \
if (((args)->ret = (a)) != ncclSuccess && (args)->ret != ncclInProgress) { \
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, (args)->ret); \
return args; \
} \
} while(0)
#define CUDACHECKTHREAD(a) do { \
if ((a) != cudaSuccess) { \
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \
args->ret = ncclUnhandledCudaError; \
return args; \
} \
} while(0)
#endif
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef COLL_NET_H_
#define COLL_NET_H_
#include "nccl.h"
#include "nccl_net.h"
typedef char collNetHandle_t[NCCL_NET_HANDLE_MAXSIZE];
// Translation to external API
static const char* collNetName(struct ncclComm* comm) { return comm->ncclCollNet->name; }
static ncclResult_t collNetDevices(struct ncclComm* comm, int* ndev) { NCCLCHECK(comm->ncclCollNet->devices(ndev)); return ncclSuccess; }
static ncclResult_t collNetGetProperties(struct ncclComm* comm, int dev, ncclNetProperties_t* props) { NCCLCHECK(comm->ncclCollNet->getProperties(dev, props)); return ncclSuccess; }
static ncclResult_t collNetListen(struct ncclComm* comm, int dev, void* handle, void** listenComm) { NCCLCHECK(comm->ncclCollNet->listen(dev, handle, listenComm)); return ncclSuccess; }
static ncclResult_t collNetConnect(struct ncclComm* comm, void* handles[], int nranks, int rank, void* listenComm, void** collComm) { NCCLCHECK(comm->ncclCollNet->connect(handles, nranks, rank, listenComm, collComm)); return ncclSuccess; }
static ncclResult_t collNetReduceSupport(struct ncclComm* comm, ncclDataType_t dataType, ncclRedOp_t redOp, int* supported) { NCCLCHECK(comm->ncclCollNet->reduceSupport(dataType, redOp, supported)); return ncclSuccess; }
static ncclResult_t collNetRegMr(struct ncclComm* comm, void* collComm, void* data, int size, int type, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMr(collComm, data, size, type, mhandle)); return ncclSuccess; }
/* DMA-BUF support */
static ncclResult_t collNetRegMrDmaBuf(struct ncclComm* comm, void* collComm, void* data, int size, int type, uint64_t offset, int fd, void** mhandle) { NCCLCHECK(comm->ncclCollNet->regMrDmaBuf(collComm, data, size, type, offset, fd, mhandle)); return ncclSuccess; }
static ncclResult_t collNetDeregMr(struct ncclComm* comm, void* collComm, void* mhandle) { NCCLCHECK(comm->ncclCollNet->deregMr(collComm, mhandle)); return ncclSuccess; }
static ncclResult_t collNetIallreduce(struct ncclComm* comm, void* collComm, void* sendData, void* recvData, int count, ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request) {
NCCLCHECK(comm->ncclCollNet->iallreduce(collComm, sendData, recvData, count, dataType, redOp, sendMhandle, recvMhandle, request)); return ncclSuccess; }
static ncclResult_t collNetIflush(struct ncclComm* comm, void* collComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(comm->ncclCollNet->iflush(collComm, data, size, mhandle, request)); return ncclSuccess; }
static ncclResult_t collNetTest(struct ncclComm* comm, void* request, int* done, int* size) { NCCLCHECK(comm->ncclCollNet->test(request, done, size)); return ncclSuccess; }
static ncclResult_t collNetCloseColl(struct ncclComm* comm, void* collComm) { NCCLCHECK(comm->ncclCollNet->closeColl(collComm)); return ncclSuccess; }
static ncclResult_t collNetCloseListen(struct ncclComm* comm, void* listenComm) { NCCLCHECK(comm->ncclCollNet->closeListen(listenComm)); return ncclSuccess; }
static int collNetSupport(struct ncclComm* comm) { return comm->ncclCollNet != nullptr ? 1 : 0; }
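/* Illustrative call sequence (a sketch based only on the wrappers above; all
 * variables shown are placeholders, and handle exchange is done out of band):
 *   if (collNetSupport(comm)) {
 *     void *listenComm, *collComm, *mhandle, *request; int done, size;
 *     collNetHandle_t handle;
 *     NCCLCHECK(collNetListen(comm, dev, handle, &listenComm));
 *     NCCLCHECK(collNetConnect(comm, handles, nranks, rank, listenComm, &collComm));
 *     NCCLCHECK(collNetRegMr(comm, collComm, data, bytes, type, &mhandle));
 *     NCCLCHECK(collNetIallreduce(comm, collComm, data, data, count, dataType, redOp,
 *                                 mhandle, mhandle, &request));
 *     do { NCCLCHECK(collNetTest(comm, request, &done, &size)); } while (!done);
 *     NCCLCHECK(collNetDeregMr(comm, collComm, mhandle));
 *     NCCLCHECK(collNetCloseColl(comm, collComm));
 *     NCCLCHECK(collNetCloseListen(comm, listenComm));
 *   }
 */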
#endif
/*************************************************************************
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_COLLECTIVES_H_
#define NCCL_COLLECTIVES_H_
enum ncclDevRedOp_t {
ncclDevSum, ncclDevProd, ncclDevMax, ncclDevMin,
ncclDevPreMulSum, ncclDevSumPostDiv,
ncclNumDevRedOps
};
struct ncclDevRedOpFull {
ncclDevRedOp_t op;
bool scalarArgIsPtr;
uint64_t scalarArg;
};
#define FUNC_INDEX_P2P (ncclNumTypes+NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumDevRedOps)
#define FUNC_INDEX_ALLTOALL_PIVOT (FUNC_INDEX_P2P+1)
#define FUNC_INDEX(func, devredop, ncclType, al, pr) ((((((func)*ncclNumDevRedOps + (devredop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))
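/* Worked example (illustrative; the concrete enum values are assumptions):
 * with ncclNumDevRedOps == 6, ncclNumTypes == 10 (RCCL_BFLOAT16 builds),
 * NCCL_NUM_ALGORITHMS == 6 and NCCL_NUM_PROTOCOLS == 3,
 *   FUNC_INDEX(ncclFuncAllReduce (4), ncclDevSum (0), ncclInt8 (0),
 *              NCCL_ALGO_RING (1), NCCL_PROTO_SIMPLE (2))
 *     = ((((4*6 + 0)*10 + 0)*6 + 1)*3 + 2) = 4325
 * Under the same assumptions FUNC_INDEX_P2P = 10 + 5*6*3*10*6 = 5410, i.e. it
 * indexes past the entire per-(func, op, type, algo, proto) table.
 */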
#define NCCL_FUNC_NAME(func, algo, proto, devredop, type) \
ncclFunction_##func##_##algo##_##proto##_##devredop##_##type
#define NCCL_ONERANK_REDUCE_NAME(devredop, type) \
ncclFunction_OneRankReduce_##devredop##_##type
#define NCCL_KERN_NAME(func, algo, proto, devredop, type) \
ncclKernel_##func##_##algo##_##proto##_##devredop##_##type
#define NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type) \
ncclKernelDebug_##func##_##algo##_##proto##_##devredop##_##type
#define NCCL_IMPL_NAME(func, algo, proto) \
nccl##func##algo##proto
/* Declare all collective operations */
#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
#define DECL5(func, algo, proto, devredop, type) \
extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
extern __global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead);
#else
#define DECL5(func, algo, proto, devredop, type) \
extern __device__ __attribute__((noinline)) void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \
extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \
extern __global__ void NCCL_KERN_NAME_DEBUG(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead);
#endif
#define SINGLE_ARG(...) __VA_ARGS__
#define CONCAT(a,b) a##b
#define MACRO_IF(cond, t, f) CONCAT(MACRO_IF_, cond)(SINGLE_ARG(t), SINGLE_ARG(f))
#define MACRO_IF_0(t, f) f
#define MACRO_IF_1(t, f) t
#define DECL4(func, algo, devredop, type, undef) \
MACRO_IF(undef, /*undefined*/, DECL5(func, algo, SIMPLE, devredop, type)) \
MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL, devredop, type)) \
MACRO_IF(undef, /*undefined*/, DECL5(func, algo, LL128, devredop, type))
#define DECL3(func, devredop, type, undef) \
DECL4(func, RING, devredop, type, undef) \
DECL4(func, TREE, devredop, type, undef) \
DECL4(func, COLLNET_DIRECT, devredop, type, undef) \
DECL4(func, COLLNET_CHAIN, devredop, type, undef) \
DECL4(func, NVLS, devredop, type, undef) \
DECL4(func, NVLS_TREE, devredop, type, undef)
#if defined(RCCL_BFLOAT16)
#define DECL2(func, devredop, undefForFloat) \
DECL3(func, devredop, int8_t, /*undef=*/0) \
DECL3(func, devredop, uint8_t, /*undef=*/0) \
DECL3(func, devredop, int32_t, /*undef=*/0) \
DECL3(func, devredop, uint32_t, /*undef=*/0) \
DECL3(func, devredop, int64_t, /*undef=*/0) \
DECL3(func, devredop, uint64_t, /*undef=*/0) \
DECL3(func, devredop, half, /*undef=*/undefForFloat) \
DECL3(func, devredop, float, /*undef=*/undefForFloat) \
DECL3(func, devredop, double, /*undef=*/undefForFloat) \
DECL3(func, devredop, rccl_bfloat16, /*undef=*/undefForFloat)
#else
#define DECL2(func, devredop, undefForFloat) \
DECL3(func, devredop, int8_t, /*undef=*/0) \
DECL3(func, devredop, uint8_t, /*undef=*/0) \
DECL3(func, devredop, int32_t, /*undef=*/0) \
DECL3(func, devredop, uint32_t, /*undef=*/0) \
DECL3(func, devredop, int64_t, /*undef=*/0) \
DECL3(func, devredop, uint64_t, /*undef=*/0) \
DECL3(func, devredop, half, /*undef=*/undefForFloat) \
DECL3(func, devredop, float, /*undef=*/undefForFloat) \
DECL3(func, devredop, double, /*undef=*/undefForFloat)
#endif
#define DECL(func) \
DECL2(func, Sum, /*undefForFloat=*/0) \
DECL2(func, Prod, /*undefForFloat=*/0) \
DECL2(func, Min, /*undefForFloat=*/0) \
DECL2(func, Max, /*undefForFloat=*/0) \
DECL2(func, PreMulSum, /*undefForFloat=*/0) \
DECL2(func, SumPostDiv, /*undefForFloat=*/1)
DECL2(Broadcast, Sum, /*undefForFloat=*/0)
DECL(Reduce)
DECL2(AllGather, Sum, /*undefForFloat=*/0)
DECL(ReduceScatter)
DECL(AllReduce)
DECL5(SendRecv, RING, SIMPLE, Sum, int8_t)
DECL5(AllToAllPivot, RING, SIMPLE, Sum, int8_t)
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int8_t)();
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint8_t)();
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int32_t)();
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint32_t)();
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, int64_t)();
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, uint64_t)();
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, half)();
#if defined(RCCL_BFLOAT16)
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, rccl_bfloat16)();
#endif
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, float)();
extern __device__ void NCCL_ONERANK_REDUCE_NAME(PreMulSum, double)();
// CHUNKSIZE must be a multiple of SLICESIZE
#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4)
#define ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2)
#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4)
#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2)
#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4)
#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2)
#define BROADCAST_SLICESTEPS 1
#define BROADCAST_CHUNKSTEPS 1
#define REDUCE_SLICESTEPS 1
#define REDUCE_CHUNKSTEPS 1
#define NCCL_MAX_SLICE_PER_CHUNK 2 // max value of CHUNKSTEPS/SLICESTEPS; must be consistent with the values above
#define ALLTOALL_PIVOT_SLICESTEPS 2
#define ALLTOALL_PIVOT_CHUNKSTEPS 4
// We can't use the enum identifiers like ncclSum, ncclFloat, etc since this
// macro will be used in preprocessor conditionals where enums have no meaning.
#define NCCL_NVLS_SUPPORTS(/*ncclDataType_t*/ type, /*ncclDevRedOp_t*/ red) \
(((type==2 || type==3) && (red==0 || red==2 || red==3)) || \
((type==4 || type==5) && (red==0 || red==2 || red==3)) || \
((type==6 || type==9) && (red==0 || red==2 || red==3)) || \
(type==7 && red==0) || \
(type==8 && red==0))
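/* For reference (an assumption based on the usual ncclDataType_t / ncclDevRedOp_t
 * ordering, since this macro must use raw numbers in preprocessor context):
 *   type: 2=int32, 3=uint32, 4=int64, 5=uint64, 6=half, 7=float, 8=double, 9=bfloat16
 *   red : 0=ncclDevSum, 2=ncclDevMax, 3=ncclDevMin
 * i.e. NVLS reductions cover sum/min/max on 32/64-bit integers, half and bfloat16,
 * and sum only on float and double.
 */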
#endif
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_COMM_H_
#define NCCL_COMM_H_
#include "transport.h"
#include "p2p.h"
#include "collectives.h"
#include "proxy.h"
#include "strongstream.h"
#include <map>
#include <chrono>
#if defined (ENABLE_TIMELINE)
#include "timeline/timeline.h"
#endif
#ifdef HYGON_SDMA_FEATURE
#include "hsa_ext_amd.h"
#include "hsa_extra.h"
#define RCCL_SDMA_QUEUE_NUM 8
#define RCCL_SDMA_QUEUE_DEPTH 64*4096
#endif
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
#define HIPRT_CB
#else
#if CUDART_VERSION < 9000
struct cudaLaunchParams {
void *func;
dim3 gridDim;
dim3 blockDim;
void **args;
size_t sharedMem;
cudaStream_t stream;
};
#endif
#endif
#define CACHE_LINE_SIZE 64
#define MEM_ALIGN 4096
#define CUDA_IPC_MIN 2097152UL
// Channels / LL tuning
#define NCCL_LL_THREAD_THRESHOLD 8
#define NCCL_LL128_THREAD_THRESHOLD 8
#define NCCL_SIMPLE_THREAD_THRESHOLD 64
struct ncclSendMem {
union {
struct {
uint64_t head;
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
void* ptrExchange;
uint64_t redOpArgExchange[2];
char pad2[CACHE_LINE_SIZE-sizeof(void*)-2*sizeof(uint64_t)];
int offsFifo[NCCL_STEPS];
};
char pad3[MEM_ALIGN];
};
};
struct ncclRecvMem {
union {
struct {
uint64_t tail;
char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)];
int sizesFifo[NCCL_STEPS];
int offsFifo[NCCL_STEPS];
int flush; // For GDRCopy-based flush
};
char pad4[MEM_ALIGN];
};
};
enum helperThreadState {ThreadStart, ThreadStop};
#define NCCL_IPC_POOL_SIZE (2*NCCL_MAX_LOCAL_RANKS*NCCL_MAX_OPS)
struct ncclGraphHelperResources {
ncclComm* comm;
pthread_mutex_t threadLock;
pthread_cond_t threadCond;
enum helperThreadState threadState;
void* ipcBases[NCCL_IPC_POOL_SIZE];
int ipcTail;
int ipcHead;
};
struct ncclUserRedOp {
int freeNext; // -1=allocated, otherwise index of next free entry in array
ncclDataType_t datatype;
ncclDevRedOpFull opFull;
};
struct ncclNodeRanks {
int localRanks;
int* localRankToRank;
};
struct ncclDestructor {
struct ncclDestructor* next;
void* obj;
ncclResult_t(*fn)(struct ncclDestructor* me);
};
struct ncclCommCallback {
struct ncclCommCallback* next;
ncclResult_t(*fn)(struct ncclComm* comm, struct ncclCommCallback* cb);
};
struct ncclSharedResources {
int refCount;
struct ncclComm* owner; /* comm which creates this shared res. */
struct ncclChannelPeer* peers[MAXCHANNELS];
struct ncclDevChannelPeer* devPeers[MAXCHANNELS];
/* P2P operation counter, one per channel */
uint64_t p2pOpCount[MAXCHANNELS];
/* Collective operation counter */
uint64_t collOpCount;
int tpNRanks;
int tpNLocalRanks;
int tpNChannels;
int tpP2pNChannels;
int tpP2pChunkSize;
uint64_t magic;
// top parent rank to localRank translation table
int* tpRankToLocalRank;
// Internal streams
struct ncclStrongStream deviceStream, hostStream;
/* proxy related shared res */
struct ncclProxyState* proxyState;
};
struct ncclChannel {
struct ncclChannelPeer** peers;
struct ncclDevChannelPeer** devPeers;
struct ncclRing ring;
int* devRingUserRanks;
struct ncclTree tree;
struct ncclTree collnetChain;
struct ncclDirect collnetDirect;
struct ncclTree binTree;
struct ncclNvls nvls;
int id; // index of this channel
uint32_t workFifoSent; // last used work index+1
/* comm split sharable resources */
struct ncclChannelPeer* collnetPeers;
struct ncclDevChannelPeer* collnetDevPeers;
struct ncclChannelPeer* nvlsPeers;
struct ncclDevChannelPeer* nvlsDevPeers;
#ifdef HYGON_SDMA_FEATURE
struct sdmaQueueInfo sdmaQueue;
#endif
/* When using the mixedHylinkShm function, this specifies the channel's transport type. */
int transportType;
};
struct ncclWorkList {
struct ncclWorkList* next;
struct ncclWork work;
};
struct ncclPointerList {
struct ncclPointerList* next;
void *ptr;
};
struct ncclKernelPlan {
// A kernel plan is also a callback that reclaims itself. Hence this must
// be the first member.
struct ncclCommCallback reclaimer;
struct ncclMemoryPool memPool_ncclProxyOp; // memory to return to comm in cleanup
struct ncclComm* comm;
struct ncclKernelPlan* next;
bool persistent; // aka captured in a graph
bool kernelSpecialized;
void *kernelFn;
int channelUbound; // only channels c < channelUbound are present
int channelCount; // number of channels present
uint64_t channelMask; // which channels are present, channelCount == popcount(channelMask)
bool hasProxyOps; // does any channel have a non-empty proxyOpQueue
int threadPerBlock;
// workHeap fields are null until uploadWorkFifo() or preparePersistentKernel()
struct ncclWork* workHead;
int collOpCount; // zero based for this plan
struct ncclIntruQueue<struct ncclPointerList, &ncclPointerList::next> ipcMemQueue;
struct Channel {
int nWork;
union {
int nWorkElem; // used for coll and reg coll
int p2pTailElem[2]; // used for p2p, indexed by ncclWorkElemP2pType-1
};
size_t collBytes;
struct ncclIntruQueue<struct ncclWorkList, &ncclWorkList::next> workQueue;
struct ncclIntruQueue<struct ncclProxyOp, &ncclProxyOp::enqNext> proxyOpQueue;
} channels[MAXCHANNELS];
};
struct ncclComm {
ncclFunc_t func_coll;
#if defined (ENABLE_TIMELINE)
struct ncclInfo *info;
Timeline * timeline;
#endif
struct ncclMemoryStack memPermanent, memScoped;
// List of destructors to run when comm is destructed
struct ncclDestructor* destructorHead;
struct ncclSharedResources* sharedRes;
/* map to top parent ranks. */
int* topParentRanks;
int* topParentLocalRanks;
struct ncclChannel channels[MAXCHANNELS];
struct ncclPeerInfo* peerInfo;
struct ncclTopoSystem* topo;
#ifdef HYGON_SDMA_FEATURE
bool sdmaCopyEnabe;
bool sdmaCountEnabe;
bool validHsaAgent;
uint32_t sdmaMinCopySize;
hsa_agent_t hsaAgent;
hsa_sdma_group_queue_t sdmaGroupQueue;
#endif
ncclNet_t* ncclNet;
ncclCollNet_t* ncclCollNet;
void* bootstrap;
// Bitmasks for ncclTransportP2pSetup
uint64_t* connectSend;
uint64_t* connectRecv;
uint64_t magic; // Magic number for all network communication. Not a security key -- only goal is to detect mismatches.
uint64_t commHash;
int rank; // my rank in the communicator
int nRanks; // number of GPUs in communicator
int cudaDev; // my cuda device index
//int nvmlDev; // my nvml device index
int compCap; // compute capability of the GPU
int minCompCap, maxCompCap; // min/max compute capability in the communicator
int64_t busId; // my PCI bus ID in int format
cpu_set_t cpuAffinity; // CPU affinity of the GPU
int WarpSize;
int cudaArch; // matches __CUDA_ARCH__ of device
int node;
int nNodes;
int localRank;
int localRanks;
int maxLocalRanks;
int* rankToNode;
int* rankToLocalRank;
int* localRankToRank;
// localRanks and localRanktoRank for all nodes
struct ncclNodeRanks* nodeRanks;
bool checkPointers;
bool dmaBufSupport;
// Counter for tracking CUDA launches (P2P and collectives included)
uint64_t opCount;
// Channels for collectives
int nChannels;
int nvlsChannels;
int collNetChannels;
// Channels (per peer) for p2p
int p2pnChannels;
int p2pnChannelsPerPeer;
int p2pChannels[MAXCHANNELS];
// Channels for mixed
int mixedTransportType;
int nMixedHylinkChannels;
// Should this comm allocate LL buffers for network P2P connections?
bool allocP2pNetLLBuffers;
// Buffer sizes
int buffSizes[NCCL_NUM_PROTOCOLS];
int p2pChunkSize;
// Algorithm/Protocols thresholds
ssize_t threadThresholds[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
/* This attribute can indicate the states of communicators and return code of
* asynchronous NCCL operations. */
ncclResult_t asyncResult;
// Flag to ask NCCL kernels to abort
volatile uint32_t *abortFlag;
volatile uint32_t *childAbortFlag;
uint32_t *abortFlagRefCount;
// Flags for enable P2P NET
uint32_t p2pNet;
uint32_t useIntraNet;
bool hasFineGrain;
// Device side of the communicator (for cudaFree's)
struct ncclDevComm* devComm; // actually = &ncclDevCommAndChannels::comm
// Operation pool.
int workFifoDepth; // size of workFifoHeap[], power of 2
struct ncclWork* workFifoHeap;
struct ncclWork* devWorkFifoHeap;
void* workFifoHeapGdrHandle;
// Work completion notification
uint32_t* workFifoDone/*[MAXCHANNELS]*/; // in cudaHost memory
uint32_t workFifoSent; // Monotonic (mod 1<<32) index of next unused fifo slot.
uint32_t workFifoAckdMin; // Monotonic index of least unprocessed fifo slot over all channels.
// Intra-process sync
struct ncclComm* intraComm0; // leader of intra-process comms (self possible)
struct ncclComm* intraNext; // next of intra-process comms, intraComm0 is head
int intraRank;
int intraRanks;
uint32_t intraBarrierPhase;
char intraPad1[64 - sizeof(uint64_t)];
uint64_t intraBarrierCounter; // only used if this is intraComm0
char intraPad2[64 - sizeof(uint64_t)];
uint64_t intraBarrierGate; // only used if this is intraComm0
struct ncclProxyState* proxyState;
int proxyRefCountOld; /* store proxy post-atomic-sub refcount */
// Whether this communicator uses collNet
int collNetSupport;
uint8_t collNetSupportMatrix[4/*sum,prod,min,max*/][ncclNumTypes];
int intraHighestTransportType;
int* collNetHeads;
int collNetHeadsNum;
/* sharable collNet proxy progress resource. */
struct ncclCollNetSharedRes* collNetSharedRes;
// NVLink SHARP (NVLS) support
int nvlsSupport;
/* sharable NVLS resource. */
struct ncclNvlsSharedRes* nvlsResources;
size_t channelSize; // User requested work size (bytes) for channel partitions
// pools backed by comm->memPermanent
struct ncclMemoryPool memPool_ncclProxyOp;
struct ncclMemoryPool memPool_ncclKernelPlan;
struct ncclMemoryPool memPool_ncclPointerList;
// Next comm in this thread's active ncclGroup[Start|End](). Holds "0x1" when
// this comm is not yet in a group.
struct ncclComm* groupNext;
// Subset of those in groupNext list. Holds 0x1 if not needing preconnect.
struct ncclComm* preconnectNext;
int persistentRefs; // number of persistent plan-lists capturing this comm
struct ncclTasks tasks;
hipStream_t sideStream; // [RCCL] Cached non-captured stream
// user-created reduction ops
int userRedOpCapacity, userRedOpFreeHead;
ncclUserRedOp *userRedOps;
// Queue of things for the main thread to do
struct ncclIntruQueueMpsc<struct ncclCommCallback, &ncclCommCallback::next> callbackQueue;
// List of kernel plans built from tasks.
struct ncclIntruQueue<struct ncclKernelPlan, &ncclKernelPlan::next> planQueue;
// First of the unlaunched kernels in `planQueue`
struct ncclKernelPlan* unlaunchedPlansHead;
hipEvent_t doneEvent;
hipStream_t lastStream;
#ifdef ENABLE_COLLTRACE
struct ncclCollTrace* collTrace;
union ncclCollTraceTail *collTraceTail;
pthread_t collTraceThread;
volatile bool collTraceExit;
#endif
ncclConfig_t config;
// initState makes it easier to reclaim resources when errors happen.
ncclResult_t initState;
// flag to indicate if ncclCommFinalize() is called
bool finalizeCalled;
// shared structures for finalization
int finalizeRankCnt;
// Whether this comm is compatible with MSCCL
bool mscclCompatible;
// Runtime tuner for algorithm and protocol selection
struct {
bool enabled; // Whether tuning is enabled
std::map<uint64_t, int>* workloadCache; // workload hash -> best config index
// Current testing configuration
int currentAlgo; // NCCL_ALGO_RING/TREE/COLLNET
int currentProto; // NCCL_PROTO_SIMPLE/LL/LL128
// Performance tracking
float bestTime;
int bestAlgo;
int bestProto;
// Search state
int searchStep; // Current search step
bool isSearching; // Whether currently searching
uint64_t currentWorkloadHash; // Current workload being tuned
} tuner;
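/* Illustrative use of the fields above (a sketch of intent, not the project's
 * actual implementation): a hash of the current workload keys workloadCache;
 * while isSearching, each searchStep tries one (currentAlgo, currentProto) pair,
 * records its time, keeps the best seen in (bestAlgo, bestProto, bestTime), and
 * finally caches the winner, e.g.:
 *   (*tuner.workloadCache)[tuner.currentWorkloadHash] = bestConfigIndex;
 * where bestConfigIndex is some encoding of (bestAlgo, bestProto).
 */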
};
enum ncclLaunchMode {
ncclLaunchModeInvalid=0,
ncclLaunchModeParallel,
ncclLaunchModeGroup
};
extern enum ncclLaunchMode ncclParamLaunchMode;
void ncclCommPushFree(struct ncclComm* comm, void* buf);
void ncclCommPushCudaFree(struct ncclComm* comm, void* buf);
void ncclCommPushCudaHostFree(struct ncclComm* comm, void* buf);
void ncclCommPushCudaGdrFree(struct ncclComm* comm, void* handle);
inline ncclResult_t ncclCommPollCallbacks(struct ncclComm* comm, bool waitSome) {
ncclResult_t result = ncclSuccess;
struct ncclCommCallback* cb = ncclIntruQueueMpscDequeueAll(&comm->callbackQueue, waitSome);
while (cb != nullptr) {
struct ncclCommCallback* next = cb->next;
ncclResult_t res1 = cb->fn(comm, cb); // may reclaim memory of cb
if (res1 != ncclSuccess) result = res1;
cb = next;
}
NCCLCHECK(result);
return ncclSuccess;
}
inline void ncclCommIntraBarrierIn(struct ncclComm* comm, uint32_t x) {
int phase = comm->intraBarrierPhase;
if (comm->intraRanks == 1) {
// Release everyone (just me).
comm->intraBarrierGate = (uint64_t(x)<<32) | (phase^1);
} else {
struct ncclComm* comm0 = comm->intraComm0;
uint64_t count = __atomic_add_fetch(&comm0->intraBarrierCounter, (uint64_t(x)<<32) + 1, __ATOMIC_RELEASE);
if (uint32_t(count) == uint32_t(comm->intraRanks)) {
// Reset.
__atomic_store_n(&comm0->intraBarrierCounter, 0, __ATOMIC_RELAXED);
// Release everyone.
__atomic_store_n(&comm0->intraBarrierGate, (count>>32<<32) | (phase^1), __ATOMIC_RELEASE);
}
}
}
// returns sum of x values contributed to ncclCommIntraBarrierIn(comm, x)
inline uint32_t ncclCommIntraBarrierOut(struct ncclComm* comm) {
struct ncclComm* comm0 = comm->intraComm0;
comm->intraBarrierPhase ^= 1;
uint32_t phase = comm->intraBarrierPhase;
uint64_t gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED);
if ((gate & 1) != phase) {
uint64_t t0 = clockNano();
do {
// Spin vigorously for first 5us.
if (clockNano()-t0 >= 5*1000) sched_yield();
gate = __atomic_load_n(&comm0->intraBarrierGate, __ATOMIC_RELAXED);
} while ((gate & 1) != phase);
}
if (comm->intraRanks != 1) __atomic_thread_fence(__ATOMIC_ACQUIRE);
return gate>>32;
}
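/* Illustrative pairing (a sketch): every rank of the intra-process group calls
 * In() with a contribution and then Out(), which releases once all ranks have
 * arrived and returns the sum of the contributions:
 *   ncclCommIntraBarrierIn(comm, 1);
 *   uint32_t arrived = ncclCommIntraBarrierOut(comm);   // == comm->intraRanks here
 */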
// Scrambles the bits of non-builtin values of ncclRedOp_t according to the
// communicator memory address. Used to catch bugs so that integer handles
// associated with this communicator won't collide with handles of other
// communicators. This function is its own inverse.
static inline ncclRedOp_t ncclUserRedOpMangle(ncclComm *comm, ncclRedOp_t op) {
// Preserve the built-in values.
if(int(op) < int(ncclNumOps))
return op;
uint64_t h = reinterpret_cast<uint64_t>(comm);
h ^= h >> 32;
h *= 0x9e3779b97f4a7c13u; // Knuth's 64-bit magical hash constant
h >>= 32; // h is now an excellent 32-bit hash of the comm pointer
h &= int(ncclMaxRedOp); // ncclMaxRedOp is a power of 2 minus 1
int op1 = int(h) ^ int(op);
// Since builtin values are preserved, we also have to preserve their preimage.
return op1 < int(ncclNumOps) ? op : ncclRedOp_t(op1);
}
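/* Illustrative property (follows from the code above): mangling is an involution,
 * so applying it twice with the same comm recovers the original handle:
 *   ncclRedOp_t mangled = ncclUserRedOpMangle(comm, op);
 *   assert(ncclUserRedOpMangle(comm, mangled) == op);
 * Built-in ops (op < ncclNumOps) pass through unchanged in both directions.
 */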
ncclResult_t ncclCommEnsureReady(ncclComm_t comm);
ncclResult_t ncclCommSetAsyncError(ncclComm_t comm, ncclResult_t nextState);
#endif
/*************************************************************************
* Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_CORE_H_
#define NCCL_CORE_H_
#include <pthread.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdint.h>
#include <algorithm> // For std::min/std::max
#include "nccl.h"
#ifdef PROFAPI
#define NCCL_API(ret, func, args...) \
__attribute__ ((visibility("default"))) \
__attribute__ ((alias(#func))) \
ret p##func (args); \
extern "C" \
__attribute__ ((visibility("default"))) \
__attribute__ ((weak)) \
ret func(args)
#else
#define NCCL_API(ret, func, args...) \
extern "C" \
__attribute__ ((visibility("default"))) \
ret func(args)
#endif // end PROFAPI
static __inline__ int ncclTypeSize(ncclDataType_t type) {
switch (type) {
case ncclInt8:
case ncclUint8:
return 1;
case ncclFloat16:
#if defined(RCCL_BFLOAT16)
case ncclBfloat16:
#endif
return 2;
case ncclInt32:
case ncclUint32:
case ncclFloat32:
return 4;
case ncclInt64:
case ncclUint64:
case ncclFloat64:
return 8;
default:
return -1;
}
}
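/* Illustrative usage (a sketch; count and datatype are placeholders): convert an
 * element count into bytes, guarding against the -1 returned for unknown types:
 *   int esize = ncclTypeSize(datatype);
 *   size_t nBytes = (esize > 0) ? count * (size_t)esize : 0;
 */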
#include "debug.h"
#include "checks.h"
#include "rocmwrap.h"
#include "alloc.h"
#include "utils.h"
#include "param.h"
#include "nvtx_stub.h"
#endif // end include guard
/*************************************************************************
* Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_CPUSET_H_
#define NCCL_CPUSET_H_
// Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t
static int hexToInt(char c) {
int v = c - '0';
if (v < 0) return -1;
if (v > 9) v = 10 + c - 'a';
if ((v < 0) || (v > 15)) return -1;
return v;
}
#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t))
static ncclResult_t ncclStrToCpuset(const char* str, cpu_set_t* mask) {
uint32_t cpumasks[CPU_SET_N_U32];
int m = CPU_SET_N_U32-1;
cpumasks[m] = 0;
for (int o=0; o<strlen(str); o++) {
char c = str[o];
if (c == ',') {
m--;
cpumasks[m] = 0;
} else {
int v = hexToInt(c);
if (v == -1) break;
cpumasks[m] <<= 4;
cpumasks[m] += v;
}
}
// Copy cpumasks to mask
for (int a=0; m<CPU_SET_N_U32; a++,m++) {
memcpy(((uint32_t*)mask)+a, cpumasks+m, sizeof(uint32_t));
}
return ncclSuccess;
}
static ncclResult_t ncclCpusetToStr(cpu_set_t* mask, char* str) {
int c = 0;
uint8_t* m8 = (uint8_t*)mask;
for (int o=sizeof(cpu_set_t)-1; o>=0; o--) {
if (c == 0 && m8[o] == 0) continue;
sprintf(str+c, "%02x", m8[o]);
c+=2;
if (o && o%4 == 0) {
sprintf(str+c, ",");
c++;
}
}
str[c] = '\0';
return ncclSuccess;
}
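/* Illustrative round trip (a sketch), using the example string from the comment at
 * the top of this file. Note ncclStrToCpuset only writes the words it parsed, so
 * the mask is zeroed first; leading zero bytes are dropped when converting back:
 *   cpu_set_t mask; char buf[2*sizeof(cpu_set_t) + CPU_SET_N_U32];
 *   memset(&mask, 0, sizeof(mask));
 *   NCCLCHECK(ncclStrToCpuset("0003ff,f0003fff", &mask));
 *   NCCLCHECK(ncclCpusetToStr(&mask, buf));   // buf == "03ff,f0003fff"
 */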
#endif
/*************************************************************************
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_CUDAWRAP_H_
#define NCCL_CUDAWRAP_H_
#include <cuda.h>
#include <cuda_runtime.h>
#include "checks.h"
// Is cuMem API usage enabled
extern int ncclCuMemEnable();
#if CUDART_VERSION >= 11030
#include <cudaTypedefs.h>
#else
typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags);
typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion);
typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags);
#endif
#define CUPFN(symbol) pfn_##symbol
// Check CUDA PFN driver calls
#define CUCHECK(cmd) do { \
CUresult err = pfn_##cmd; \
if( err != CUDA_SUCCESS ) { \
const char *errStr; \
(void) pfn_cuGetErrorString(err, &errStr); \
WARN("Cuda failure '%s'", errStr); \
return ncclUnhandledCudaError; \
} \
} while(false)
#define CUCHECKGOTO(cmd, res, label) do { \
CUresult err = pfn_##cmd; \
if( err != CUDA_SUCCESS ) { \
const char *errStr; \
(void) pfn_cuGetErrorString(err, &errStr); \
WARN("Cuda failure '%s'", errStr); \
res = ncclUnhandledCudaError; \
goto label; \
} \
} while(false)
// Report failure but clear error and continue
#define CUCHECKIGNORE(cmd) do { \
CUresult err = pfn_##cmd; \
if( err != CUDA_SUCCESS ) { \
const char *errStr; \
(void) pfn_cuGetErrorString(err, &errStr); \
INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, errStr); \
} \
} while(false)
#define CUCHECKTHREAD(cmd, args) do { \
CUresult err = pfn_##cmd; \
if (err != CUDA_SUCCESS) { \
INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, err); \
args->ret = ncclUnhandledCudaError; \
return args; \
} \
} while(0)
#define DECLARE_CUDA_PFN_EXTERN(symbol,version) extern PFN_##symbol##_v##version pfn_##symbol
#if CUDART_VERSION >= 11030
/* CUDA Driver functions loaded with cuGetProcAddress for versioning */
DECLARE_CUDA_PFN_EXTERN(cuDeviceGet, 2000);
DECLARE_CUDA_PFN_EXTERN(cuDeviceGetAttribute, 2000);
DECLARE_CUDA_PFN_EXTERN(cuGetErrorString, 6000);
DECLARE_CUDA_PFN_EXTERN(cuGetErrorName, 6000);
DECLARE_CUDA_PFN_EXTERN(cuMemGetAddressRange, 3020);
DECLARE_CUDA_PFN_EXTERN(cuCtxCreate, 3020);
DECLARE_CUDA_PFN_EXTERN(cuCtxDestroy, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxGetCurrent, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxSetCurrent, 4000);
DECLARE_CUDA_PFN_EXTERN(cuCtxGetDevice, 2000);
// cuMem API support
DECLARE_CUDA_PFN_EXTERN(cuMemAddressReserve, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemAddressFree, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemCreate, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemGetAllocationGranularity, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemExportToShareableHandle, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemImportFromShareableHandle, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemMap, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemRelease, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemRetainAllocationHandle, 11000);
DECLARE_CUDA_PFN_EXTERN(cuMemSetAccess, 10020);
DECLARE_CUDA_PFN_EXTERN(cuMemUnmap, 10020);
#if CUDA_VERSION >= 11070
DECLARE_CUDA_PFN_EXTERN(cuMemGetHandleForAddressRange, 11070); // DMA-BUF support
#endif
#if CUDA_VERSION >= 12010
/* NVSwitch Multicast support */
DECLARE_CUDA_PFN_EXTERN(cuMulticastAddDevice, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindMem, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastBindAddr, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastCreate, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastGetGranularity, 12010);
DECLARE_CUDA_PFN_EXTERN(cuMulticastUnbind, 12010);
#endif
#endif
/* CUDA Driver functions loaded with dlsym() */
DECLARE_CUDA_PFN_EXTERN(cuInit, 2000);
DECLARE_CUDA_PFN_EXTERN(cuDriverGetVersion, 2020);
DECLARE_CUDA_PFN_EXTERN(cuGetProcAddress, 11030);
ncclResult_t ncclCudaLibraryInit(void);
extern int ncclCudaDriverVersionCache;
extern bool ncclCudaLaunchBlocking; // initialized by ncclCudaLibraryInit()
inline ncclResult_t ncclCudaDriverVersion(int* driver) {
int version = __atomic_load_n(&ncclCudaDriverVersionCache, __ATOMIC_RELAXED);
if (version == -1) {
CUDACHECK(cudaDriverGetVersion(&version));
__atomic_store_n(&ncclCudaDriverVersionCache, version, __ATOMIC_RELAXED);
}
*driver = version;
return ncclSuccess;
}
#endif
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_DEBUG_H_
#define NCCL_DEBUG_H_
#include "nccl_net.h"
#include <stdio.h>
#include <chrono>
#include <type_traits>
#include <limits.h>
#include <string.h>
#include <pthread.h>
// Conform to pthread and NVTX standard
#define NCCL_THREAD_NAMELEN 16
extern int ncclDebugLevel;
extern uint64_t ncclDebugMask;
extern pthread_mutex_t ncclDebugLock;
extern FILE *ncclDebugFile;
extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6)));
// Let code temporarily downgrade WARN into INFO
extern thread_local int ncclDebugNoWarn;
extern char ncclLastError[];
#define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__)
#define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__)
#define TRACE_CALL(...) ncclDebugLog(NCCL_LOG_TRACE, NCCL_CALL, __func__, __LINE__, __VA_ARGS__)
#ifdef ENABLE_TRACE
#define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__)
extern std::chrono::steady_clock::time_point ncclEpoch;
#else
#define TRACE(...)
#endif
void ncclSetThreadName(pthread_t thread, const char *fmt, ...);
#endif
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_DEVICE_H_
#define NCCL_DEVICE_H_
#include "nccl.h"
#include "rccl_bfloat16.h"
#include "align.h"
#if defined(ENABLE_NPKIT)
#include "npkit/npkit_struct.h"
#endif
#if defined (ENABLE_TIMELINE)
#include "timeline/timeline.h"
#endif
#include <stdint.h>
#ifdef HYGON_SDMA_FEATURE
#include "hsa_ext_amd.h"
#include "hsa_extra.h"
#define PRINT_ERR(...)
#define PRINT_INFO(...)
#define PRINT_INFOM(...)
#define PRINT_INFOT(tid, ...)
#define PRINT_DEBUG(...)
#else
#define PRINT_ERR(...)
#define PRINT_INFO(...)
#define PRINT_INFOM(...)
#define PRINT_INFOT(tid, ...)
#define PRINT_DEBUG(...)
#endif
#if defined(ENABLE_NPKIT) && defined(HYGON_SDMA_FEATURE)
#define NPKIT_SET_GPU_EVENT(event, size, cost) \
NpKit::CollectGpuEvent(event, size, cost, NPKIT_GET_GPU_TIMESTAMP(), ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
#define NPKIT_SET_GPU_EVENT_TM(event, size, cost, tm) \
NpKit::CollectGpuEvent(event, size, cost, tm, ncclShmem.comm.npKitEventCollectContexts + npKitCtxIdx);
#else
#define NPKIT_SET_GPU_EVENT(event, size, cost)
#define NPKIT_SET_GPU_EVENT_TM(event, size, cost, tm)
#endif
#ifdef HYGON_SDMA_FEATURE
#define INIT_PRIMS_SDMA(prims, args) \
{ \
prims.useSdmaCopy = args->useSdma; \
prims.sdmaMinCopySize = ncclShmem.channel.sdmaQueue.minCopySize; \
prims.sdmaCountEnabe = ncclShmem.channel.sdmaQueue.copyCountEnabe; \
prims.sdmaCopyCount = 0; \
prims.allCopyCount = 0; \
}
#endif
#define NCCL_NUM_FUNCTIONS 5 // SendRecv and AllToAllPivot not included for now
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclFuncAllToAllPivot, ncclNumFuncs} ncclFunc_t;
extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+2];
#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
#define NCCL_ALGO_TREE 0
#define NCCL_ALGO_RING 1
#define NCCL_ALGO_COLLNET_DIRECT 2
#define NCCL_ALGO_COLLNET_CHAIN 3
#define NCCL_ALGO_NVLS 4
#define NCCL_ALGO_NVLS_TREE 5
extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
#define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
#define NCCL_PROTO_LL 0
#define NCCL_PROTO_LL128 1
#define NCCL_PROTO_SIMPLE 2
extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS];
#define NCCL_MAX_OPS 2048
#define NCCL_STEPS 8
union ncclLLFifoLine {
/* Flags have to be *after* data, because otherwise, an incomplete receive
from the network may receive the flag but not the data.
Note this is assuming that either we receive contiguous chunks of data
(sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */
struct {
uint32_t data1;
uint32_t flag1;
uint32_t data2;
uint32_t flag2;
};
uint64_t v[2];
int4 i4;
};
#define WARP_SIZE warpSize
#define MAXCHANNELS 32
#define NCCL_MAX_NTHREADS 256
#define NCCL_SIMPLE_MAX_NTHREADS NCCL_MAX_NTHREADS
#define NCCL_LL_MAX_NTHREADS NCCL_MAX_NTHREADS
#define NCCL_LL_LINES_PER_THREAD 8
#ifdef TEST_LL_CLEANUP
#define NCCL_LL_CLEAN_MASK 0x078 // Set to 0x100 to disable cleanup
#define NCCL_LL_FLAG_MAX 0x100
#define NCCL_LL_FLAG(a) ((uint32_t)((a) % NCCL_LL_FLAG_MAX))
#else
#define NCCL_LL_CLEAN_MASK 0x7ffffff8
#define NCCL_LL_FLAG(a) ((uint32_t)(a))
#endif
// Make sure the clean mask will last for at least NCCL_STEPS
static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value");
#define NCCL_LL128_LINESIZE 64
#define NCCL_LL128_LINEELEMS (NCCL_LL128_LINESIZE/sizeof(uint64_t))
#define NCCL_LL128_DATAELEMS (NCCL_LL128_LINEELEMS-1)
#define NCCL_LL128_MAX_NTHREADS 256
#define NCCL_LL128_ELEMS_PER_THREAD 28
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 4
#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
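/* Worked numbers for the constants above: each 64-byte LL128 line holds
 * NCCL_LL128_LINEELEMS = 64/8 = 8 uint64_t slots, of which NCCL_LL128_DATAELEMS = 7
 * carry data and 1 is reserved for the flag, i.e. 7/8 (87.5%) of the line is payload.
 * The per-block shmem staging area is NCCL_LL128_SHMEM_SIZE = 4 * 256 = 1024
 * uint64_t elements.
 */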
#define NCCL_DIRECT_WRITE 0x01
#define NCCL_DIRECT_READ 0x02
#define NCCL_DIRECT_NIC 0x04
#define NCCL_IPC_WRITE 0x08
#define NCCL_IPC_READ 0x10
#define NCCL_NVLS_MIN_POLL 0x20
struct ncclConnInfo {
// Regular comm mechanism
char *buffs[NCCL_NUM_PROTOCOLS]; // Local for recv, remote for send
uint64_t *tail; // Local for recv, remote for send
uint64_t *head; // Local for send, remote for recv
int flags; // Direct communication / other flags
int shared; // Buffers are shared
void **ptrExchange; // Pointer exchange for direct communication
uint64_t* redOpArgExchange; // PreOp scaler exchange for direct pull case
int *sizesFifo; // Sizes fifo from GPU to proxy
int *offsFifo; // Buffer fifo from proxy to GPU
uint64_t step; // Keep where we are
uint64_t llLastCleaning;
// GPU's HDP_MEM_FLUSH_ADDR: HDP Memory Coherency Flush Control. This register
// allows software to explicitly initiate a flush read to HDP memory. See more
// descriptions in primitives.h.
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
uint32_t* curr_hdp_reg; // Current GPU's HDP register
};
struct ncclProxyConnector {
int tpRank;
int tpLocalRank;
int sameProcess;
struct ncclProxyConnection* connection;
};
struct ncclConnector {
int connected;
struct ncclProxyConnector proxyConn;
struct ncclTransportComm* transportComm;
void* transportResources;
struct ncclConnInfo conn;
};
struct ncclRing {
// Shortcuts for userRanks[1] and userRanks[n-1]
int prev;
int next;
// Maps an internal nccl index to user-specified rank order. This is necessary
// since we need to know how the user expects data to be ordered across
// devices. Ordered from current device.
int* userRanks;
int index; // This rank's index in the ring
};
// The root of each tree only has one node down (+1 intra-node).
#define NCCL_MAX_TREE_ARITY_TOP 2
// Nodes inside the binary tree can have up to two nodes down (+1 intra-node).
#define NCCL_MAX_TREE_ARITY 3
struct ncclTree {
int depth;
int up;
int down[NCCL_MAX_TREE_ARITY];
};
#define NCCL_MAX_DIRECT_ARITY 7
struct ncclDirect {
int depth;
int out;
int nHeads; // Number of parallel N<->1<->net operations; also the size of up/down
int headRank; // My index in 0..nHeads-1 if I am a head rank, -1 otherwise (no local NIC)
int shift; // Shuffling of send/recv for scatter/gather operations, basically localRank%nHeads
int up[NCCL_MAX_DIRECT_ARITY];
int down[NCCL_MAX_DIRECT_ARITY];
};
#define NCCL_CONN_IDX_P2P_NET 2
#define NCCL_MAX_NVLS_ARITY 8
#define NCCL_MAX_NVLS_TREE_ARITY 3
struct ncclNvls {
int out;
int nHeads; // Number of parallel N<->1<->net operations; also the size of up/down
int headRank; // My index in 0..nHeads-1 if I am a head rank, -1 otherwise (no local NIC)
int up[NCCL_MAX_NVLS_ARITY];
int down;
int treeUp;
int treeDown[NCCL_MAX_NVLS_TREE_ARITY];
int node;
int nNodes;
};
#define NCCL_MAX_CONNS 3
struct ncclChannelPeer {
struct ncclConnector send[NCCL_MAX_CONNS];
struct ncclConnector recv[NCCL_MAX_CONNS];
int refCount;
};
struct ncclDevComm;
#pragma pack(push) /* push current alignment to stack */
#pragma pack(8) /* set alignment to 8 bytes boundary */
/* ncclWork is to be a power of two, currently 8x64 bytes, */
/* to make sure reads to host from the CUDA kernel are aligned. */
/* Make sure to adjust padding at the end of ncclWorkElem. */
#define NCCL_WORK_SIZE 256
enum ncclWorkType : uint8_t {
ncclWorkTypeUnused=0,
ncclWorkTypeColl=1,
ncclWorkTypeP2p=2,
ncclWorkTypeRegColl=3
};
enum ncclWorkP2PType : uint8_t {
ncclWorkP2pTypeUnused=0,
ncclWorkP2pTypeSend,
ncclWorkP2pTypeRecv
};
struct ncclWorkHeader {
union {
int32_t workNext; // when isLast=0: Offset from kernel argument workHead
uint32_t doneAcks; // when isLast=1: Monotonic (mod 1<<32) ack value to send back.
};
uint16_t funcIndex;
uint8_t isLast:1; // last work for this kernel
uint8_t inFifo:1; // is this work in the fifo
enum ncclWorkType type;
};
struct ncclWorkElem {
union {
uint8_t flagBits;
struct {
uint8_t isUsed:1, redOpArgIsPtr:1, regUsed:1, nWarps:5;
};
};
uint8_t direct;
uint8_t bid;
uint8_t nChannels;
struct {
uint32_t root:28;
uint32_t useSdma:2;
uint32_t connIndex:2;
};
const void * sendbuff;
void * recvbuff;
size_t count;
union {
size_t lastChunkSize;
// Pivot A2A kernel computes chunk size itself.
// Instead, it needs the number of bidirectional rings.
size_t pivotA2ANumBiRings;
};
uint64_t redOpArg;
uint64_t opCount;
};
static_assert((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElem)))/sizeof(ncclWorkElem) == 4, "Sanity check: NCCL_MAX_WORK_ELEMENTS == 4");
#define NCCL_MAX_WORK_ELEMENTS 1
struct ncclWorkElemP2p {
struct {
int32_t peer:26;
uint32_t useSdma:2;
uint32_t connIndex:2;
int32_t proto:2;
};
union {
uint16_t flagBits;
struct {
enum ncclWorkP2PType p2pType:4;
uint16_t nWarps:4;
uint16_t warpStart:4;
uint16_t ngroups:4;
};
};
uint16_t opCount;
// Important not to use any fields with greater than 4-byte alignment since
// we need sizeof(ncclWorkElemP2p)==28, but that would be padded up to 32 if
// there were 8-byte fields.
//void* buff;
uint32_t buffHi32, buffLo32; // buff = buffHi32<<32 | buffLo32;
//size_t count;
uint32_t countHi32, countLo32; // count = countHi32<<32 | countLo32;
int chunkSize;
};
static_assert(((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemP2p)))/sizeof(ncclWorkElemP2p)) == 8, "Sanity check: NCCL_MAX_WORK_ELEMENTS_P2P == 8");
#define NCCL_MAX_WORK_ELEMENTS_P2P 2
struct ncclWorkElemReg {
struct ncclWorkElem elem;
void* dnInputs[NCCL_MAX_DIRECT_ARITY+1];
void* dnOutputs[NCCL_MAX_DIRECT_ARITY+1];
void* upOutputs[NCCL_MAX_DIRECT_ARITY+1];
};
#define NCCL_MAX_WORK_ELEMENTS_REG ((NCCL_WORK_SIZE - alignUp(sizeof(ncclWorkHeader), alignof(ncclWorkElemReg)))/sizeof(ncclWorkElemReg))
static_assert(NCCL_MAX_WORK_ELEMENTS_REG == 1, "Sanity check: NCCL_MAX_WORK_ELEMENTS_REG == 1");
// Number of named barriers supported by CUDA
#define NCCL_MAX_GROUPS (NCCL_MAX_NTHREADS/WARP_SIZE)
struct ncclWork {
struct ncclWorkHeader header;
union {
char pad[NCCL_WORK_SIZE - sizeof(struct ncclWorkHeader)];
struct ncclWorkElem elems[NCCL_MAX_WORK_ELEMENTS];
struct ncclWorkElemP2p p2pElems[NCCL_MAX_WORK_ELEMENTS_P2P];
struct ncclWorkElemReg regElems[NCCL_MAX_WORK_ELEMENTS_REG];
};
};
static_assert(sizeof(struct ncclWork) == NCCL_WORK_SIZE, "Sanity check: sizeof(struct ncclWork) == NCCL_WORK_SIZE");
static_assert(sizeof(struct ncclWork)%16 == 0, "Sanity check: sizeof(struct ncclWork)%16 == 0");
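/* Layout arithmetic behind the asserts above (sizes are the natural packed sizes
 * under the pragma pack(8) in effect here, stated as assumptions):
 *   sizeof(ncclWorkHeader) == 8, leaving 256 - 8 = 248 payload bytes per ncclWork.
 *   Coll elements: sizeof(ncclWorkElem)    == 56            -> 248/56 = 4 fit (usage capped at 1 above).
 *   P2P  elements: sizeof(ncclWorkElemP2p) == 28            -> 248/28 = 8 fit (usage capped at 2 above).
 *   Reg  elements: sizeof(ncclWorkElemReg) == 56 + 3*8*8 = 248 -> exactly 1 fits.
 */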
struct ncclDevChannelPeer {
// Stripped version of ncclChannelPeer where we only keep the ncclConnInfo
// instead of the full ncclConnector.
struct ncclConnInfo send[NCCL_MAX_CONNS];
struct ncclConnInfo recv[NCCL_MAX_CONNS];
};
#pragma pack(pop) /* restore original alignment from stack */
#ifdef ENABLE_PROFILING
#define PROFILE_NUM_ITEMS 31
#define PROFILE_NUM_LAUNCHES 1024
struct ncclProf {
uint32_t count;
uint32_t seq; // only entry from first launch is used
struct {
uint64_t line:16;
uint64_t timeStamp:48;
} elem[PROFILE_NUM_ITEMS];
};
static_assert(sizeof(struct ncclProf) == 256, "ncclProf must have size of 256");
#endif
#ifdef ENABLE_COLLTRACE
typedef enum {
ncclCollTraceNotReady = 0,
ncclCollTraceKernelLaunchType = 1,
ncclCollTraceKernelEndType = 2,
ncclCollTraceCollLaunchType = 3,
ncclCollTraceAbortType = 4,
ncclCollTraceDataType = 5,
ncclCollTraceCollElemType = (1<<4),
ncclCollTraceP2pElemType = (1<<5),
} ncclCollTraceDataType_t;
struct ncclCollTrace {
uint8_t type;
uint8_t bid;
int16_t funcIndex;
uint32_t data_0;
uint64_t timeStamp;
union {
uint64_t opCount;
uint32_t p2pOpCount[2];
};
union {
uint64_t data_1;
struct {
uint8_t nWarps;
uint8_t bid;
uint8_t nChannels;
} coll;
struct {
int16_t peer;
uint8_t ngroups:4;
uint8_t connIndex:4;
uint8_t warpStart:4;
uint8_t nWarps:4;
} p2p[2];
};
};
static_assert(sizeof(struct ncclCollTrace) == 8*sizeof(int), "ncclCollTrace must have a pow2 size");
union ncclCollTraceTail{
uint32_t tail;
char padding[4096];
};
#define COLLTRACE_NUM_ITEMS 8192
#endif
#ifdef HYGON_SDMA_FEATURE
struct sdmaQueueInfo {
hsa_sdma_info_t *sdmaInfo;
uint32_t *pkgIndex;
uint32_t minCopySize;
uint32_t copyCountEnabe;
uint32_t sdmaDepth;
uint32_t *ptrSdmaCopyCount;
uint32_t *ptrAllCopyCount;
};
#endif
struct alignas(16) ncclDevChannel {
struct ncclDevChannelPeer** peers;
struct ncclRing ring;
struct ncclTree tree;
struct ncclTree collnetChain;
struct ncclDirect collnetDirect;
struct ncclTree binTree;
struct ncclNvls nvls;
uint32_t* workFifoDone; // Location of done counter, device writes index+1 of last work processed
#ifdef HYGON_SDMA_FEATURE
union {
struct sdmaQueueInfo sdmaQueue;
uint32_t rvsd[12];
};
#endif
};
struct ncclDevComm {
int rank;
int nRanks;
int buffSizes[NCCL_NUM_PROTOCOLS];
// Operation list for aggregation
int workFifoDepth;
struct ncclWork* workFifoHeap; // may be cudaHost or GDR memory
// Flag to ask NCCL kernels to abort
volatile uint32_t* abortFlag;
// Channels, device side
struct ncclDevChannel* channels/*[MAXCHANNELS]*/;
#if defined(ENABLE_NPKIT)
NpKitEventCollectContext* npKitEventCollectContexts;
#endif
#ifdef ENABLE_COLLTRACE
struct ncclCollTrace* collTrace;
union ncclCollTraceTail *collTraceTail;
pthread_t collTraceThread;
#endif
#ifdef ENABLE_PROFILING
struct ncclProf* devProf;
#endif
#if defined (ENABLE_TIMELINE)
TimelineGpuEventContext* gpuEventContext;
#endif
#if defined (ENABLE_NPKIT) || defined (ENABLE_TIMELINE)
uint64_t* cpuTimestamp;
#endif
#ifdef HYGON_SDMA_FEATURE
uint32_t sdmaPkgIndex[8];
uint32_t sdmaCopyCount[MAXCHANNELS];
uint32_t allCopyCount[MAXCHANNELS];
#endif
};
struct alignas(16) ncclDevCommAndChannels {
struct ncclDevComm comm;
struct ncclDevChannel channels[MAXCHANNELS];
};
#ifdef __CUDA_ARCH__
#define NCCL_CUDA_ARCH __CUDA_ARCH__
#else
#define NCCL_CUDA_ARCH 0
#endif
template<typename T>
__host__ __device__ constexpr T min_constexpr(T a) { return a; }
template<typename T, typename ...Ts>
__host__ __device__ constexpr T min_constexpr(T a, T b, Ts ...c) {
return min_constexpr<T>((a < b ? a : b), c...);
}
template<typename T>
__host__ __device__ constexpr T max_constexpr(T a) { return a; }
template<typename T, typename ...Ts>
__host__ __device__ constexpr T max_constexpr(T a, T b, Ts ...c) {
return max_constexpr<T>((a > b ? a : b), c...);
}
// Calculate the unroll factor given:
// * bytePerPack: number of bytes accessed per instruction
// * insns: max permissible unroll value
// * bytes: desired number of in-flight bytes per iteration ( = unroll*bytePerPack)
__host__ __device__ constexpr int ncclCalcUnroll(int bytePerPack, int insns, int bytes) {
return min_constexpr(insns, (bytes + bytePerPack-1)/bytePerPack);
}
// Note that all unroll value logic should depend on a given cudaArch argument
// and not __CUDA_ARCH__ since these need to be host-side executable where the
// arch value is strictly runtime only. By defaulting to NCCL_CUDA_ARCH, device
// side code can elide passing the arch for brevity.
__host__ __device__ constexpr int ncclCollUnroll(int cudaArch = NCCL_CUDA_ARCH) {
// Our collective unroll should move to the same bytes&insns model as NVLS.
return cudaArch >= 800 ? 8 : 4;
}
__host__ __device__ constexpr int ncclNvlsUnrollBytes(int cudaArch = NCCL_CUDA_ARCH) { return 4*16; }
__host__ __device__ constexpr int ncclNvlsUnrollInsns(int cudaArch = NCCL_CUDA_ARCH) { return 16; }
__host__ __device__ constexpr int ncclNvlsUnroll(int bytePerPack, int cudaArch = NCCL_CUDA_ARCH) {
return ncclCalcUnroll(bytePerPack, ncclNvlsUnrollInsns(cudaArch), ncclNvlsUnrollBytes(cudaArch));
}
// The amount of dynamic shmem per warp
__host__ __device__ constexpr int ncclShmemScratchWarpSize(int cudaArch = NCCL_CUDA_ARCH) {
return (max_constexpr<int>(
/*LL */0,
/*LL128 */(NCCL_LL128_SHMEM_ELEMS_PER_THREAD*WARP_SIZE)*sizeof(uint64_t),
/*SIMPLE*/(ncclCollUnroll(cudaArch)*WARP_SIZE + 1)*16,
// NVLS needs an extra 16B to read unaligned data.
/*NVLS */WARP_SIZE*(cudaArch >= 900 ? ncclNvlsUnrollBytes(cudaArch) : 0) + 16
) + 15) & -16; // pad to 16 bytes
}
// The amount of dynamic shmem per block
__host__ __device__ constexpr int ncclShmemDynamicSize(int cudaArch = NCCL_CUDA_ARCH) {
return cudaArch < 700 ? 0 : ncclShmemScratchWarpSize(cudaArch)*(NCCL_MAX_NTHREADS/WARP_SIZE);
}
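// Added notes (illustration only): the formula above implies pre-Volta kernels
// request no dynamic shared memory, which can be checked at compile time;
// launch code is expected to pass ncclShmemDynamicSize(arch) as the dynamic
// shared memory size when launching the collective kernel (the launch call
// itself is not part of this header).
static_assert(ncclShmemDynamicSize(/*cudaArch=*/600) == 0,
  "illustrative check: no dynamic shmem below arch 700");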
#endif
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_ENQUEUE_H_
#define NCCL_ENQUEUE_H_
#include "comm.h"
#include "group.h"
#include "collectives.h"
#include "utils.h"
#define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64)
#define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */
ncclResult_t ncclInitKernelsForDevice(int cudaArch, size_t* maxStackSize);
ncclResult_t ncclEnqueueCheck(struct ncclInfo* info);
ncclResult_t ncclLaunchPrepare(struct ncclComm* comm);
ncclResult_t ncclLaunchKernelBefore_NoUncapturedCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
ncclResult_t ncclLaunchKernel(struct ncclComm* comm, struct ncclKernelPlan* plan);
ncclResult_t ncclLaunchKernelAfter_NoCuda(struct ncclComm* comm, struct ncclKernelPlan* plan);
ncclResult_t ncclLaunchFinish(struct ncclComm* comm);
#endif // End include guard
/*************************************************************************
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_GDRWRAP_H_
#define NCCL_GDRWRAP_H_
#include "nccl.h"
#include <stdint.h> // for standard [u]intX_t types
#include <stdio.h>
#include <stdlib.h>
// These can be used if the GDR library isn't thread safe
#include <pthread.h>
extern pthread_mutex_t gdrLock;
#define GDRLOCK() pthread_mutex_lock(&gdrLock)
#define GDRUNLOCK() pthread_mutex_unlock(&gdrLock)
#define GDRLOCKCALL(cmd, ret) do { \
GDRLOCK(); \
ret = cmd; \
GDRUNLOCK(); \
} while(false)
#define GDRCHECK(cmd) do { \
int e; \
/* GDRLOCKCALL(cmd, e); */ \
e = cmd; \
if( e != 0 ) { \
WARN("GDRCOPY failure %d", e); \
return ncclSystemError; \
} \
} while(false)
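// Usage sketch (added; gdrDoSomething is a hypothetical call): GDRCHECK must
// be used inside a function returning ncclResult_t, since it returns
// ncclSystemError when the wrapped call fails:
//   static ncclResult_t example(gdr_t g) {
//     GDRCHECK(gdrDoSomething(g));
//     return ncclSuccess;
//   }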
// This is required as the GDR memory is mapped WC
#if !defined(__NVCC__)
#if defined(__PPC__)
static inline void wc_store_fence(void) { asm volatile("sync") ; }
#elif defined(__x86_64__)
#include <immintrin.h>
static inline void wc_store_fence(void) { _mm_sfence(); }
#elif defined(__aarch64__)
#ifdef __cplusplus
#include <atomic>
static inline void wc_store_fence(void) { std::atomic_thread_fence(std::memory_order_release); }
#else
#include <stdatomic.h>
static inline void wc_store_fence(void) { atomic_thread_fence(memory_order_release); }
#endif
#endif
#endif
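// Usage sketch (added; gdrMapPtr is a hypothetical pointer into the WC
// mapping): CPU stores to the write-combined GDR mapping should be followed by
// a store fence before the device is signalled that the data is ready:
//   *(volatile uint32_t*)gdrMapPtr = value;
//   wc_store_fence();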
//#define GDR_DIRECT 1
#ifdef GDR_DIRECT
// Call the GDR API library code directly rather than via
// dlopen() wrappers
#include <gdrapi.h>
static ncclResult_t wrap_gdr_symbols(void) { return ncclSuccess; }
static gdr_t wrap_gdr_open(void) { gdr_t g = gdr_open(); return g; }
static ncclResult_t wrap_gdr_close(gdr_t g) { GDRCHECK(gdr_close(g)); return ncclSuccess; }
static ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle) {
GDRCHECK(gdr_pin_buffer(g, addr, size, p2p_token, va_space, handle));
return ncclSuccess;
}
static ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) {
GDRCHECK(gdr_unpin_buffer(g, handle));
return ncclSuccess;
}
static ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) {
GDRCHECK(gdr_get_info(g, handle, info));
return ncclSuccess;
}
static ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) {
GDRCHECK(gdr_map(g, handle, va, size));
return ncclSuccess;
}
static ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) {
GDRCHECK(gdr_unmap(g, handle, va, size));
return ncclSuccess;
}
static ncclResult_t wrap_gdr_runtime_get_version(int *major, int *minor) {
gdr_runtime_get_version(major, minor);
return ncclSuccess;
}
static ncclResult_t wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor) {
gdr_driver_get_version(g, major, minor);
return ncclSuccess;
}
static ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size) {
GDRCHECK(gdr_copy_to_mapping(handle, map_d_ptr, h_ptr, size));
return ncclSuccess;
}
static ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size) {
GDRCHECK(gdr_copy_from_mapping(handle, h_ptr, map_d_ptr, size));
return ncclSuccess;
}
#else
// Dynamically handle the dependency on the GDR API library
/* Extracted from gdrapi.h (v2.1 Nov 2020) */
#define GPU_PAGE_SHIFT 16
#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT)
#define GPU_PAGE_OFFSET (GPU_PAGE_SIZE-1)
#define GPU_PAGE_MASK (~GPU_PAGE_OFFSET)
struct gdr;
typedef struct gdr *gdr_t;
typedef struct gdr_mh_s {
unsigned long h;
} gdr_mh_t;
struct gdr_info {
uint64_t va;
uint64_t mapped_size;
uint32_t page_size;
uint64_t tm_cycles;
uint32_t cycles_per_ms;
unsigned mapped:1;
unsigned wc_mapping:1;
};
typedef struct gdr_info gdr_info_t;
/* End of gdrapi.h */
ncclResult_t wrap_gdr_symbols(void);
gdr_t wrap_gdr_open(void);
ncclResult_t wrap_gdr_close(gdr_t g);
ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle);
ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle);
ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info);
ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size);
ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size);
ncclResult_t wrap_gdr_runtime_get_version(int *major, int *minor);
ncclResult_t wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor);
ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size);
ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size);
#endif // GDR_DIRECT
// Global GDR driver handle
extern gdr_t ncclGdrCopy;
#include "alloc.h"
typedef struct gdr_mem_desc {
void *gdrDevMem;
void *gdrMap;
size_t gdrOffset;
size_t gdrMapSize;
gdr_mh_t gdrMh;
} gdr_mem_desc_t;
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
static gdr_t ncclGdrInit() {
INFO(NCCL_INIT, "Enabled GDRCopy equivalent memory allocation");
return (gdr_t)0x12345678L;
}
template <typename T>
static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle, hipStream_t stream) {
gdr_info_t info;
size_t mapSize;
gdr_mh_t mh;
char *devMem;
void *gdrMap;
mapSize = sizeof(T)*nelem;
// GDRCOPY pinned buffer has to be at least GPU_PAGE_SIZE
ALIGN_SIZE(mapSize, GPU_PAGE_SIZE);
// GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too
NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, stream, true));
gdr_mem_desc_t* md;
NCCLCHECK(ncclCalloc(&md, 1));
md->gdrDevMem = devMem;
md->gdrMap = NULL;
md->gdrMapSize = mapSize;
md->gdrOffset = 0;
md->gdrMh.h = 0;
*gdrHandle = md;
*ptr = (T *)(devMem);
if (devPtr) *devPtr = (T *)(devMem);
TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p",
md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr);
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) {
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
memcpy(dst, src, nelem*sizeof(T));
return ncclSuccess;
}
static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
CUDACHECK(hipFree(md->gdrDevMem));
free(md);
return ncclSuccess;
}
#else
static gdr_t ncclGdrInit() {
int libMajor, libMinor, drvMajor, drvMinor;
gdr_t handle = NULL;
// Dynamically load the GDRAPI library symbols
if (wrap_gdr_symbols() == ncclSuccess) {
handle = wrap_gdr_open();
if (handle != NULL) {
ncclResult_t res;
// Query the version of libgdrapi
NCCLCHECKGOTO(wrap_gdr_runtime_get_version(&libMajor, &libMinor), res, error);
// Query the version of gdrdrv driver
NCCLCHECKGOTO(wrap_gdr_driver_get_version(handle, &drvMajor, &drvMinor), res, error);
// Only support GDRAPI 2.1 and later
if (libMajor < 2 || (libMajor == 2 && libMinor < 1) || drvMajor < 2 || (drvMajor == 2 && drvMinor < 1)) {
goto error;
}
else
INFO(NCCL_INIT, "GDRCOPY enabled library %d.%d driver %d.%d", libMajor, libMinor, drvMajor, drvMinor);
}
}
return handle;
error:
if (handle != NULL) (void) wrap_gdr_close(handle);
return NULL;
}
template <typename T>
static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) {
gdr_info_t info;
size_t mapSize;
gdr_mh_t mh;
char *devMem;
void *gdrMap;
mapSize = sizeof(T)*nelem;
// GDRCOPY pinned buffer has to be at least GPU_PAGE_SIZE
ALIGN_SIZE(mapSize, GPU_PAGE_SIZE);
// GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too
NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1));
uint64_t alignedAddr = (((uint64_t) devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK;
size_t align = alignedAddr - (uint64_t)devMem;
//TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zi size %zi", alignedAddr, devMem, align, mapSize);
NCCLCHECK(wrap_gdr_pin_buffer(ncclGdrCopy, alignedAddr, mapSize, 0, 0, &mh));
NCCLCHECK(wrap_gdr_map(ncclGdrCopy, mh, &gdrMap, mapSize));
//TRACE(NCCL_INIT, "GDRCOPY : mapped %p (0x%lx) at %p", devMem, alignedAddr, gdrMap);
NCCLCHECK(wrap_gdr_get_info(ncclGdrCopy, mh, &info));
// Will offset ever be non zero ?
ssize_t off = info.va - alignedAddr;
gdr_mem_desc_t* md;
NCCLCHECK(ncclCalloc(&md, 1));
md->gdrDevMem = devMem;
md->gdrMap = gdrMap;
md->gdrMapSize = mapSize;
md->gdrOffset = off+align;
md->gdrMh = mh;
*gdrHandle = md;
*ptr = (T *)((char *)gdrMap+off);
if (devPtr) *devPtr = (T *)(devMem+off+align);
TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p",
md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr);
return ncclSuccess;
}
template <typename T>
static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) {
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*sizeof(T)));
return ncclSuccess;
}
static ncclResult_t ncclGdrCudaFree(void* gdrHandle) {
gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle;
NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize));
NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh));
NCCLCHECK(ncclCudaFree(md->gdrDevMem));
free(md);
return ncclSuccess;
}
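// Lifecycle sketch (added; local names are hypothetical), using the helpers
// defined in this GDRCopy branch: allocate a GDR-mapped buffer, write into it
// through the host mapping, then tear it down.
//   uint32_t *hostView, *devView; void* handle;
//   NCCLCHECK(ncclGdrCudaCalloc(&hostView, &devView, count, &handle));
//   NCCLCHECK(ncclGdrCudaCopy(handle, hostView, srcHost, count));
//   ...
//   NCCLCHECK(ncclGdrCudaFree(handle));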
#endif
#endif // End include guard
/*************************************************************************
* Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef RCCL_GIT_VERSION_H_
#define RCCL_GIT_VERSION_H_
extern const char *rcclGitHash;
#endif
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_GRAPH_H_
#define NCCL_GRAPH_H_
#include "nccl.h"
#include "devcomm.h"
#include <limits.h>
#include <stdlib.h>
#include <ctype.h>
#include <stdio.h>
#include <sched.h>
ncclResult_t ncclTopoCudaPath(int cudaDev, char** path);
struct ncclTopoSystem;
// Build the topology
ncclResult_t ncclTopoGetSystem(struct ncclComm* comm, struct ncclTopoSystem** system);
ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system);
ncclResult_t ncclTopoPrint(struct ncclTopoSystem* system);
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm);
void ncclTopoFree(struct ncclTopoSystem* system);
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm);
ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm);
ncclResult_t ncclTopoGetNvbGpus(struct ncclTopoSystem* system, int rank, int* nranks, int** ranks);
int ncclTopoPathAllNVLink(struct ncclTopoSystem* system);
// Query topology
ncclResult_t ncclTopoGetNetDev(struct ncclComm* comm, int rank, struct ncclTopoGraph* graph, int channelId, int peerRank, int* net, int* proxyRank);
ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank);
ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr);
#define MAX_XGMI_INTER_GPUS 4
ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int type, int* dev);
ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter=MAX_XGMI_INTER_GPUS, int nInter=0, int *inter=nullptr);
ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int* flush);
ncclResult_t ncclTopoCheckNet(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* net);
int ncclPxnDisable(struct ncclComm* comm);
ncclResult_t ncclTopoGetPxnRanks(struct ncclComm* comm, int** intermediateRanks, int* nranks);
ncclResult_t ncclTopoGetLocalRank(struct ncclTopoSystem* system, int rank, int* localRank);
// Find CPU affinity
ncclResult_t ncclTopoGetCpuAffinity(struct ncclTopoSystem* system, int rank, cpu_set_t* affinity);
#define NCCL_TOPO_CPU_ARCH_X86 1
#define NCCL_TOPO_CPU_ARCH_POWER 2
#define NCCL_TOPO_CPU_ARCH_ARM 3
#define NCCL_TOPO_CPU_VENDOR_INTEL 1
#define NCCL_TOPO_CPU_VENDOR_AMD 2
#define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3
#define NCCL_TOPO_CPU_TYPE_BDW 1
#define NCCL_TOPO_CPU_TYPE_SKL 2
#define NCCL_TOPO_CPU_TYPE_ZEN 3
#define NCCL_TOPO_CPU_TYPE_ROME 4
#define NCCL_TOPO_CPU_TYPE_YONGFENG 1
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
ncclResult_t ncclTopoGetGpuCount(struct ncclTopoSystem* system, int* count);
ncclResult_t ncclTopoGetNvsCount(struct ncclTopoSystem* system, int* count);
ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int channelId, int* id);
ncclResult_t ncclTopoGetLocalGpu(struct ncclTopoSystem* system, int net, int* gpuIndex);
#define NCCL_TOPO_MAX_NODES 256
// Init search. Needs to be done before calling ncclTopoCompute
ncclResult_t ncclTopoSearchInit(struct ncclTopoSystem* system);
#define NCCL_TOPO_PATTERN_BALANCED_TREE 1 // Spread NIC traffic between two GPUs (Tree parent + one child on first GPU, second child on second GPU)
#define NCCL_TOPO_PATTERN_SPLIT_TREE 2 // Spread NIC traffic between two GPUs (Tree parent on first GPU, tree children on the second GPU)
#define NCCL_TOPO_PATTERN_TREE 3 // All NIC traffic going to/from the same GPU
#define NCCL_TOPO_PATTERN_RING 4 // Ring
#define NCCL_TOPO_PATTERN_NVLS 5 // NVLS+SHARP and NVLS+Tree
struct ncclTopoGraph {
// Input / output
int id; // ring : 0, tree : 1, collnet : 2
int pattern;
int crossNic;
int collNet;
int minChannels;
int maxChannels;
// Output
int nChannels;
float bwIntra;
float bwInter;
float latencyInter;
int typeIntra;
int typeInter;
int sameChannels;
int nHops;
int intra[MAXCHANNELS*NCCL_TOPO_MAX_NODES];
int inter[MAXCHANNELS*2];
int nIntraChannels;
int intraNets[MAXCHANNELS*NCCL_TOPO_MAX_NODES*2];
char treeBase[NCCL_TOPO_MAX_NODES][NCCL_TOPO_MAX_NODES*4];
};
ncclResult_t ncclTopoCompute(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
ncclResult_t ncclTopoPrintGraph(struct ncclTopoSystem* system, struct ncclTopoGraph* graph);
ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, struct ncclTopoGraph** graphs);
struct ncclTopoRanks {
int ringRecv[MAXCHANNELS];
int ringSend[MAXCHANNELS];
int ringPrev[MAXCHANNELS];
int ringNext[MAXCHANNELS];
int treeToParent[MAXCHANNELS];
int treeToChild0[MAXCHANNELS];
int treeToChild1[MAXCHANNELS];
int nvlsHeads[MAXCHANNELS];
};
ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks);
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns,
struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, int nc);
ncclResult_t ncclTreeBasePostset(struct ncclComm* comm, struct ncclTopoGraph* treeGraph);
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs);
#include "info.h"
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time);
#endif
/*************************************************************************
* Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL_GROUP_H_
#define NCCL_GROUP_H_
#include "nccl.h"
#include "comm.h"
ncclResult_t ncclGroupErrCheck(ncclResult_t ret);
void ncclGroupCommJoin(struct ncclComm* comm);
void ncclGroupCommPreconnect(struct ncclComm* comm);
ncclResult_t ncclGroupCommLeave(struct ncclComm* comm);
void ncclGroupJobAbort();
typedef ncclResult_t(*ncclInitFunc_t)(ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);
ncclResult_t ncclAsyncInit(ncclInitFunc_t func, ncclComm_t* newcomm, int ndev, ncclUniqueId commId, int myrank, int cudaDev);
typedef enum ncclGroupJobState {
ncclGroupJobRunning = 0,
ncclGroupJobDone = 1,
ncclGroupJobJoined = 2,
} ncclGroupJobState_t;
struct ncclAsyncJob {
struct ncclAsyncJob* next;
pthread_t thread;
ncclResult_t result;
ncclResult_t(*func)(struct ncclAsyncJob*);
void(*undo)(struct ncclAsyncJob*);
void(*destructor)(void*);
ncclGroupJobState_t state;
volatile uint32_t *abortFlag; /* point to comm abortFlag */
volatile uint32_t *childAbortFlag; /* point to child abortFlag */
ncclComm_t comm;
};
ncclResult_t ncclAsyncLaunch(
struct ncclAsyncJob* job,
ncclResult_t(*func)(struct ncclAsyncJob*),
void(*undo)(struct ncclAsyncJob*),
void(*destructor)(void*), ncclComm_t comm
);
struct ncclGroupJob {
struct ncclAsyncJob base;
struct ncclComm **groupCommHeadPtr;
struct ncclComm **groupCommPreconnectHeadPtr;
ncclResult_t *groupErrorPtr;
volatile bool *abortFlagPtr;
struct ncclIntruQueue<struct ncclAsyncJob, &ncclAsyncJob::next> *asyncJobsPtr;
bool doneFlag;
};
ncclResult_t ncclGroupStartInternal();
ncclResult_t ncclGroupEndInternal();
ncclResult_t ncclAsyncJobComplete(struct ncclAsyncJob* job);
////////////////////////////////////////////////////////////////////////////////
extern __thread int ncclGroupDepth; // depth of ncclGroupStart nesting
extern __thread ncclResult_t ncclGroupError;
extern __thread struct ncclComm* ncclGroupCommHead;
extern __thread struct ncclComm* ncclGroupCommPreconnectHead;
extern __thread int ncclGroupBlocking;
extern __thread struct ncclGroupJob *ncclGroupJobMainPtr;
extern __thread struct ncclGroupJob ncclGroupJobMain;
static inline void groupResetJobState() {
ncclGroupBlocking = -1;
ncclGroupJobMainPtr = NULL;
memset(&ncclGroupJobMain, 0, sizeof(struct ncclGroupJob));
return;
}
static inline ncclResult_t groupJobComplete(struct ncclGroupJob* job) {
ncclResult_t ret = ncclSuccess;
if (job) {
ret = ncclAsyncJobComplete(&job->base);
groupResetJobState();
}
return ret;
}
inline ncclResult_t ncclGroupErrCheck(ncclResult_t ret) {
if (ncclGroupDepth > 0) {
if (ret != ncclSuccess && ret != ncclInProgress) ncclGroupError = ret;
}
return ret;
}
// Add comm to this thread's group
inline void ncclGroupCommJoin(struct ncclComm* comm) {
if (comm->groupNext == reinterpret_cast<struct ncclComm*>(0x1)) {
// Insert comm into ncclGroupCommHead adjacent to sibling comms. This preserves
// the user's program order yet ensures siblings occur consecutively. This
// is required by doLaunches() in "group.cc".
struct ncclComm** pp = &ncclGroupCommHead;
while (*pp != nullptr && comm->intraComm0 != (*pp)->intraComm0)
pp = &(*pp)->groupNext;
comm->groupNext = *pp;
*pp = comm;
// Each comm gets a new memory stack scope upon joining. Each task batched for
// this comm is allocated there.
ncclMemoryStackPush(&comm->memScoped);
}
ncclGroupBlocking = comm->config.blocking;
}
// Add comm to this thread's group needing preconnect
inline void ncclGroupCommPreconnect(struct ncclComm* comm) {
if (comm->preconnectNext == reinterpret_cast<struct ncclComm*>(0x1)) {
comm->preconnectNext = ncclGroupCommPreconnectHead;
ncclGroupCommPreconnectHead = comm;
}
}
// Comm has left group
inline ncclResult_t ncclGroupCommLeave(struct ncclComm* comm) {
comm->groupNext = reinterpret_cast<struct ncclComm*>(0x1);
ncclMemoryStackPop(&comm->memScoped);
return ncclSuccess;
}
#endif
/* Copyright (c) 2019 - 2021 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef HIP_SRC_HIP_PROF_RCCL_H
#define HIP_SRC_HIP_PROF_RCCL_H
#include "hipprof/hip_prof_rccl_str.h"
#include "hipprof/hip_profile_common.h"
typedef prof_error_t (*PFN_rccl_prof_api_enter)(uint32_t cid, void *api_entry);
typedef prof_error_t (*PFN_rccl_prof_api_exit)(uint32_t cid, void *api_entry);
extern PFN_rccl_prof_api_enter pfn_rccl_prof_api_enter;
extern PFN_rccl_prof_api_exit pfn_rccl_prof_api_exit;
void init_rccl_prof_fns();
#endif // HIP_SRC_HIP_PROF_RCCL_H
/* Copyright (c) 2019 - 2021 Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. */
#ifndef HIP_SRC_HIP_PROF_RCCL_API_H
#define HIP_SRC_HIP_PROF_RCCL_API_H
#include <atomic>
#include <cassert>
#include <iostream>
#include <shared_mutex>
#include <utility>
#include <memory> // for std::unique_ptr / std::make_unique
#include "info.h"
#include "debug.h"
#include <dlfcn.h>
#include <sys/utsname.h>
#include <fstream>
#include <sys/shm.h>
#include "hipprof/hip_prof_rccl.h"
#include "hipprof/hip_prof_rccl_param.h"
#define RCCL_CB_SPAWNER_OBJECT(operation_id, info_ptr) rccl_cb_spawner_object<RCCL_API_ID_##operation_id> __api_tracer(info_ptr);
template <rccl_api_id_t operation_id> class rccl_cb_spawner_object {
public:
rccl_cb_spawner_object(ncclInfo* info_ptr): stat(STATUS_ERROR), entry(nullptr), correlation_id(0), info(info_ptr) {
if (ncclParamHipProf() == 1) {
static_assert(operation_id >= RCCL_API_ID_FIRST && operation_id <= RCCL_API_ID_LAST, "invalid RCCL_API operation id");
init_rccl_prof_fns();
entry = std::make_unique<hip_prof_rccl_entry>();
entry->kind = RCCL_KIND_ID_API;
entry->ret_stat = 0;
entry->cid = operation_id;
entry->sendbuff = info->sendbuff;
entry->recvbuff = info->recvbuff;
entry->count = info->count;
entry->datatype = info->datatype;
entry->op = info->op;
entry->rid = info->root;
if (pfn_rccl_prof_api_enter != nullptr) {
stat = pfn_rccl_prof_api_enter(operation_id, entry.get());
if (stat != STATUS_SUCCESS) {
INFO(NCCL_INIT, "stat: %d, Failed to add rccl_prof_api_enter.", stat);
entry.reset();
} else {
correlation_id = entry->correlation_id;
}
}
}
}
activity_correlation_id_t getCorrelationId() const {
return correlation_id;
}
~rccl_cb_spawner_object() {
if (stat == STATUS_SUCCESS && entry) {
entry->nBytes = info->nBytes;
if (pfn_rccl_prof_api_exit != nullptr) {
stat = pfn_rccl_prof_api_exit(operation_id, entry.get());
if (stat != STATUS_SUCCESS) {
INFO(NCCL_INIT, "Failed to add rccl_prof_api_exit.");
}
}
correlation_id = 0;
}
}
private:
prof_error_t stat;
std::unique_ptr<hip_prof_rccl_entry> entry;
activity_correlation_id_t correlation_id;
ncclInfo* info;
};
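// Usage sketch (added; the body shown is illustrative, not the actual RCCL
// entry point): an API implementation instantiates the spawner at the top of
// its body so the enter callback fires on construction and the exit callback
// fires from the destructor when the function returns:
//   ncclResult_t ncclAllReduce(/* ... */) {
//     struct ncclInfo info = { /* ... */ };
//     RCCL_CB_SPAWNER_OBJECT(ncclAllReduce, &info);
//     /* ... perform the collective ... */
//   }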
#endif // HIP_SRC_HIP_PROF_RCCL_API_H
#ifndef HIP_SRC_HIP_PROF_RCCL_PARAM_H
#define HIP_SRC_HIP_PROF_RCCL_PARAM_H
#include <stdint.h>
void ncclLoadHipProfParam(char const* env, int64_t deftVal, int64_t uninitialized, int64_t* cache);
int64_t ncclParamHipProf();
#endif // HIP_SRC_HIP_PROF_RCCL_PARAM_H
// Generated file. DO NOT EDIT.
//
// This file is automatically generated by the rccl_gen.py script.
// If changes are required, run the script and commit the updated file.
#ifndef _HIP_PROF_RCCL_STR_H
#define _HIP_PROF_RCCL_STR_H
#define HIP_PROF_RCCL_VER 1
// RCCL API callbacks ID enumeration
enum rccl_api_id_t {
RCCL_API_ID_NONE = 0,
RCCL_API_ID_FIRST = 1,
RCCL_API_ID_mscclLoadAlgo = 1,
RCCL_API_ID_mscclRunAlgo = 2,
RCCL_API_ID_mscclUnloadAlgo = 3,
RCCL_API_ID_ncclAllGather = 4,
RCCL_API_ID_ncclAllReduce = 5,
RCCL_API_ID_ncclAllToAll = 6,
RCCL_API_ID_ncclAllToAllv = 7,
RCCL_API_ID_ncclBcast = 8,
RCCL_API_ID_ncclBroadcast = 9,
RCCL_API_ID_ncclGather = 10,
RCCL_API_ID_ncclRecv = 11,
RCCL_API_ID_ncclReduce = 12,
RCCL_API_ID_ncclReduceScatter = 13,
RCCL_API_ID_ncclScatter = 14,
RCCL_API_ID_ncclSend = 15,
RCCL_API_ID_LAST = 15,
};
// Return the RCCL API string for a given callback ID
static inline const char* rccl_api_name(const uint32_t id) {
switch(id) {
case RCCL_API_ID_mscclLoadAlgo: return "mscclLoadAlgo";
case RCCL_API_ID_mscclRunAlgo: return "mscclRunAlgo";
case RCCL_API_ID_mscclUnloadAlgo: return "mscclUnloadAlgo";
case RCCL_API_ID_ncclAllGather: return "ncclAllGather";
case RCCL_API_ID_ncclAllReduce: return "ncclAllReduce";
case RCCL_API_ID_ncclAllToAll: return "ncclAllToAll";
case RCCL_API_ID_ncclAllToAllv: return "ncclAllToAllv";
case RCCL_API_ID_ncclBcast: return "ncclBcast";
case RCCL_API_ID_ncclBroadcast: return "ncclBroadcast";
case RCCL_API_ID_ncclGather: return "ncclGather";
case RCCL_API_ID_ncclRecv: return "ncclRecv";
case RCCL_API_ID_ncclReduce: return "ncclReduce";
case RCCL_API_ID_ncclReduceScatter: return "ncclReduceScatter";
case RCCL_API_ID_ncclScatter: return "ncclScatter";
case RCCL_API_ID_ncclSend: return "ncclSend";
};
return "unknown";
}
#include <string.h>
// Return the RCCL API callback ID for a given name
static inline uint32_t rcclApiIdByName(const char* name) {
if (strcmp("mscclLoadAlgo", name) == 0) return RCCL_API_ID_mscclLoadAlgo;
if (strcmp("mscclRunAlgo", name) == 0) return RCCL_API_ID_mscclRunAlgo;
if (strcmp("mscclUnloadAlgo", name) == 0) return RCCL_API_ID_mscclUnloadAlgo;
if (strcmp("ncclAllGather", name) == 0) return RCCL_API_ID_ncclAllGather;
if (strcmp("ncclAllReduce", name) == 0) return RCCL_API_ID_ncclAllReduce;
if (strcmp("ncclAllToAll", name) == 0) return RCCL_API_ID_ncclAllToAll;
if (strcmp("ncclAllToAllv", name) == 0) return RCCL_API_ID_ncclAllToAllv;
if (strcmp("ncclBcast", name) == 0) return RCCL_API_ID_ncclBcast;
if (strcmp("ncclBroadcast", name) == 0) return RCCL_API_ID_ncclBroadcast;
if (strcmp("ncclGather", name) == 0) return RCCL_API_ID_ncclGather;
if (strcmp("ncclRecv", name) == 0) return RCCL_API_ID_ncclRecv;
if (strcmp("ncclReduce", name) == 0) return RCCL_API_ID_ncclReduce;
if (strcmp("ncclReduceScatter", name) == 0) return RCCL_API_ID_ncclReduceScatter;
if (strcmp("ncclScatter", name) == 0) return RCCL_API_ID_ncclScatter;
if (strcmp("ncclSend", name) == 0) return RCCL_API_ID_ncclSend;
return RCCL_API_ID_NONE;
}
#endif // _HIP_PROF_RCCL_STR_H