Commit 7dc4e964 authored by wanghan's avatar wanghan
Browse files

Initial commit: RCCL auto-tuning project

parents
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
/* Generate parameters for our error bound model of floating point average
* (sum of scaled values) by sampling sums of random sequences for each
* floating point type.
*
* The model has parameters "coef" and "power", where for two floats a & b,
* they are close enough if and only if:
* abs(intBits(a) - intBits(b)) <= 1 + coef*pow(rank_n, power);
*
* Where intBits(x) is the reinterpretation of the float bitpattern as an integer.
*
* Compile with:
* hipcc (this is the HIP port; the original CUDA version built with
* nvcc -gencode=arch=compute_80,code=sm_80)
*/
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdint>
#include <hip/hip_runtime.h>
#include <hip/hip_bfloat16.h>
#include <hip/hip_fp16.h>
using std::uint64_t;
using std::uint32_t;
using bfloat16 = hip_bfloat16;
// Per-type traits used by the sampling kernel: bit layout constants, the
// matching unsigned integer type, and conversion/arithmetic helpers.
// Specialized below for each floating point type under test.
template<typename T>
struct float_traits;

// IEEE binary32 (float): 8 exponent bits, 23 mantissa bits.
template<>
struct float_traits<float> {
  static constexpr int mantissa_bits = 23;
  static constexpr int exponent_bits = 8;
  using uint_t = uint32_t;
  __device__ static float make(double x) { return static_cast<float>(x); }
  __device__ static float make(uint64_t x) { return static_cast<float>(x); }
  __device__ static double todouble(float x) { return static_cast<double>(x); }
  __device__ static float add(float a, float b) { return a + b; }
  __device__ static float mul(float a, float b) { return a * b; }
};
// IEEE binary64 (double): 11 exponent bits, 52 mantissa bits.
// Conversions are identities since double is also the reference type.
template<>
struct float_traits<double> {
  static constexpr int mantissa_bits = 52;
  static constexpr int exponent_bits = 11;
  using uint_t = uint64_t;
  __device__ static double make(double x) { return x; }
  __device__ static double make(uint64_t x) { return static_cast<double>(x); }
  __device__ static double todouble(double x) { return x; }
  __device__ static double add(double a, double b) { return a + b; }
  __device__ static double mul(double a, double b) { return a * b; }
};
// IEEE binary16 (__half): 5 exponent bits, 10 mantissa bits.
template<>
struct float_traits<__half> {
  static constexpr int mantissa_bits = 10;
  static constexpr int exponent_bits = 5;
  using uint_t = uint16_t;
  __device__ static __half make(double x) { return __float2half((float)x); }
  // Use the unsigned-long-long conversion intrinsic: the previous
  // __int2half_rn(x) implicitly narrowed uint64_t to int, mangling any
  // value above INT_MAX before the rounding even happened.
  __device__ static __half make(uint64_t x) { return __ull2half_rn(x); }
  __device__ static double todouble(__half x) { return __half2float(x); }
  __device__ static __half add(__half a, __half b) { return __hadd(a, b); }
  __device__ static __half mul(__half a, __half b) { return __hmul(a, b); }
};
// bfloat16: 8 exponent bits (same range as float), 7 mantissa bits.
template<>
struct float_traits<bfloat16> {
  static constexpr int mantissa_bits = 7;
  static constexpr int exponent_bits = 8;
  using uint_t = uint16_t;
  __device__ static bfloat16 make(double x) { return bfloat16(x); }
  __device__ static bfloat16 make(uint64_t x) { return bfloat16(x); }
  __device__ static double todouble(bfloat16 x) { return double(x); }
  // Do the arithmetic in plain float. The previous __hadd/__hmul calls
  // converted the float operands to __half (5 exponent bits, 10 mantissa
  // bits) first, silently clamping bfloat16's wider exponent range and
  // perturbing the rounding the model is trying to measure.
  __device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return bfloat16((float)a + (float)b); }
  __device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return bfloat16((float)a * (float)b); }
};
// Distance between two floats measured in units-in-the-last-place style:
// the absolute difference of their bit patterns reinterpreted as unsigned
// integers. Zero means bit-identical.
template<typename F>
__device__ int compare(F a, F b) {
  using uint_t = typename float_traits<F>::uint_t;
  union { uint_t bits; F val; } pa, pb;
  pa.bits = 0; pb.bits = 0;   // clear any padding above sizeof(F)
  pa.val = a;
  pb.val = b;
  uint_t lo = pa.bits < pb.bits ? pa.bits : pb.bits;
  uint_t hi = pa.bits < pb.bits ? pb.bits : pa.bits;
  return hi - lo;
}
// xoshiro256** pseudo-random generator (Blackman/Vigna): fast 64-bit output
// with good statistical quality. Not cryptographic. Kept bit-exact; the
// sampled error model depends on this exact sequence per seed.
struct xoshiro256ss {
uint64_t s[4];
// Seed 256 bits of state deterministically from a small integer (here the
// thread index) by offsetting fixed constants with a multiplied seed, so
// each thread gets a decorrelated stream.
__device__ xoshiro256ss(int seed) {
constexpr uint64_t src[4] = {0xbb99e851d1f545cc, 0xbfc4022389ca40cb, 0xe84aff5cb1914af5, 0x845999858284de77};
for(int i=0; i < 4; i++)
s[i] = src[i] + (seed + i)*0xb45de8a52fdb65d3;
}
// Advance the state and return the next 64 random bits.
__device__ uint64_t operator()() {
auto rol64 = [](uint64_t x, int k) {
return (x << k) | (x >> (64 - k));
};
uint64_t const result = rol64(s[1] * 5, 7) * 9;
uint64_t const t = s[1] << 17;
s[2] ^= s[0];
s[3] ^= s[1];
s[1] ^= s[2];
s[0] ^= s[3];
s[2] ^= t;
s[3] = rol64(s[3], 45);
return result;
}
};
// Warp-wide max reduction via a shuffle-down tree, standing in for CUDA's
// __reduce_max_sync on HIP.
static __device__ int __reduce_max_sync(unsigned int mask, int value)
{
  //We ignore mask, since all bits are set when calling them in the
  //test code below.
  (void)mask;
  int width = warpSize;
  // Start at warpSize/2: an offset equal to warpSize is out of range for
  // every lane, so that first __shfl_down just returned each caller's own
  // value — a wasted iteration in the original loop.
  for (unsigned int i = warpSize/2; i; i >>= 1) {
    value = max(__shfl_down(value, i, width), value);
  }
  return value;
}
// Empirically fit the error-model parameters (coef, power) for type F by
// accumulating random sequences in F and in double simultaneously and
// measuring how far the F accumulator drifts (in bit-pattern distance) as a
// function of the number of summands ("ranks").
// Two passes: pass 0 estimates the exponent, pass 1 fits coef against that
// exponent and prints the result. Each thread runs an independent scalar;
// only thread 0 prints. Launch config: one block; samps must be a multiple
// coverage of blockDim.x via the strided loops.
template<typename F>
__global__ void kernel() {
using traits = float_traits<F>;
constexpr int samps = 4<<10;
__shared__ F accf[samps];
__shared__ double accd[samps];
xoshiro256ss rng(threadIdx.x);
float expo_avg = 1;
for(int pass=0; pass < 2; pass++) {
// Each thread uses a distinct scalar so we sample many scale factors.
F scalar = traits::make(1.0/(3.14159 + .5*threadIdx.x));
int err_max = 0;
float coef = 0;
double expo_sum = 0;
int expo_n = 0;
// For narrow types, cap ranks at the mantissa size; compensate with more rounds.
int max_ranks = std::is_same<F,float>::value ? 16<<10 : 1<<traits::mantissa_bits;
for(int round=0; round < 1 + (16<<10)/max_ranks; round++) {
//for(int round=0; round < 2; round++) {
// Reset both accumulators. Each thread only ever touches its own
// strided slots, so there is no cross-thread sharing here.
for(int i=threadIdx.x; i < samps; i += blockDim.x) {
accf[i] = (F)0;
accd[i] = 0;
}
__syncthreads();
for(int r=0; r < max_ranks; r++) {
int err = 0;
for(int i=threadIdx.x; i < samps; i+=blockDim.x) {
constexpr uint64_t m = (1ll<<traits::mantissa_bits)-1;
// float: random mantissa-sized integers; narrow types: constant 1.0.
double d = std::is_same<F,float>::value ? double(rng() & m) : 1.0;
F f = traits::make(d);
accf[i] = traits::add(accf[i], traits::mul(scalar, f));
accd[i] += traits::todouble(f);
//if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d f=%f\n", r, traits::todouble(accf[i]));
// Bit-distance between the F accumulator and the rounded reference.
int e = compare(accf[i], traits::mul(scalar, traits::make(accd[i])));
err = err > e ? err : e;
}
err = __reduce_max_sync(-1u, err);
err_max = err_max > err ? err_max : err;
if (r >= 2) {
// err = 1 + coef*pow(r,expo)
float c = float(err-1)/powf(float(r), expo_avg);
coef = coef > c ? coef : c;
}
if (r >= 2) {
// Instantaneous exponent estimate; averaged over all r (and rounds).
double expo = log2f(1+err_max)/log2f(r);
expo_sum += expo;
expo_n++;
//if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d err=%d errmax=%d expo=%f sum=%f n=%d\n", r, err, err_max, expo, expo_sum, expo_n);
}
}
}
if(pass==0)
expo_avg = expo_sum/expo_n;
else if(threadIdx.x == 0)
printf(" coef=%1.10f expo=%1.10f\n", coef, expo_avg);
}
}
// Run the sampling kernel once per floating point type. A single warp's worth
// of threads is launched; each thread explores an independent scalar.
int main() {
  // Check every HIP call: kernel launches fail silently otherwise, and an
  // in-kernel fault only surfaces at the next synchronizing call.
  #define CHECK_HIP(cmd) do { \
      hipError_t err_ = (cmd); \
      if (err_ != hipSuccess) { \
        std::fprintf(stderr, "HIP error at %s:%d: %s\n", __FILE__, __LINE__, hipGetErrorString(err_)); \
        return 1; \
      } \
    } while(0)
  std::printf("type=float:\n");
  kernel<float><<<1,32>>>();
  CHECK_HIP(hipGetLastError());       // launch-configuration errors
  CHECK_HIP(hipDeviceSynchronize());  // asynchronous execution errors
  std::printf("\ntype=half:\n");
  kernel<half><<<1,32>>>();
  CHECK_HIP(hipGetLastError());
  CHECK_HIP(hipDeviceSynchronize());
  std::printf("\ntype=bfloat16:\n");
  kernel<bfloat16><<<1,32>>>();
  CHECK_HIP(hipGetLastError());
  CHECK_HIP(hipDeviceSynchronize());
  #undef CHECK_HIP
  return 0;
}
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
//#pragma nv_diag_suppress declared_but_not_referenced
#include "verifiable.h"
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <hip/hip_bfloat16.h>
#include "rccl/rccl.h"
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && RCCL_BFLOAT16 ==1
#define HAVE_ncclBfloat16 1
#else
#define HAVE_ncclBfloat16 0
#endif
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
#define HAVE_ncclAvg 1
#else
#define HAVE_ncclAvg 0
#endif
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
#define HAVE_ncclPreMulSum 1
#else
#define HAVE_ncclPreMulSum 0
#endif
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdint>
#include <cmath>
#include <unistd.h>
using std::size_t;
using std::int8_t;
using std::int16_t;
using std::int32_t;
using std::int64_t;
using std::uint8_t;
using std::uint16_t;
using std::uint32_t;
using std::uint64_t;
////////////////////////////////////////////////////////////////////////////////
namespace {
// Reinterpret the bit pattern of x as an unsigned long long (zero-extended
// above sizeof(T)). Used only as a debugging/inspection aid.
template<typename T>
__device__ unsigned long long bitsOf(T x) {
union { unsigned long long ull; T val; } u;
u.ull = 0;
u.val = x;
return u.ull;
}
// Cheap deterministic 64-bit bit mixer used to advance the inline RNG state.
// Works on the two 32-bit halves through a union; deterministic per platform.
__host__ __device__ uint64_t mixBits(uint64_t x) {
union { uint32_t u32[2]; uint64_t u64; };
u64 = x;
u32[1] += 1;
u32[0] ^= u32[1];
u64 *= 0x9e3779b97f4a7c13u;
u32[0] ^= u32[1]<<16 ^ u32[1]>>16;
return u64;
}
// Deterministic 64-bit hash of one or two values; used to derive per-element,
// per-rank random streams from (seed, index). xorshift-multiply construction
// with golden-ratio/murmur-style constants.
__host__ __device__ uint64_t hashOf(uint64_t a, uint64_t b=0) {
a += uint64_t(1)<<32;
a += b;
a ^= a>>32;
a *= 0x9e3779b97f4a7c13u;
a += b>>16 ^ b<<48;
a ^= a>>32;
a *= 0xc4ceb9fe1a85ec53u;
return a;
}
}
////////////////////////////////////////////////////////////////////////////////
namespace {
// Trait selecting integral vs floating-point code paths in genInput/genOutput.
// Defaults to std::is_integral; the GPU half/bfloat16 types are not integral
// in the std sense, so they are explicitly marked false here.
template<typename T>
struct IsIntegral: std::is_integral<T> {};
template<>
struct IsIntegral<__half>: std::false_type {};
#if RCCL_BFLOAT16 == 1
template<>
struct IsIntegral<hip_bfloat16>: std::false_type {};
#endif
}
////////////////////////////////////////////////////////////////////////////////
// Hide a value from arithmetic optimizations. Hopefully compiler cannot detect
// that this is equivalent to the identity function.
// Identity function obfuscated from the optimizer. The two multiplier
// constants are modular inverses: (2^32+1)*(2^64-2^32+1) = 2^96+1 ≡ 1
// (mod 2^64), so the bit pattern — and hence the value — is unchanged,
// but the compiler cannot fold expressions across this call.
template<typename T>
__host__ __device__ T inhibit(T x) {
union { uint64_t u64; T val; };
u64 = 0;
val = x;
u64 *= 0x0000000100000001u;
u64 *= 0xffffffff00000001u;
return val;
}
////////////////////////////////////////////////////////////////////////////////
namespace {
// Generic conversion helper. The float-argument overload exists so the GPU
// half/bfloat16 types can hook conversion from float via full
// specializations below (their constructors from float need intrinsics).
template<typename Y, typename X>
__host__ __device__ Y castTo(X x) {
return Y(x);
}
template<typename Y>
__host__ __device__ Y castTo(float x) {
return Y(x);
}
template<>
__host__ __device__ __half castTo<__half>(float x) {
return __float2half(x);
}
#if RCCL_BFLOAT16 == 1
template<>
__host__ __device__ hip_bfloat16 castTo<hip_bfloat16>(float x) {
return hip_bfloat16(x);
}
#endif
}
////////////////////////////////////////////////////////////////////////////////
// The reduction functions
namespace {
// Identity "reduction": keeps the first operand untouched. Used for pure
// data-movement collectives and the rank_n==1 fast path. All reduction
// functors share the preOp/operator()/postOp protocol.
struct ReduceNil {
template<typename T>
__host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
template<typename T>
__host__ __device__ T operator()(T a, T /*b*/) const { return a; }
template<typename T>
__host__ __device__ T postOp(T x) const { return x; }
};
// Summation. The generic overload is SFINAE'd on T()+T(); half and bfloat16
// lack (portable) operator+ so they get explicit overloads that do the
// arithmetic in float and round back to the narrow type.
struct ReduceSum {
template<typename T>
__host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
template<typename T, typename=decltype(T()+T())>
__host__ __device__ T operator()(T a, T b) const { return a + b; }
__host__ __device__ __half operator()(__half a, __half b) const {
return __float2half(__half2float(a) + __half2float(b));
}
#if RCCL_BFLOAT16 == 1
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
return hip_bfloat16(static_cast<float>(a) + static_cast<float>(b));
}
#endif
template<typename T>
__host__ __device__ T postOp(T x) const { return x; }
};
// Product. Same structure as ReduceSum: generic overload for types with
// operator*, float-roundtrip overloads for half/bfloat16.
struct ReduceProd {
template<typename T>
__host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
template<typename T, typename=decltype(T()*T())>
__host__ __device__ T operator()(T a, T b) const { return a * b; }
__host__ __device__ __half operator()(__half a, __half b) const {
return __float2half(__half2float(a) * __half2float(b));
}
#if RCCL_BFLOAT16 == 1
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
return hip_bfloat16(static_cast<float>(a) * static_cast<float>(b));
}
#endif
template<typename T>
__host__ __device__ T postOp(T x) const { return x; }
};
// Minimum. half/bfloat16 overloads compare in float but return one of the
// original operands unchanged (no rounding of the result).
struct ReduceMin {
template<typename T>
__host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
template<typename T, typename=decltype(T()<T())>
__host__ __device__ T operator()(T a, T b) const { return a < b ? a : b; }
__host__ __device__ __half operator()(__half a, __half b) const {
return __half2float(a) < __half2float(b) ? a : b;
}
#if RCCL_BFLOAT16 == 1
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
return static_cast<float>(a) < static_cast<float>(b) ? a : b;
}
#endif
template<typename T>
__host__ __device__ T postOp(T x) const { return x; }
};
// Maximum. Mirrors ReduceMin: comparisons for half/bfloat16 happen in float,
// the selected operand is returned as-is.
struct ReduceMax {
template<typename T>
__host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
template<typename T, typename=decltype(T()>T())>
__host__ __device__ T operator()(T a, T b) const { return a > b ? a : b; }
__host__ __device__ __half operator()(__half a, __half b) const {
return __half2float(a) > __half2float(b) ? a : b;
}
#if RCCL_BFLOAT16 == 1
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
return static_cast<float>(a) > static_cast<float>(b) ? a : b;
}
#endif
template<typename T>
__host__ __device__ T postOp(T x) const { return x; }
};
// Pre-multiplied sum: each rank's contribution is scaled by a per-rank
// scalar (ncclVerifiablePremulScalar, declared in verifiable.h) before a
// plain summation.
struct ReducePreMulSum {
template<typename T>
__host__ __device__ T preOp(T x, int rank_me) const {
return ReduceProd()(x, ncclVerifiablePremulScalar<T>(rank_me));
}
template<typename T>
__host__ __device__ T operator()(T a, T b) const { return ReduceSum()(a, b); }
template<typename T>
__host__ __device__ T postOp(T x) const { return x; }
};
// Average, split on integrality.
// Integral: sum then truncating divide by rank_n in postOp.
// Floating: pre-scale each contribution by 1/rank_n (computed at float
// precision for narrow types, double for double) so the sum is the average;
// inhibit() stops the compiler from algebraically simplifying the product.
template<typename T, bool integral = IsIntegral<T>::value>
struct ReduceAvg_Base;
template<typename T>
struct ReduceAvg_Base<T, /*integral=*/true> {
int rank_n;
__host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
__host__ __device__ T operator()(T a, T b) const { return ReduceSum()(a, b); }
__host__ __device__ T postOp(T x) const { return x/rank_n; }
};
template<typename T>
struct ReduceAvg_Base<T, /*integral=*/false> {
int rank_n;
__host__ __device__ T preOp(T x, int /*rank_me*/) const {
using T1 = typename std::conditional<(sizeof(T)<sizeof(double)), float, double>::type;
return ReduceProd()(inhibit(castTo<T>(T1(1)/T1(rank_n))), inhibit(x));
}
__host__ __device__ T operator()(T a, T b) const { return ReduceSum()(a, b); }
__host__ __device__ T postOp(T x) const { return x; }
};
// Public average functor: forwards each protocol call to the ReduceAvg_Base
// specialization chosen by the element type's integrality, carrying rank_n.
struct ReduceAvg {
int rank_n;
template<typename T>
__host__ __device__ T preOp(T x, int rank_me) const {
return ReduceAvg_Base<T>{rank_n}.preOp(x, rank_me);
}
template<typename T>
__host__ __device__ T operator()(T a, T b) const {
return ReduceAvg_Base<T>{rank_n}(a, b);
}
template<typename T>
__host__ __device__ T postOp(T x) const {
return ReduceAvg_Base<T>{rank_n}.postOp(x);
}
};
}
////////////////////////////////////////////////////////////////////////////////
namespace {
// Bit-layout constants per floating point type (IEEE-style sign/exponent/
// mantissa fields), used to synthesize values with exact bit patterns.
template<typename T>
struct FloatLayout;
template<>
struct FloatLayout<float> {
static constexpr int exponent_bits = 8, mantissa_bits = 23;
static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
};
template<>
struct FloatLayout<double> {
static constexpr int exponent_bits = 11, mantissa_bits = 52;
static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
};
template<>
struct FloatLayout<__half> {
static constexpr int exponent_bits = 5, mantissa_bits = 10;
static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
};
#if RCCL_BFLOAT16 == 1
template<>
struct FloatLayout<hip_bfloat16> {
static constexpr int exponent_bits = 8, mantissa_bits = 7;
static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
};
#endif
// Assemble a T from raw sign/biased-exponent/mantissa fields by packing the
// bits and reinterpreting via a union. exp is the already-biased field;
// mant must fit in mantissa_bits. Relies on little-endian union punning.
template<typename T>
__host__ __device__ T makeFloat(int sign, int exp, uint64_t mant) {
union { T ans; uint64_t bits; };
bits = sign;
bits <<= FloatLayout<T>::exponent_bits;
bits |= exp;
bits <<= FloatLayout<T>::mantissa_bits;
bits |= mant;
return ans;
}
}
////////////////////////////////////////////////////////////////////////////////
namespace {
// High bits of multiplication are useful for generating bounded random values
// from unbounded random values. For instance, given X a totally random 32-bit
// integer, `umul32hi(X,n)` will be totally random within [0,n).
// The HIP_VERSION guards pick GPU intrinsics on newer toolchains and fall
// back to portable arithmetic/builtins otherwise.
__host__ __device__ uint64_t umul32hi(uint32_t a, uint32_t b) {
#if HIP_VERSION > 50200000
return __umulhi(a, b);
#else
return uint64_t(a)*b >> 32;
#endif
}
__host__ __device__ uint64_t umul64hi(uint64_t a, uint64_t b) {
#if HIP_VERSION > 50200000
return __umul64hi(a, b);
#else
return uint64_t(__uint128_t(a)*__uint128_t(b) >> 64);
#endif
}
// Count leading zeros; the builtin fallbacks special-case 0 because
// __builtin_clz(0) is undefined.
__host__ __device__ int clz32(int x) {
#if HIP_VERSION > 50200000
return __clz(x);
#else
return x==0 ? 32 : __builtin_clz(x);
#endif
}
__host__ __device__ int clz64(long long x) {
#if HIP_VERSION > 50200000
return __clzll(x);
#else
return x==0 ? 64 : __builtin_clzll(x);
#endif
}
}
////////////////////////////////////////////////////////////////////////////////
namespace {
// Returns a wildly permuted rank index. Useful when we know we want exactly N
// random ranks to exhibit some behavior, we can just test if:
// `shuffleRank(rank_n, rank_me, rng) < N`. Note that rank_n > 0 must be true
// for well defined results. This mixes the bits of rng.
// The permutation is built from invertible maps modulo a power of two, so it
// is a true bijection on [0, rank_n).
__host__ __device__ int shuffleRank(int rank_n, int rank_me, uint64_t &rng) {
uint32_t a = uint32_t(rng);
uint32_t b = uint32_t(rng>>32);
rng = mixBits(rng);
uint32_t r = rank_me;
// round down rank_n to largest pow2, then subtract 1
uint32_t n2 = (~uint32_t(0)>>1) >> clz32(rank_n);
// These are 1:1 functions modulo 2^n:
// f(x) = x*a + b : for odd a, any b
// f(x) = (x*x + x)/2
// So we apply both to the bottom n2+1 ranks, then rotate the top
// (rank_n-n2-1) to the bottom and apply both again.
if(r <= n2) {
// shuffle bottom n2+1 ranks
r = (r*(a|1) + b) & n2;
r = (r*r + r)/2 & n2;
// rotate top to bottom
r += rank_n - (n2+1);
}
else
r -= n2+1; // rotate top to bottom
if(r <= n2) {
// shuffle bottom n2+1 again
r = (r*(b|1) + a) & n2;
r = (r*r + r)/2 & n2;
}
return r;
}
}
namespace {
// Generate wild integers x and y such that if every rank submits its x into a
// summation the result will be y with y <= y_max. Ranks should be shuffled
// before calling.
// x is this rank's deterministic contribution, y the deterministic total;
// both derive only from (rank_n, rank_me, rng), so every rank agrees on y.
template<typename Uint>
__host__ __device__ void genSumXY(
int rank_n, int rank_me, uint64_t &rng, Uint y_max, Uint &x, Uint &y,
bool avoid_y=false // if true then returned y will not equal given y
) {
static_assert(std::is_unsigned<Uint>::value, "Type must be unsigned integral.");
{ // Pick y as a random value in [y_max/2, y_max]
Uint d, y_min = (y_max+1)/2;
if(8*sizeof(Uint) > 32)
d = umul64hi(rng, y_max/2 + (avoid_y ? 0 : 1));
else
d = umul32hi(uint32_t(rng), y_max/2 + (avoid_y ? 0 : 1));
Uint y1 = (avoid_y ? y+1 : y_min) + d;
y = y1 - (avoid_y && (y1 < y_min || y_max < y1) ? y_max/2 : 0);
}
rng = mixBits(rng);
unsigned r = unsigned(rank_me);
unsigned rn = unsigned(rank_n);
// Partition our rn ranks into pn distinct subsets each of size rn/pn. If each
// rank submits 1+p (where p is 0-based partition index) then the sum will be:
// (rn/pn) * pn*(pn+1)/2
// So set this equal to our desired sum y and solve for pn.
// (rn/pn) * pn*(pn+1)/2 = y
// rn*(pn+1)/2 = y
// pn = 2*(y/rn)-1
Uint pn = rn == 1 ? 1 : 2*(y/rn) - 1;
// In the case where rn is huge (compared to y) use only one partition meaning
// that all rn ranks will submit 1 (since p=0).
pn = pn == 0 ? 1 : pn;
// Can't have more partitions than ranks.
pn = rn < pn ? rn : pn;
// Compute sum of contribution from pn partitions where each submits p+1.
Uint p_sum;
if(y_max <= ~uint32_t(0)>>1) // compile time known
p_sum = Uint(uint32_t(pn)*uint32_t(pn+1)/2);
else
p_sum = Uint(uint64_t(pn)*uint64_t(pn+1)/2);
// Let s be the number of ranks per partition. This is either rn/pn as we
// intended, or y/p_sum if that's smaller to prevent overshooting our target y.
uint32_t s = y/p_sum < rn/pn ? y/p_sum : rn/pn;
x = (s != 0 && r/s < pn) ? 1 + r/s : 0; // First s*pn ranks contribute partition index +1.
x += r == rn-1 ? y - s*p_sum : 0; // Last rank contributes discrepancy.
}
}
namespace {
// Generate this rank's float input (input_not_output=true) or the expected
// exact sum (input_not_output=false) for a floating-point summation over
// rank_n ranks. Mantissas are generated by genSumXY so the sum is exact (no
// rounding); all ranks share the same exponent window. With same_sign=false
// ranks are split into a positive and a negative group whose totals differ,
// avoiding signed-zero results.
template<typename T>
__host__ __device__ T genInOutFloatSum(
bool input_not_output, int rank_n, int rank_me, uint64_t seed, intptr_t index,
bool same_sign
) {
constexpr int exp_lo = 1 + FloatLayout<T>::mantissa_bits;
constexpr int exp_hi = (1<<FloatLayout<T>::exponent_bits)-1;
using uintmant_t = typename std::conditional<(8*sizeof(T) > 32), uint64_t, uint32_t>::type;
constexpr uintmant_t mant_mask = (uintmant_t(1) << FloatLayout<T>::mantissa_bits)-1;
constexpr uintmant_t max_mant = 2*mant_mask + 1; // add implicit leading 1
uint64_t rng = hashOf(seed, index);
int y_sign = rng & 1;
int x_sign = y_sign;
int xy_exp = exp_lo + umul32hi(uint32_t(rng>>32), exp_hi-exp_lo);
rng = mixBits(rng);
rank_me = shuffleRank(rank_n, rank_me, rng);
// If we're using mixed signs then partition into evens and odds.
int subrank_n = same_sign ? rank_n : (rank_n+1)/2;
int subrank_me = same_sign ? rank_me : rank_me/2;
uintmant_t x0_mant, y0_mant;
genSumXY(subrank_n, subrank_me, rng, max_mant, x0_mant, y0_mant);
if (!same_sign && (rank_n+0)/2 != 0) {
uintmant_t x1_mant, y1_mant = y0_mant;
// Avoid generating y1_mant == y0_mant so we don't have to worry about
// signed zero as the result.
genSumXY((rank_n+0)/2, rank_me/2, rng, max_mant, x1_mant, y1_mant, /*avoid_y=*/true);
y_sign ^= y0_mant < y1_mant ? 1 : 0;
y0_mant = (y0_mant < y1_mant ? -1 : 1)*(y0_mant - y1_mant);
x_sign ^= rank_me%2;
x0_mant = rank_me%2 == 0 ? x0_mant : x1_mant;
}
uintmant_t ans_mant = input_not_output ? x0_mant : y0_mant;
if(ans_mant == 0)
return T(0.0f);
else {
// Normalize: shift the mantissa so its leading 1 lands in the implicit
// position, adjusting the exponent to compensate.
int shift = clz64(ans_mant) - (64-FloatLayout<T>::mantissa_bits-1);
int ans_sign = input_not_output ? x_sign : y_sign;
int ans_exp = xy_exp - shift;
ans_mant <<= shift;
return makeFloat<T>(ans_sign, ans_exp, ans_mant & mant_mask);
}
}
}
namespace {
// Generate input/expected-output for the premulsum reduction on floats.
// Even ranks are scaled by 1.0 and odd ranks by 2.0 (matching
// ncclVerifiablePremulScalar), so the exact total is y0 + 2*y1 where y0/y1
// are the genSumXY totals of the two halves; mantissa budgets are shrunk
// (>>1, >>2) so the scaled sum still fits exactly.
template<typename T>
__host__ __device__ T genInOutFloatPreMulSum(
bool input_not_output, int rank_n, int rank_me, uint64_t seed, intptr_t index
) {
constexpr int exp_lo = 1 + FloatLayout<T>::mantissa_bits;
constexpr int exp_hi = (1<<FloatLayout<T>::exponent_bits)-1;
using uintmant_t = typename std::conditional<(8*sizeof(T) > 32), uint64_t, uint32_t>::type;
constexpr uintmant_t mant_mask = (uintmant_t(1) << FloatLayout<T>::mantissa_bits)-1;
constexpr uintmant_t max_mant = 2*mant_mask + 1; // add implicit leading 1
uint64_t rng = hashOf(seed, index);
int y_sign = rng & 1;
int y_exp = exp_lo + umul32hi(uint32_t(rng>>32), exp_hi-exp_lo);
rng = mixBits(rng);
int subrank_me0 = shuffleRank((rank_n+1)/2, rank_me/2, rng);
int subrank_me1 = shuffleRank((rank_n+0)/2, rank_me/2, rng);
// when ncclVerifiablePremulScalar() = 1.0 (rank_me%2 == 0)
uintmant_t x0_mant, y0_mant;
genSumXY((rank_n+1)/2, subrank_me0, rng, max_mant>>1, x0_mant, y0_mant);
// when ncclVerifiablePremulScalar() = 2.0 (rank_me%2 == 1)
uintmant_t x1_mant=0, y1_mant=0;
if((rank_n+0)/2 != 0)
genSumXY((rank_n+0)/2, subrank_me1, rng, max_mant>>2, x1_mant, y1_mant);
uintmant_t x_mant = rank_me%2 == 0 ? x0_mant : x1_mant;
uintmant_t y_mant = y0_mant + 2*y1_mant;
uintmant_t ans_mant = input_not_output ? x_mant : y_mant;
if(ans_mant == 0)
return T(0.0f);
else {
// Normalize mantissa into the implicit-leading-1 position.
int shift = clz64(ans_mant) - (64-FloatLayout<T>::mantissa_bits-1);
int ans_sign = y_sign;
int ans_exp = y_exp - shift;
ans_mant <<= shift;
return makeFloat<T>(ans_sign, ans_exp, ans_mant & mant_mask);
}
}
}
namespace {
// Generate input/expected-output for a floating-point product over rank_n
// ranks, constructed so the result is exact.
template<typename T>
__host__ __device__ T genInOutFloatProd(
bool input_not_output, int rank_n, int rank_me, uint64_t seed, intptr_t index
) {
// Three kinds of contributions (values for x):
// 1) x = random value: only one rank does this
// 2) x = 2^n: random positive n
// 3) x = 1
// Since only one rank submits a random value, the result of the product
// will have the same mantissa as that value but with an exponent incorporating
// the sum of the exponents from case (2)
uint64_t rng = hashOf(seed, index);
rank_me = shuffleRank(rank_n, rank_me, rng);
// Half the ranks contribute a negative sign; the product's sign follows.
int y_sign = (rank_n/2)%2;
int x_sign = rank_me%2;
constexpr unsigned max_exp = -1 + (1<<(FloatLayout<T>::exponent_bits-1));
unsigned x_exp=0, y_exp=0;
// genSumXY makes the per-rank exponents sum exactly to the output exponent.
genSumXY(rank_n, rank_me, rng, max_exp, x_exp, y_exp);
x_exp += FloatLayout<T>::exponent_bias;
y_exp += FloatLayout<T>::exponent_bias;
constexpr uint64_t mant_mask = (uint64_t(1)<<FloatLayout<T>::mantissa_bits)-1;
uint64_t y_mant = rng & mant_mask;
if (y_mant == 0) y_mant = 1;
return makeFloat<T>(
input_not_output ? x_sign : y_sign,
input_not_output ? x_exp : y_exp,
!input_not_output || rank_me==0 ? y_mant : 0
);
}
}
////////////////////////////////////////////////////////////////////////////////
// What follows is lots of overloads for genInput/genOutput to generate data
namespace {
// General case for integral data for all ops but ReduceNil/premulsum:
// each rank's element is an independent hash of (index, rank, seed),
// masked down to sizeof(T) bits and reinterpreted via a union.
template<typename T, typename ReduceFn,
typename = typename std::enable_if<
!std::is_same<ReduceFn, ReduceNil>::value
>::type>
__host__ __device__ void genInput(
T &ans, ReduceFn, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::true_type /*integral*/
) {
(void)rank_n; // silence unused warnings
union { uint64_t bits; T tmp; };
bits = uint64_t(-1)>>(64 - 8*sizeof(T));
bits &= hashOf(index ^ index<<16 ^ rank_me, seed);
// make sure we never return 0 in products
ans = std::is_same<ReduceFn, ReduceProd>::value && bits == 0 ? T(1) : tmp;
}
}
////////////////////////////////////////////////////////////////////////////////
// Dumb/generic case for genOutput just reduces results of genInput
namespace {
// Generic expected-output: regenerate every rank's input and fold them with
// the op's full preOp/reduce/postOp protocol. O(rank_n) per element, but
// exact by construction.
template<typename T, typename ReduceFn, bool IsIntegral>
__host__ __device__ void genOutput(
T &ans, ReduceFn op, int rank_n, uint64_t seed, intptr_t index,
std::integral_constant<bool, IsIntegral>
) {
T acc = genInput<T>(op, rank_n, 0, seed, index);
acc = op.preOp(acc, 0);
for(int r=1; r < rank_n; r++)
acc = op(acc, op.preOp(genInput<T>(op, rank_n, r, seed, index), r));
ans = op.postOp(acc);
}
}
////////////////////////////////////////////////////////////////////////////////
// Nil reduction (byte copy functions). Optimized to assume rank_n=1
namespace {
// Nil reduction (byte copies): the value depends only on (seed, index), not
// on the rank, so input and output trivially agree. Optimized to assume
// rank_n=1 semantics.
template<typename T, bool IsIntegral>
__host__ __device__ void genInput(
T &ans, ReduceNil, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::integral_constant<bool, IsIntegral>
) {
(void)rank_n, (void)rank_me; // silence unused warnings
union { uint64_t bits; T tmp; };
bits = mixBits(seed ^ index);
bits >>= 64 - 8*sizeof(T);
bits &= uint64_t(-1)>>(64 - 8*sizeof(T));
ans = tmp;
}
// Output of a copy is just rank 0's input.
template<typename T, typename ReduceFn, bool IsIntegral>
__host__ __device__ void genOutput(
T &ans, ReduceNil op, int rank_n, uint64_t seed, intptr_t index,
std::integral_constant<bool, IsIntegral>
) {
ans = genInput<T>(op, rank_n, 0, seed, index);
}
}
////////////////////////////////////////////////////////////////////////////////
// Sum of float
namespace {
// Floating-point sum: delegate to genInOutFloatSum with mixed signs so both
// positive and negative contributions are exercised.
template<typename T>
__host__ __device__ void genInput(
T &ans, ReduceSum, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatSum<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index, /*same_sign=*/false);
}
template<typename T>
__host__ __device__ void genOutput(
T &ans, ReduceSum, int rank_n, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatSum<T>(/*input_not_output=*/false, rank_n, 0, seed, index, /*same_sign=*/false);
}
}
////////////////////////////////////////////////////////////////////////////////
// Product of float
namespace {
// Floating-point product: delegate to genInOutFloatProd (exact by design).
template<typename T>
__host__ __device__ void genInput(
T &ans, ReduceProd, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatProd<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index);
}
template<typename T>
__host__ __device__ void genOutput(
T &ans, ReduceProd, int rank_n, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatProd<T>(/*input_not_output=*/false, rank_n, 0, seed, index);
}
}
////////////////////////////////////////////////////////////////////////////////
// PreMulSum of int/float
namespace {
// PreMulSum of int/float.
// Integral premulsum inputs reuse the plain-sum integral generator.
template<typename T>
__host__ __device__ void genInput(
T &ans, ReducePreMulSum, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::true_type integral
) {
genInput(ans, ReduceSum(), rank_n, rank_me, seed, index, integral);
}
// No genOutput overload specific to premulsum(int), just use generic case.
template<typename T>
__host__ __device__ void genInput(
T &ans, ReducePreMulSum, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatPreMulSum<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index);
}
template<typename T>
__host__ __device__ void genOutput(
T &ans, ReducePreMulSum, int rank_n, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatPreMulSum<T>(/*input_not_output=*/false, rank_n, 0, seed, index);
}
}
/////////////////////////////////////////////////////////////////////////////////
// Average of float
namespace {
// Floating-point average: inputs come from an exact same-sign sum; the
// expected output is that exact sum scaled by 1/rank_n through the same
// inhibit()-wrapped multiply the ReduceAvg functor applies, so rounding
// matches the reduction path.
template<typename T>
__host__ __device__ void genInput(
T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatSum<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index, /*same_sign=*/true);
}
template<typename T>
__host__ __device__ void genOutput(
T &ans, ReduceAvg, int rank_n, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatSum<T>(/*input_not_output=*/false, rank_n, 0, seed, index, /*same_sign=*/true);
using T1 = typename std::conditional<(sizeof(T)<sizeof(double)), float, double>::type;
//ans = ReduceProd()(ans, T1(1)/T1(rank_n));
ans = ReduceProd()(ans, inhibit(castTo<T>(T1(1)/T1(rank_n))));
}
}
/////////////////////////////////////////////////////////////////////////////////
// min/max of float
namespace {
// min/max of float: min reuses the max generator (the reduction result is
// still well defined — min and max just pick different elements of the same
// per-rank value set).
template<typename T>
__host__ __device__ void genInput(
T &ans, ReduceMin, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::false_type integral
) {
genInput<T>(ans, ReduceMax(), rank_n, rank_me, seed, index, integral);
}
// Random finite float per (rank, index): random sign/mantissa, exponent
// restricted to a mid band so values are never inf/NaN/denormal.
template<typename T>
__host__ __device__ void genInput(
T &ans, ReduceMax, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
(void)rank_n; // silence unused warnings
constexpr uint64_t mant_mask = (uint64_t(1) << FloatLayout<T>::mantissa_bits)-1;
uint64_t rng = hashOf(index ^ index<<16 ^ rank_me, seed);
int sign = rng & 1;
rng ^= rng>>1;
int exp = rng & ((1<<(FloatLayout<T>::exponent_bits-1))-1);
exp += 1<<(FloatLayout<T>::exponent_bits-2);
rng ^= rng >> FloatLayout<T>::exponent_bits;
uint64_t mant = rng & mant_mask;
ans = makeFloat<T>(sign, exp, mant);
}
// No genOutput overload specific to floating point min/max, just use generic case.
}
///////////////////////////////////////////////////////////////////////////////
// Entry API for genInput/genOutput
namespace {
// Entry points: dispatch to the right genInput/genOutput overload based on
// whether T is integral (per the IsIntegral trait) and return by value.
template<typename T, typename ReduceFn>
__host__ __device__ T genInput(
ReduceFn op, int rank_n, int rank_me, uint64_t seed, intptr_t index
) {
T ans;
genInput(ans, op, rank_n, rank_me, seed, index,
std::integral_constant<bool, IsIntegral<T>::value>());
return ans;
}
template<typename T, typename ReduceFn>
__host__ __device__ T genOutput(
ReduceFn op, int rank_n, uint64_t seed, intptr_t index
) {
T ans;
genOutput(ans, op, rank_n, seed, index,
std::integral_constant<bool, IsIntegral<T>::value>());
return ans;
}
}
////////////////////////////////////////////////////////////////////////////////
#if !SELF_TEST
namespace {
// Fill elts[0..elt_n) with this rank's generated inputs. Work is split into
// contiguous per-block ranges [i0,i1) (remainder spread over the first
// elt_n%gridDim.x blocks), with each thread striding by blockDim.x inside
// its block's range. elt_ix0 offsets the element index so chunked calls
// produce consistent data.
template<typename T, typename ReduceFn>
__global__ void prepareInput2(
T *elts, intptr_t elt_n, ReduceFn op, int rank_n, int rank_me,
uint64_t seed, intptr_t elt_ix0
) {
intptr_t i0 = blockIdx.x*(elt_n/gridDim.x);
i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x;
intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x);
i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x;
intptr_t i = i0 + threadIdx.x;
while(i < i1) {
elts[i] = genInput<T>(op, rank_n, rank_me, seed, elt_ix0+i);
#if 0
T output = genOutput<T>(op, rank_n, seed, elt_ix0+i);
printf("prepareInput2 T=%d seed=0x%llx r=%d ix=%lld x=%g output=%g elts=%p\n",
std::is_same<T,int>::value, (long long)seed, int(rank_me), (long long)i, (float)elts[i], (float)output, elts);
#endif
i += blockDim.x;
}
}
// Host-side launcher: picks a small grid sized to the element count and
// dispatches the kernel on the concrete element type.
template<typename ReduceOp>
void prepareInput1(
void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, int rank_me,
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
) {
int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
#define CASE_TY(T) prepareInput2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, rank_me, seed, elt_ix0); break;
switch(elt_ty) {
case ncclInt8: CASE_TY(int8_t)
case ncclUint8: CASE_TY(uint8_t)
case ncclInt32: CASE_TY(int32_t)
case ncclUint32: CASE_TY(uint32_t)
case ncclInt64: CASE_TY(int64_t)
case ncclUint64: CASE_TY(uint64_t)
case ncclFloat16: CASE_TY(__half)
#if HAVE_ncclBfloat16
case ncclBfloat16: CASE_TY(hip_bfloat16)
#endif
case ncclFloat32: CASE_TY(float)
case ncclFloat64: CASE_TY(double)
default: assert(0);
}
#undef CASE_TY
}
}
// Public entry point (declared in verifiable.h): enqueue a kernel that fills
// `elts` with rank `rank_me`'s contribution to reduction `red_op`.
// `red_op` is a ncclRedOp_t widened to int; with HAVE_ncclPreMulSum defined,
// any unrecognized value is treated as a PreMulSum op (matching the
// ncclNumOps encoding described in verifiable.h).
void ncclVerifiablePrepareInput(
    void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
  ) {
  // With a single contribution the result must equal the input bit-for-bit,
  // so ReduceNil is used regardless of the requested op.
  #define CASE_OP(op) \
    if(rank_n == 1) \
      prepareInput1(elts, elt_n, elt_ty, ReduceNil(), rank_n, rank_me, seed, elt_ix0, stream); \
    else \
      prepareInput1(elts, elt_n, elt_ty, op, rank_n, rank_me, seed, elt_ix0, stream); \
    break;
  switch(red_op) {
  case ncclSum: CASE_OP(ReduceSum())
  case ncclMin: CASE_OP(ReduceMin())
  case ncclMax: CASE_OP(ReduceMax())
  case ncclProd: CASE_OP(ReduceProd())
  #if HAVE_ncclAvg
  case ncclAvg: CASE_OP(ReduceAvg{rank_n})
  #endif
  #if HAVE_ncclPreMulSum
  default: CASE_OP(ReducePreMulSum())
  #endif
  }
  #undef CASE_OP
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if !SELF_TEST
namespace {
// Kernel: fill elts[0..elt_n) with the expected (fully reduced) output for
// each element index. Same block/thread work partition as prepareInput2.
template<typename T, typename ReduceFn>
__global__ void prepareExpected2(
    T *elts, intptr_t elt_n, ReduceFn op, int rank_n,
    uint64_t seed, intptr_t elt_ix0
  ) {
  // Contiguous slice per block; first elt_n%gridDim.x blocks get one extra.
  intptr_t chunk = elt_n/gridDim.x;
  intptr_t rem = elt_n%gridDim.x;
  intptr_t begin = blockIdx.x*chunk + (blockIdx.x < rem ? blockIdx.x : rem);
  intptr_t end = (blockIdx.x+1)*chunk + (blockIdx.x+1 < rem ? blockIdx.x+1 : rem);
  for(intptr_t ix = begin + threadIdx.x; ix < end; ix += blockDim.x) {
    elts[ix] = genOutput<T>(op, rank_n, seed, elt_ix0+ix);
  }
}
// Host-side launcher for prepareExpected2: dispatch on the runtime element
// type and enqueue the kernel on `stream`.
template<typename ReduceOp>
void prepareExpected1(
    void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n,
    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
  ) {
  // One block per ~4*512 elements, capped at 32 blocks of 512 threads
  // (same launch shape as prepareInput1).
  int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
  #define CASE_TY(T) prepareExpected2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, seed, elt_ix0); break;
  switch(elt_ty) {
  case ncclInt8: CASE_TY(int8_t)
  case ncclUint8: CASE_TY(uint8_t)
  case ncclInt32: CASE_TY(int32_t)
  case ncclUint32: CASE_TY(uint32_t)
  case ncclInt64: CASE_TY(int64_t)
  case ncclUint64: CASE_TY(uint64_t)
  case ncclFloat16: CASE_TY(__half)
  #if HAVE_ncclBfloat16
  case ncclBfloat16: CASE_TY(hip_bfloat16)
  #endif
  case ncclFloat32: CASE_TY(float)
  case ncclFloat64: CASE_TY(double)
  default: assert(0);
  }
  #undef CASE_TY
}
}
// Public entry point (declared in verifiable.h): enqueue a kernel that fills
// `elts` with the expected reduction results, so later verifications can
// compare against precomputed values instead of regenerating them.
void ncclVerifiablePrepareExpected(
    void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
    uint64_t seed, intptr_t elt_ix0, hipStream_t stream
  ) {
  // rank_n == 1: result equals the single input, so use ReduceNil
  // (mirrors ncclVerifiablePrepareInput).
  #define CASE_OP(op) \
    if(rank_n == 1) \
      prepareExpected1(elts, elt_n, elt_ty, ReduceNil(), rank_n, seed, elt_ix0, stream); \
    else \
      prepareExpected1(elts, elt_n, elt_ty, op, rank_n, seed, elt_ix0, stream); \
    break;
  switch(red_op) {
  case ncclSum: CASE_OP(ReduceSum())
  case ncclMin: CASE_OP(ReduceMin())
  case ncclMax: CASE_OP(ReduceMax())
  case ncclProd: CASE_OP(ReduceProd())
  #if HAVE_ncclAvg
  case ncclAvg: CASE_OP(ReduceAvg{rank_n})
  #endif
  #if HAVE_ncclPreMulSum
  default: CASE_OP(ReducePreMulSum())
  #endif
  }
  #undef CASE_OP
}
#endif
////////////////////////////////////////////////////////////////////////////////
namespace {
/* How we compare floating point values when exactness is impossible is interesting.
* First, we take note that simply reinterpreting integer bits as floating point
* gives us a monotonic mapping which exponentially spaces out floats. Thus
* consecutive integers encode consecutive floats. In general, using integer
* subtraction on the bitpatterns of two floats gives us an integer which is the
* logarithm of their relative difference. But, if the floats always have similar
* exponents, then the integer difference is actually proportional to the
* relative error (this is because we are counting hops in the mantissa bits only,
* not the exponent bits). So a cheap way to compare if two floats are relatively
* close is: abs(intBits(a) - intBits(b)) < tolerance. The following formula
* calculates such a tolerance for a summation of n floats. This formula
* was derived by inspecting the maximum observed integer difference over many
* random runs of summation. The parameter values were computed by the
* companion program "inexact_regress.cu".
*/
// Integer-ULP tolerance for a floating point sum/average over rank_n
// contributions: tol = 1 + coef * rank_n^power. The per-type (coef, power)
// pairs were fitted by the companion program "inexact_regress.cu".
__host__ __device__ unsigned calcSumFloatTolerance(int rank_n, int elt_ty) {
  float power, coef;
  switch(elt_ty) {
  // Fall through to the fp32/fp64 parameters for any unexpected type so
  // coef/power are never read uninitialized (previously UB: there was no
  // default case).
  default:
  case ncclFloat32:
  case ncclFloat64:
    power = .51f;
    coef = 1.25f;
    break;
  case ncclFloat16:
    power = .91f;
    coef = .75f;
    break;
#if HAVE_ncclBfloat16
  case ncclBfloat16:
    power = .91f;
    coef = .66f;
    break;
#endif
  }
  // __CUDA_ARCH__ alone is not defined during HIP/AMD device compilation;
  // include __HIP_DEVICE_COMPILE__ so device code uses the float intrinsic.
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
  return 1 + unsigned(coef*powf(float(rank_n), power));
#else
  return 1 + unsigned(coef*std::pow(float(rank_n), power));
#endif
}
// Absolute difference of the raw bit patterns of two same-size values,
// reinterpreted in the unsigned integer type matching sizeof(T).
template<typename T>
__host__ __device__ uint64_t calcDelta(T a, T b) {
  union Bits { T t; uint8_t i1; uint16_t i2; uint32_t i4; uint64_t i8; };
  Bits lhs, rhs;
  lhs.t = a;
  rhs.t = b;
  // sizeof(T) is a compile-time constant, so only one case survives.
  switch(sizeof(T)) {
  case 1: return lhs.i1 < rhs.i1 ? rhs.i1 - lhs.i1 : lhs.i1 - rhs.i1;
  case 2: return lhs.i2 < rhs.i2 ? rhs.i2 - lhs.i2 : lhs.i2 - rhs.i2;
  case 4: return lhs.i4 < rhs.i4 ? rhs.i4 - lhs.i4 : lhs.i4 - rhs.i4;
  default: return lhs.i8 < rhs.i8 ? rhs.i8 - lhs.i8 : lhs.i8 - rhs.i8;
  }
}
}
////////////////////////////////////////////////////////////////////////////////
#if !SELF_TEST
namespace {
// Kernel: count elements where |results[i] - expected[i]| exceeds
// `tolerance`. T is instantiated with the unsigned integer type matching the
// element width, so the difference is a bit-pattern (ULP-style) distance.
// Per-thread counts are accumulated into *bad_elt_n with a global atomic.
template<typename T>
__global__ void verifyPrepared(
    T const *results, T const *expected, intptr_t elt_n, unsigned tolerance, int64_t *bad_elt_n
  ) {
  // Contiguous slice per block; first elt_n%gridDim.x blocks get one extra.
  intptr_t chunk = elt_n/gridDim.x;
  intptr_t rem = elt_n%gridDim.x;
  intptr_t begin = blockIdx.x*chunk + (blockIdx.x < rem ? blockIdx.x : rem);
  intptr_t end = (blockIdx.x+1)*chunk + (blockIdx.x+1 < rem ? blockIdx.x+1 : rem);
  int64_t bad = 0;
  for(intptr_t ix = begin + threadIdx.x; ix < end; ix += blockDim.x) {
    T got = results[ix], want = expected[ix];
    T delta = got < want ? want - got : got - want;
    if(tolerance < delta) bad += 1;
  }
  // NOTE(review): cast assumes unsigned long is 64 bits (LP64 targets).
  atomicAdd((unsigned long *)bad_elt_n, (unsigned long)bad);
}
// Kernel: recompute the expected output on the fly for each element and count
// mismatches against `results`. A mismatch means the bit patterns, viewed as
// Uint, differ by more than `tolerance` (integer/ULP distance — see the
// comment above calcSumFloatTolerance). Per-thread counts are summed into
// *bad_elt_n with a global atomic.
template<typename T, typename Uint, typename ReduceFn>
__global__ void verifyInline2(
    T const *results, intptr_t elt_n, ReduceFn op, int rank_n, uint64_t seed,
    intptr_t elt_ix0, unsigned tolerance, int64_t *bad_elt_n
  ) {
  // Contiguous per-block slice: the first elt_n%gridDim.x blocks take one
  // extra element; threads then stride the slice by blockDim.x.
  intptr_t i0 = blockIdx.x*(elt_n/gridDim.x);
  i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x;
  intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x);
  i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x;
  intptr_t i = i0 + threadIdx.x;
  int64_t bad = 0;
  while(i < i1) {
    // Type-pun result and expectation so the distance can be computed in
    // integer space.
    union { T t; Uint u; } a, b;
    a.t = results[i];
    b.t = genOutput<T>(op, rank_n, seed, elt_ix0+i);
    Uint delta = a.u < b.u ? b.u - a.u : a.u - b.u;
    bad += tolerance < delta ? 1 : 0;
    #if 0
    T input = genInput<T>(op, rank_n, 0, seed, elt_ix0+i);
    if(tolerance < delta) {
      printf("verifyInline2 fail T=%d ix=%lld got=%g exp=%g input=%g\n",
             std::is_same<T,int>::value, (long long)i, (float)a.t, (float)b.t, (float)input);
    } else {
      printf("verifyInline2 pass T=%d ix=%lld got=%g exp=%g input=%g\n",
             std::is_same<T,int>::value, (long long)i, (float)a.t, (float)b.t, (float)input);
    }
    #endif
    i += blockDim.x;
  }
  //asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
  // NOTE(review): cast assumes unsigned long is 64 bits (LP64 targets).
  atomicAdd((unsigned long*)bad_elt_n, (unsigned long)bad);
}
// Host dispatcher: select the ReduceFn matching `red_op` and launch
// verifyInline2. rank_n == 1 uses ReduceNil since a single contribution is
// passed through unreduced (mirrors the prepare-side dispatchers).
template<typename T, typename Uint>
void verifyInline1(
    T const *results, intptr_t elt_n, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
    unsigned tolerance, int64_t *bad_elt_n, hipStream_t stream, int block_n
  ) {
  #define CASE_OP(op) \
    if(rank_n == 1) \
      verifyInline2<T, Uint><<<block_n, 512, 0, stream>>> \
        ((T const*)results, elt_n, ReduceNil(), rank_n, seed, elt_ix0, tolerance, bad_elt_n); \
    else \
      verifyInline2<T, Uint><<<block_n, 512, 0, stream>>> \
        ((T const*)results, elt_n, op, rank_n, seed, elt_ix0, tolerance, bad_elt_n); \
    break;
  switch(red_op) {
  case ncclSum: CASE_OP(ReduceSum())
  case ncclMin: CASE_OP(ReduceMin())
  case ncclMax: CASE_OP(ReduceMax())
  case ncclProd: CASE_OP(ReduceProd())
  #if HAVE_ncclAvg
  case ncclAvg: CASE_OP(ReduceAvg{rank_n})
  #endif
  #if HAVE_ncclPreMulSum
  default: CASE_OP(ReducePreMulSum())
  #endif
  }
  #undef CASE_OP
}
}
// Public entry point (declared in verifiable.h): count elements of `results`
// that disagree with expectation, accumulating into *bad_elt_n (which the
// header requires to live in host-accessible memory). If `expected` is
// non-null, compare bit patterns directly; otherwise regenerate the expected
// values inline (slower — see header comment).
void ncclVerifiableVerify(
    void const *results, void const *expected, intptr_t elt_n, int elt_ty,
    int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
    int64_t *bad_elt_n, hipStream_t stream
  ) {
  bool floating = elt_ty == ncclFloat16 || elt_ty == ncclFloat32 || elt_ty == ncclFloat64;
  #if HAVE_ncclBfloat16
  floating |= elt_ty == ncclBfloat16;
  #endif

  // Only floating-point average is allowed any slack; every other case must
  // match bit-for-bit (tolerance stays 0).
  unsigned tolerance = 0;
  #if HAVE_ncclAvg
  if (floating && red_op == ncclAvg)
    tolerance = calcSumFloatTolerance(rank_n, elt_ty);
  #endif

  // Same launch shape as the prepare-side launchers.
  int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));

  // Reset the counter before the kernels enqueued below add into it.
  *bad_elt_n = 0;

  // Uint is the unsigned integer type of the same width as T; the precomputed
  // path compares in Uint space, the inline path type-puns inside the kernel.
  #define CASE_TY(T, Uint) { \
    if(expected != nullptr) { \
      verifyPrepared<<<block_n, 512, 0, stream>>>((Uint const*)results, (Uint const*)expected, elt_n, tolerance, bad_elt_n); \
    } else { \
      verifyInline1<T, Uint>((T const*)results, elt_n, red_op, rank_n, seed, elt_ix0, tolerance, bad_elt_n, stream, block_n); \
    } \
  } break;
  switch(elt_ty) {
  case ncclInt8: CASE_TY(int8_t, uint8_t)
  case ncclUint8: CASE_TY(uint8_t, uint8_t)
  case ncclInt32: CASE_TY(int32_t, uint32_t)
  case ncclUint32: CASE_TY(uint32_t, uint32_t)
  case ncclInt64: CASE_TY(int64_t, uint64_t)
  case ncclUint64: CASE_TY(uint64_t, uint64_t)
  case ncclFloat16: CASE_TY(__half, uint16_t)
  #if HAVE_ncclBfloat16
  case ncclBfloat16: CASE_TY(hip_bfloat16, uint16_t)
  #endif
  case ncclFloat32: CASE_TY(float, uint32_t)
  case ncclFloat64: CASE_TY(double, uint64_t)
  default: assert(0);
  }
  #undef CASE_TY
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if SELF_TEST
#include <iostream>
// Self-test worker: for many element indices, recompute the reduction
// serially over the per-rank generated inputs and compare against genOutput's
// closed-form expectation. Prints a line for every mismatch exceeding the
// tolerance (only floating-point average gets nonzero tolerance).
template<typename T, typename Op>
__device__ void sweep2(int ty, char const *tyname, Op op, char const *opname, int rank_n) {
  // Debug filters, normally disabled:
  //if(!std::is_same<T,half>::value) return;
  //if(!std::is_same<Op,ReduceProd>::value) return;
  //if(rank_n!=3) return;

  unsigned tolerance = !IsIntegral<T>::value && std::is_same<Op,ReduceAvg>::value ? calcSumFloatTolerance(rank_n, ty) : 0;
  uint64_t seed = 0xc8e2bed69766d533;

  // Threads of the block each test a strided subset of 10000 indices.
  for(int ix=threadIdx.x; ix < 10000; ix+=blockDim.x) {
    //if(ix!=387) continue;
    T y = genOutput<T>(op, rank_n, seed, ix);
    T sum;
    // Fold contributions in rank order, applying the op's pre/post hooks as
    // the real reduction would.
    for(int r=0; r < rank_n; r++) {
      T x = genInput<T>(op, rank_n, r, seed, ix);
      x = op.preOp(x, r);
      // inhibit() presumably blocks compiler reassociation so the fold is
      // evaluated strictly in rank order — TODO confirm its definition.
      sum = r==0 ? x : op(sum, inhibit(x));
      //std::printf("x = %llx, sum = %llx\n", bitsOf(x), bitsOf(sum));
    }
    sum = op.postOp(sum);
    // Compare in bit-pattern space; any output here is a test failure.
    if(tolerance < calcDelta(sum, y)) {
      printf(
        //"%10g != %10g : T=%-8s op=%-9s rank_n=%-1d ix=%-1d\n",
        "%llx != %llx : T=%-8s op=%-9s rank_n=%-1d ix=%-1d\n",
        *(long long*)&sum, *(long long*)&y, tyname, opname, rank_n, ix
      );
    }
  }
}
// Exercise every reduction op for a spread of rank counts (2^p + p, so both
// power-of-two and off-by-p sizes are covered).
template<typename T>
__device__ void sweep1(int ty, char const *tyname) {
  for(int p=0; p < 10; p++) {
    int rank_n = (1<<p) + p;
    sweep2<T>(ty, tyname, ReduceSum(), "sum", rank_n);
    sweep2<T>(ty, tyname, ReduceProd(), "prod", rank_n);
    sweep2<T>(ty, tyname, ReduceMin(), "min", rank_n);
    sweep2<T>(ty, tyname, ReduceMax(), "max", rank_n);
    sweep2<T>(ty, tyname, ReducePreMulSum(), "premulsum", rank_n);
    sweep2<T>(ty, tyname, ReduceAvg{rank_n}, "avg", rank_n);
  }
}
// Self-test kernel: run the full sweep for every supported element type.
__global__ void sweep() {
  sweep1<int8_t>(ncclInt8, "int8");
  sweep1<uint8_t>(ncclUint8, "uint8");
  sweep1<int32_t>(ncclInt32, "int32");
  sweep1<uint32_t>(ncclUint32, "uint32");
  sweep1<int64_t>(ncclInt64, "int64");
  sweep1<uint64_t>(ncclUint64, "uint64");
  sweep1<__half>(ncclFloat16, "half");
  #if HAVE_ncclBfloat16
  sweep1<hip_bfloat16>(ncclBfloat16, "bfloat16");
  #endif
  sweep1<float>(ncclFloat32, "float");
  sweep1<double>(ncclFloat64, "double");
}
// Self-test driver: launches the sweep kernel; any output after the banner
// line indicates a mismatch. HIP API results are checked so a broken device
// setup cannot masquerade as a silent pass (previously return codes were
// ignored).
int main(int arg_n, char **args) {
  std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
  if(hipSetDevice(0) != hipSuccess) {
    std::cerr<<"hipSetDevice(0) failed"<<std::endl;
    return 1;
  }
  sweep<<<1,512>>>();
  // Catch launch-configuration errors immediately...
  if(hipGetLastError() != hipSuccess) {
    std::cerr<<"sweep kernel launch failed"<<std::endl;
    return 1;
  }
  // ...and asynchronous execution errors at completion.
  if(hipDeviceSynchronize() != hipSuccess) {
    std::cerr<<"hipDeviceSynchronize failed"<<std::endl;
    return 1;
  }
  return 0;
}
#endif
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef _d41d8cd98f00b204e9800998ecf8427e
#define _d41d8cd98f00b204e9800998ecf8427e
#include <hip/hip_runtime.h>
#include <stdint.h>
/* Routines for launching kernels that verify reduction results. A significant
* feature of these routines is they carefully craft floating point input
* to produce exactly predictable output.
*
* int elt_ty: actually just a ncclDataType_t
*
* int red_op: mostly just a ncclRedOp_t. Since PreMulSum ops are dynamically
* created, these are encoded as the value ncclNumOps and their scalar is
* assumed to be `ncclVerifiablePremulScalar(rank_me)`
*
* uint64_t seed: arbitrary 64-bits to use in seeding the random values
*
* intptr_t elt_ix0: index of first element pointed to by elts when generating
* random values. This makes it possible to generate subsequences independently
* as well as in aggregate.
*
* int rank_n: Number of contributions into the reduction. Non-reduction
* collectives like broadcast, gather, etc will always set this to one.
*
* int rank_me: Index of this contribution
*/
// Use this as the local scalar for PreMulSum ops
// Deterministic per-rank PreMulSum scalar: ranks alternate 1, 2, 1, 2, ...
template<typename T>
__host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
  float scalar = (rank_me & 1) ? 2.0f : 1.0f;
  return T(scalar);
}
// Enqueue kernel to generate data which is to be reduced.
void ncclVerifiablePrepareInput(
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
);
// Enqueue kernel to generate expected results of reduction.
void ncclVerifiablePrepareExpected(
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
);
// Enqueue kernel to verify reduced data matches expectation. The number of
// failed elements is written to bad_elt_n which must be in cudaHost memory.
// If `expected == nullptr` then the expected results are generated on-the-fly
// which can be costly. Thus if you plan to run the same reduction multiple
// times it is advantageous to precompute the expected values with
// ncclVerifiablePrepareExpected and pass them as `expected` here.
void ncclVerifiableVerify(
void const *results, void const *expected, intptr_t elt_n, int elt_ty,
int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
int64_t *bad_elt_n, hipStream_t stream
);
#endif
# Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
# Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE.txt for license information
# We require both of the following paths to be set upon including this makefile
# TEST_VERIFIABLE_SRCDIR = <points to this directory>
# TEST_VERIFIABLE_BUILDDIR = <points to destination of .o file>
TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o

# Depend on $(TEST_VERIFIABLE_HDRS), not the undefined $(TEST_VERIFY_REDUCE_HDRS):
# previously header edits never triggered a rebuild of verifiable.o.
$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFIABLE_HDRS)
	@printf "Compiling %s\n" $@
	@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
	echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu"
	$(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
version: 2
updates:
- package-ecosystem: "pip" # See documentation for possible values
directory: "/docs/sphinx" # Location of package manifests
open-pull-requests-limit: 10
schedule:
interval: "daily"
labels:
- "dependencies"
- "ci:docs-only"
reviewers:
- "samjwu"
# Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
*.gcov
/coverage/
build/
_build/
_images/
_static/
_templates/
_toc.yml
docBin/
_doxygen/
# pytest tmp files
*.pyc
*.log
__pycache__/
*.txt
.pytest_cache
\ No newline at end of file
// Copyright (c) 2020-2023 Advanced Micro Devices, Inc. All rights reserved.
// This file is for internal AMD use.
// If you are interested in running your own Jenkins, please raise a github issue for assistance.
// Build the project inside the platform's container using the build command
// configured on the project (project.paths.build_command).
def runCompileCommand(platform, project, jobName)
{
    project.paths.construct_build_prefix()
    def command = """#!/usr/bin/env bash
set -x
cd ${project.paths.project_build_prefix}
${project.paths.build_command}
"""
    platform.runCommand(this,command)
}
// Run the rccl unit tests inside the platform's container and publish the
// gtest XML results.
//   gfilter: gtest filter expression (e.g. "*" or "*sum_float32*")
//   envars:  extra environment assignments prepended to the test invocation.
//            Defaults to none so existing three-argument call sites (e.g. the
//            static-library pipeline) no longer fail with
//            MissingMethodException.
def runTestCommand (platform, project, gfilter, envars="")
{
    String sudo = auxiliary.sudo(platform.jenkinsLabel)
    def command = """#!/usr/bin/env bash
set -x
cd ${project.paths.project_build_prefix}/build/release/test
${sudo} ulimit -l unlimited
ulimit -a
${sudo} ${envars} RCCL_ENABLE_SIGNALHANDLER=0 NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./rccl-UnitTests --gtest_filter=${gfilter} --gtest_output=xml --gtest_color=yes
"""
    platform.runCommand(this, command)
    junit "${project.paths.project_build_prefix}/build/release/test/*.xml"
}
// Package the release build and archive the resulting artifacts on the job.
def runPackageCommand(platform, project, jobName)
{
    def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/release")
    platform.runCommand(this, packageHelper[0])
    platform.archiveArtifacts(this, packageHelper[1])
}

// Expose the helpers above to pipelines that `load` this script.
return this
#!/usr/bin/env groovy
// Copyright (c) 2020-2023 Advanced Micro Devices, Inc. All rights reserved.
// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
@Library('rocJenkins@pong') _
// This is file for internal AMD use.
// If you are interested in running your own Jenkins, please raise a github issue for assistance.
import com.amd.project.*
import com.amd.docker.*
import java.nio.file.Path
// "Extended" pipeline: full build ('./install.sh -t') plus the entire unit
// test suite on the architectures configured in the ci block below.
def runCI =
{
    nodeDetails, jobName->

    def prj = new rocProject('rccl', 'Extended')
    // Full suite is slow; allow up to 600 minutes of test time.
    prj.timeout.test = 600
    prj.paths.build_command = './install.sh -t'

    // Define test architectures, optional rocm version argument is available
    def nodes = new dockerNodes(nodeDetails, jobName, prj)

    boolean formatCheck = false

    def commonGroovy

    def compileCommand =
    {
        platform, project->
        commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
        commonGroovy.runCompileCommand(platform, project, jobName)
    }

    def testCommand =
    {
        platform, project->
        // All gtests ("*"), no extra environment variables.
        commonGroovy.runTestCommand(platform, project, "*", "")
    }

    def packageCommand =
    {
        platform, project->
        commonGroovy.runPackageCommand(platform, project, jobName)
    }

    buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
}

// Map upstream job names to triggers and node/architecture sets, then run.
ci: {
    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)

    def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]]
    propertyList = auxiliary.appendPropertyList(propertyList)

    def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([centos8:['8gfx906']])]
    jobNameList = auxiliary.appendJobNameList(jobNameList)

    propertyList.each
    {
        jobName, property->
        if (urlJobName == jobName)
            properties(auxiliary.addCommonProperties(property))
    }

    jobNameList.each
    {
        jobName, nodeDetails->
        if (urlJobName == jobName)
            stage(jobName) {
                runCI(nodeDetails, jobName)
            }
    }

    // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
    if(!jobNameList.keySet().contains(urlJobName))
    {
        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
        stage(urlJobName) {
            runCI([ubuntu18:['4gfx906']], urlJobName)
        }
    }
}
#!/usr/bin/env groovy
// Copyright (c) 2020-2023 Advanced Micro Devices, Inc. All rights reserved.
// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
@Library('rocJenkins@pong') _
// This is file for internal AMD use.
// If you are interested in running your own Jenkins, please raise a github issue for assistance.
import com.amd.project.*
import com.amd.docker.*
import java.nio.file.Path
// "PreCheckin" pipeline: fast build ('./install.sh -t --fast') plus the full
// unit test suite with UT_POW2_GPUS=1, across the architectures configured in
// the ci block below.
def runCI =
{
    nodeDetails, jobName->

    def prj = new rocProject('rccl', 'PreCheckin')
    prj.timeout.test = 300
    prj.paths.build_command = './install.sh -t --fast'

    // Define test architectures, optional rocm version argument is available
    def nodes = new dockerNodes(nodeDetails, jobName, prj)

    boolean formatCheck = false

    def commonGroovy

    def compileCommand =
    {
        platform, project->
        commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
        commonGroovy.runCompileCommand(platform, project, jobName)
    }

    def testCommand =
    {
        platform, project->
        // All gtests ("*"); restrict unit tests to power-of-two GPU counts.
        commonGroovy.runTestCommand(platform, project, "*", "UT_POW2_GPUS=1")
    }

    def packageCommand =
    {
        platform, project->
        commonGroovy.runPackageCommand(platform, project, jobName)
    }

    buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
}

// Map upstream job names to triggers and node/architecture sets, then run.
ci: {
    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)

    def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]]
    propertyList = auxiliary.appendPropertyList(propertyList)

    def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([sles15sp1:['4gfx906'],centos8:['8gfx908'],centos7:['8gfx906'],ubuntu18:['4gfx906', '4gfx908']])]
    jobNameList = auxiliary.appendJobNameList(jobNameList)

    propertyList.each
    {
        jobName, property->
        if (urlJobName == jobName)
            properties(auxiliary.addCommonProperties(property))
    }

    jobNameList.each
    {
        jobName, nodeDetails->
        if (urlJobName == jobName)
            stage(jobName) {
                runCI(nodeDetails, jobName)
            }
    }

    // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
    if(!jobNameList.keySet().contains(urlJobName))
    {
        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
        stage(urlJobName) {
            runCI([ubuntu18:['4gfx906']], urlJobName)
        }
    }
}
#!/usr/bin/env groovy
// Copyright (c) 2020-2023 Advanced Micro Devices, Inc. All rights reserved.
// This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/
@Library('rocJenkins@pong') _
// This is file for internal AMD use.
// If you are interested in running your own Jenkins, please raise a github issue for assistance.
import com.amd.project.*
import com.amd.docker.*
import java.nio.file.Path
// Static-analysis pipeline: no build or tests run here; buildProject performs
// the analysis when staticAnalysis is true.
def runCompileCommand(platform, project, jobName, boolean debug=false)
{
    // Only the source tree needs to be prepared; no compile is issued.
    project.paths.construct_build_prefix()
}

def runCI =
{
    nodeDetails, jobName->

    def prj = new rocProject('rccl-internal', 'StaticAnalysis')

    // Define test architectures, optional rocm version argument is available
    def nodes = new dockerNodes(nodeDetails, jobName, prj)

    boolean formatCheck = false
    boolean staticAnalysis = true

    def compileCommand =
    {
        platform, project->
        runCompileCommand(platform, project, jobName, false)
    }

    buildProject(prj , formatCheck, nodes.dockerArray, compileCommand, null, null, staticAnalysis)
}

// Run weekly (01:00 on Saturdays) on a CPU-only node.
ci: {
    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)

    properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * 6')])]))
    stage(urlJobName) {
        runCI([ubuntu20:['cpu']], urlJobName)
    }
}
#!/usr/bin/env groovy
// Copyright (c) 2020-2023 Advanced Micro Devices, Inc. All rights reserved.
@Library('rocJenkins@pong') _
import com.amd.project.*
import com.amd.docker.*
import java.nio.file.Path;
// "Static Library PreCheckin" pipeline: static build ('./install.sh -t
// --static') with a reduced test filter.
def runCI =
{
    nodeDetails, jobName->

    def prj = new rocProject('rccl', 'Static Library PreCheckin')
    prj.timeout.test = 1440
    prj.paths.build_command = './install.sh -t --static'
    def nodes = new dockerNodes(nodeDetails, jobName, prj)
    def commonGroovy
    boolean formatCheck = false

    def compileCommand =
    {
        platform, project->
        commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy"
        commonGroovy.runCompileCommand(platform, project, jobName)
    }

    def testCommand =
    {
        platform, project->
        // common.groovy's runTestCommand takes (platform, project, gfilter,
        // envars); pass empty envars explicitly — the previous three-argument
        // call raised MissingMethodException at runtime.
        commonGroovy.runTestCommand(platform, project, "*sum_float32*", "")
    }

    def packageCommand =
    {
        platform, project->
        commonGroovy.runPackageCommand(platform, project, jobName)
    }

    buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand)
}
// Map upstream job names to triggers and node/architecture sets, then run.
ci: {
    String urlJobName = auxiliary.getTopJobName(env.BUILD_URL)

    def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])],
                        "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])],
                        "rocm-docker":[]]
    propertyList = auxiliary.appendPropertyList(propertyList)

    def jobNameList = ["compute-rocm-dkms-no-npi":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']]),
                       "compute-rocm-dkms-no-npi-hipclang":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']]),
                       "rocm-docker":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']])]
    jobNameList = auxiliary.appendJobNameList(jobNameList)

    propertyList.each
    {
        jobName, property->
        if (urlJobName == jobName)
            properties(auxiliary.addCommonProperties(property))
    }

    jobNameList.each
    {
        jobName, nodeDetails->
        if (urlJobName == jobName)
            stage(jobName) {
                runCI(nodeDetails, jobName)
            }
    }

    // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901
    if(!jobNameList.keySet().contains(urlJobName))
    {
        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
        stage(urlJobName) {
            runCI([ubuntu16:['4gfx906']], urlJobName)
        }
    }
}
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
version: 2
build:
os: ubuntu-22.04
tools:
python: "3.8"
sphinx:
configuration: docs/conf.py
formats: [htmlzip, pdf, epub]
python:
install:
- requirements: docs/sphinx/requirements.txt
# Change Log for RCCL
Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io)
## Unreleased
### Changed
- Compatibility with NCCL 2.16.2
### Added
### Fixed
- Remove workaround and use indirect function call
### Removed
## Unreleased - RCCL 2.15.5 for ROCm 5.5.0
### Changed
- Compatibility with NCCL 2.15.5
- Unit test executable renamed to rccl-UnitTests
### Added
- HW-topology aware binary tree implementation
- Experimental support for MSCCL
- New unit tests for hipGraph support
- NPKit integration
### Fixed
- rocm-smi ID conversion
- Support for HIP_VISIBLE_DEVICES for unit tests
- Support for p2p transfers to non (HIP) visible devices
### Removed
- Removed TransferBench from tools. Exists in standalone repo: https://github.com/ROCmSoftwarePlatform/TransferBench
## RCCL-2.13.4 for ROCm 5.4.0
### Changed
- Compatibility with NCCL 2.13.4
- Improvements to RCCL when running with hipGraphs
- RCCL_ENABLE_HIPGRAPH environment variable is no longer necessary to enable hipGraph support
- Minor latency improvements
### Fixed
- Resolved potential memory access error due to asynchronous memset
## RCCL-2.12.10 for ROCm 5.3.0
### Changed
- Improvements to LL128 algorithms
### Added
- Adding initial hipGraph support via opt-in environment variable RCCL_ENABLE_HIPGRAPH
- Integrating with NPKit (https://github.com/microsoft/NPKit) profiling code
## RCCL-2.12.10 for ROCm 5.2.3
### Added
- Compatibility with NCCL 2.12.10
- Packages for test and benchmark executables on all supported OSes using CPack.
- Adding custom signal handler - opt-in with RCCL_ENABLE_SIGNALHANDLER=1
- Additional details provided if Binary File Descriptor library (BFD) is pre-installed
- Adding support for reusing ports in NET/IB channels
- Opt-in with NCCL_IB_SOCK_CLIENT_PORT_REUSE=1 and NCCL_IB_SOCK_SERVER_PORT_REUSE=1
- When "Call to bind failed : Address already in use" error happens in large-scale AlltoAll
(e.g., >=64 MI200 nodes), users are suggested to opt-in either one or both of the options
to resolve the massive port usage issue
- Avoid using NCCL_IB_SOCK_SERVER_PORT_REUSE when NCCL_NCHANNELS_PER_NET_PEER is tuned >1
### Removed
- Removed experimental clique-based kernels
## RCCL-2.11.4 for ROCm 5.2.0
### Changed
- Unit testing framework rework
- Minor bug fixes
### Known issues
- Managed memory is not currently supported for clique-based kernels
## RCCL-2.11.4 for ROCm 5.1.0
### Added
- Compatibility with NCCL 2.11.4
### Known issues
- Managed memory is not currently supported for clique-based kernels
## RCCL-2.10.3 for ROCm 5.0.0
### Added
- Compatibility with NCCL 2.10.3
### Known issues
- Managed memory is not currently supported for clique-based kernels
## RCCL-2.9.9 for ROCm 4.5.0
### Changed
- Packaging split into a runtime package called rccl and a development package called rccl-devel. The development package depends on runtime. The runtime package suggests the development package for all supported OSes except CentOS 7 to aid in the transition. The suggests feature in packaging is introduced as a deprecated feature and will be removed in a future rocm release.
### Added
- Compatibility with NCCL 2.9.9
### Known issues
- Managed memory is not currently supported for clique-based kernels
## [RCCL-2.8.4 for ROCm 4.3.0]
### Added
- Ability to select the number of channels to use for clique-based all reduce (RCCL_CLIQUE_ALLREDUCE_NCHANNELS). This can be adjusted to tune for performance when computation kernels are being executed in parallel.
### Optimizations
- Additional tuning for clique-based kernel AllReduce performance (still requires opt in with RCCL_ENABLE_CLIQUE=1)
- Modification of default values for number of channels / byte limits for clique-based all reduce based on device architecture
### Changed
- Replaced RCCL_FORCE_ENABLE_CLIQUE to RCCL_CLIQUE_IGNORE_TOPO
- Clique-based kernels can now be enabled on topologies where all active GPUs are XGMI-connected
- Topologies not normally supported by clique-based kernels require RCCL_CLIQUE_IGNORE_TOPO=1
### Fixed
- Install script '-r' flag invoked alone no longer incorrectly deletes any existing builds.
### Known issues
- Managed memory is not currently supported for clique-based kernels
## [RCCL-2.8.4 for ROCm 4.2.0]
### Added
- Compatibility with NCCL 2.8.4
### Optimizations
- Additional tuning for clique-based kernels
- Enabling GPU direct RDMA read from GPU
- Fixing potential memory leak issue when re-creating multiple communicators within same process
- Improved topology detection
### Known issues
- None
## [RCCL-2.7.8 for ROCm 4.1.0]
### Added
- Experimental support for clique-based kernels (opt in with RCCL_ENABLE_CLIQUE=1)
- Clique-based kernels may offer better performance for smaller input sizes
- Clique-based kernels are currently only enabled for AllReduce under a certain byte limit (controlled via RCCL_CLIQUE_ALLREDUCE_BYTE_LIMIT)
### Optimizations
- Performance improvements for Rome-based systems
### Known issues
- Clique-based kernels are currently experimental and have not been fully tested on all topologies. By default, clique-based kernels are disabled if the detected topology is not supported (override with RCCL_FORCE_ENABLE_CLIQUE)
- Clique-based kernels may hang if there are differences between environment variables set across ranks.
- Clique-based kernels may fail if the input / output device pointers are not the base device pointers returned by hipMalloc.
## [RCCL-2.7.8 for ROCm 3.9.0]
### Added
- Adding support for alltoallv RCCL kernel
### Optimizations
- Modifications to topology based on XGMI links
### Known issues
- None
## [RCCL-2.7.6 for ROCm 3.8.0]
### Added
- Support for static library builds
### Known issues
- None
## [RCCL-2.7.6 for ROCm 3.7.0]
### Added
- Updated to RCCL API version of 2.7.6
- Added gather, scatter and all-to-all collectives
## [RCCL-2.7.0 for ROCm 3.6.0]
### Added
- Updated to RCCL API version of 2.6.4
## [RCCL-2.7.0 for ROCm 3.5.0]
### Added
- Compatibility with NCCL 2.6
- Network interface improvements with API v3
### Optimizations
- Bug fixes and build time improvements for hip-clang
- Network topology detection
- Improved CPU type detection
- Infiniband adaptive routing support
### Changed
- Switched to hip-clang as default compiler
### Deprecated
- Deprecated hcc build
#
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
.PHONY : all clean

# Default target builds the library; `install` installs it (both delegate to
# the src/ sub-makefile via the src.% pattern rule below).
default : src.build
install : src.install

BUILDDIR ?= $(abspath ./build)
ABSBUILDDIR := $(abspath $(BUILDDIR))
TARGETS := src pkg
# Fan `clean` out to each sub-makefile (src.clean, pkg.clean).
clean: ${TARGETS:%=%.clean}
test.build: src.build
LICENSE_FILES := LICENSE.txt
LICENSE_TARGETS := $(LICENSE_FILES:%=$(BUILDDIR)/%)
# `lic` stages license text files into the build directory.
lic: $(LICENSE_TARGETS)

${BUILDDIR}/%.txt: %.txt
	@printf "Copying %-35s > %s\n" $< $@
	mkdir -p ${BUILDDIR}
	cp $< $@

# Delegate src.* / pkg.* targets to the corresponding subdirectory makefiles.
src.%:
	${MAKE} -C src $* BUILDDIR=${ABSBUILDDIR}
pkg.%:
	${MAKE} -C pkg $* BUILDDIR=${ABSBUILDDIR}

# Packaging requires the license files to be staged first.
pkg.debian.prep: lic
pkg.txz.prep: lic
# RCCL
ROCm Communication Collectives Library
## Introduction
RCCL (pronounced "Rickle") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, gather, scatter, and all-to-all. There is also initial support for direct GPU-to-GPU send and receive operations. It has been optimized to achieve high bandwidth on platforms using PCIe, xGMI as well as networking using InfiniBand Verbs or TCP/IP sockets. RCCL supports an arbitrary number of GPUs installed in a single node or multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
The collective operations are implemented using ring and tree algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API.
## Requirements
1. ROCm supported GPUs
2. ROCm stack installed on the system (HIP runtime & HIP-Clang)
## Quickstart RCCL Build
RCCL directly depends on HIP runtime plus the HIP-Clang compiler, which are part of the ROCm software stack.
For ROCm installation instructions, see https://github.com/RadeonOpenCompute/ROCm.
The root of this repository has a helper script 'install.sh' to build and install RCCL on Ubuntu with a single command. It does not take a lot of options and hard-codes configuration that can be specified through invoking cmake directly, but it's a great way to get started quickly and can serve as an example of how to build/install.
```shell
./install.sh --help
Options:
--address-sanitizer Build with address sanitizer enabled
--build_allreduce_only Build only AllReduce + sum + float kernel
  -d|--dependencies                 Install RCCL dependencies
--debug Build debug library
--enable_backtrace Build with custom backtrace support
--disable-colltrace Build without collective trace
--disable-msccl-kernel Build without MSCCL kernels
-f|--fast Quick-build RCCL (local gpu arch only, no backtrace, and collective trace support)
-h|--help Prints this help message
-i|--install Install RCCL library (see --prefix argument below)
-j|--jobs Specify how many parallel compilation jobs to run (16 by default)
-l|--local_gpu_only Only compile for local GPU architecture
--no_clean Don't delete files if they already exist
--npkit-enable Compile with npkit enabled
-p|--package_build Build RCCL package
--prefix Specify custom directory to install RCCL to (default: /opt/rocm)
  --rm-legacy-include-dir           Remove the legacy include directory (packaging option added for file/folder reorg backward compatibility)
--run_tests_all Run all rccl unit tests (must be built already)
-r|--run_tests_quick Run small subset of rccl unit tests (must be built already)
--static Build RCCL as a static library instead of shared library
-t|--tests_build Build rccl unit tests, but do not run
--time-trace Plot the build time of RCCL
--verbose Show compile commands
```
## Manual build
### To build the library :
```shell
$ git clone https://github.com/ROCmSoftwarePlatform/rccl.git
$ cd rccl
$ mkdir build
$ cd build
$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/opt/rocm/ ..
$ make -j 16 # Or some other suitable number of parallel jobs
```
You may substitute an installation path of your own choosing by passing CMAKE_INSTALL_PREFIX. For example:
```shell
$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/opt/rocm/ -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install ..
```
Note: ensure rocm-cmake is installed, `apt install rocm-cmake`.
### To build the RCCL package and install package :
Assuming you have already cloned this repository and built the library as shown in the previous section:
```shell
$ cd rccl/build
$ make package
$ sudo dpkg -i *.deb
```
RCCL package install requires sudo/root access because it creates a directory called "rccl" under /opt/rocm/. This is an optional step and RCCL can be used directly by including the path containing librccl.so.
## Enabling peer-to-peer transport
In order to enable peer-to-peer access on machines with PCIe-connected GPUs, the HSA environment variable HSA_FORCE_FINE_GRAIN_PCIE=1 is required to be set, on top of requiring GPUs that support peer-to-peer access and proper large BAR addressing support.
## Tests
There are rccl unit tests implemented with the Googletest framework in RCCL. The rccl unit tests require Googletest 1.10 or higher to build and execute properly (installed with the -d option to install.sh).
To invoke the rccl unit tests, go to the build folder, then the test subfolder, and execute the appropriate rccl unit test executable(s).
rccl unit test names are now of the format:
CollectiveCall.[Type of test]
Filtering of rccl unit tests should be done with environment variable and by passing the --gtest_filter command line flag, for example:
```shell
UT_DATATYPES=ncclBfloat16 UT_REDOPS=prod ./rccl-UnitTests --gtest_filter="AllReduce.C*"
```
will run only AllReduce correctness tests with the bfloat16 datatype. A list of available filtering environment variables appears at the top of every run. See "Running a Subset of the Tests" at https://chromium.googlesource.com/external/github.com/google/googletest/+/HEAD/googletest/docs/advanced.md for more information on how to form more advanced filters.
There are also other performance and error-checking tests for RCCL. These are maintained separately at https://github.com/ROCmSoftwarePlatform/rccl-tests.
See the rccl-tests README for more information on how to build and run those tests.
## NPKit
RCCL integrates [NPKit](https://github.com/microsoft/npkit), a profiler framework that enables collecting fine-grained trace events in RCCL components, especially in giant collective GPU kernels.
Please check [NPKit sample workflow for RCCL](https://github.com/microsoft/NPKit/tree/main/rccl_samples) as a fully automated usage example. It also provides good templates for the following manual instructions.
To manually build RCCL with NPKit enabled, pass `-DNPKIT_FLAGS="-DENABLE_NPKIT -DENABLE_NPKIT_...(other NPKit compile-time switches)"` with cmake command. All NPKit compile-time switches are declared in the RCCL code base as macros with prefix `ENABLE_NPKIT_`, and they control which information will be collected. Also note that currently NPKit only supports collecting non-overlapped events on GPU, and `-DNPKIT_FLAGS` should follow this rule.
To manually run RCCL with NPKit enabled, environment variable `NPKIT_DUMP_DIR` needs to be set as the NPKit event dump directory. Also note that currently NPKit only supports 1 GPU per process.
To manually analyze NPKit dump results, please leverage [npkit_trace_generator.py](https://github.com/microsoft/NPKit/blob/main/rccl_samples/npkit_trace_generator.py).
## Library and API Documentation
Please refer to the [RCCL Documentation Site](https://rocm.docs.amd.com/projects/rccl/en/latest/) for current documentation.
### How to build documentation
Run the steps below to build documentation locally.
```
cd docs
pip3 install -r sphinx/requirements.txt
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
```
## Copyright
All source code and accompanying documentation is copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
All modifications are copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
#! /bin/bash
# Build RCCL (with unit tests) out-of-tree under ./build.
#
# Environment:
#   ROCM_PATH - ROCm installation root (default: /opt/rocm)

# FIX: abort on the first failing command. Previously a failed mkdir/cd or
# cmake step fell through and make still ran (possibly in the wrong
# directory).
set -e

ROCM_PATH=${ROCM_PATH:="/opt/rocm"}

# Configure and compile in a freshly created ./build directory.
function build() {
    rm -rf build
    mkdir build && cd build
    # NOTE(review): gfx926/gfx928 are not public AMD gfx target names --
    # confirm these match the GPUs on the deployment systems.
    cmake -DCMAKE_INSTALL_PREFIX=${ROCM_PATH} -DAMDGPU_TARGETS="gfx906;gfx926;gfx928" -DBUILD_TESTS=ON ..
    make -j 32
}

main() {
    export CXX=${ROCM_PATH}/bin/hipcc
    # Required for peer-to-peer access over PCIe (see README).
    export HSA_FORCE_FINE_GRAIN_PCIE=1
    echo -e "============ BUILD START =============\n"
    sleep 5
    build
    echo -e "\n\n============ BUILD END =============\n"
}

# Invoke the main function at the end of the script.
main "$@"
\ No newline at end of file
# MIT License
#
# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Dependencies
# HIP dependency is handled earlier in the project cmake file
# when VerifyCompiler.cmake is included.
# GIT
# Test dependencies
# For downloading, building, and installing required dependencies
# For downloading, building, and installing required dependencies
include(cmake/DownloadProject.cmake)
# Prefer a GTest already installed on the system unless the user explicitly
# requested that dependencies be fetched and built locally.
if(NOT INSTALL_DEPENDENCIES)
  find_package(GTest 1.11)
endif()
if(NOT GTest_FOUND AND BUILD_TESTS OR INSTALL_DEPENDENCIES)
  message(STATUS "GTest not found. Downloading and building GTest.")
  # Download, build and install the googletest library into the build tree.
  set(GTEST_ROOT ${CMAKE_CURRENT_BINARY_DIR}/gtest CACHE PATH "")
  # NOTE(review): this URL embeds hard-coded credentials (huchen:huchen) over
  # plain http. Credentials belong in a git credential helper or CI secret,
  # not in the build scripts -- confirm and remove.
  download_project(PROJ googletest
                   GIT_REPOSITORY http://huchen:huchen@$ENV{GITLAB_SERVER}/Mirrors/googletest.git
                   GIT_TAG release-1.11.0
                   INSTALL_DIR ${GTEST_ROOT}
                   CMAKE_ARGS -DBUILD_GTEST=ON -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> ${COMPILER_OVERRIDE} -DBUILD_SHARED_LIBS=OFF
                   LOG_DOWNLOAD TRUE
                   LOG_CONFIGURE TRUE
                   LOG_BUILD TRUE
                   LOG_INSTALL TRUE
                   UPDATE_DISCONNECTED TRUE
  )
  set(GTEST_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/gtest/include CACHE PATH "")
  # The install layout uses lib/ or lib64/ depending on the platform.
  if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib)
    set(GTEST_BOTH_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib/libgtest.a;${CMAKE_CURRENT_BINARY_DIR}/gtest/lib/libgtest_main.a CACHE PATH "")
  elseif(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64)
    set(GTEST_BOTH_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64/libgtest.a;${CMAKE_CURRENT_BINARY_DIR}/gtest/lib64/libgtest_main.a CACHE PATH "")
  else()
    # FIX: the original issued FATAL_ERROR and then called find_package on
    # the following line, which could never execute; the dead call is removed.
    message(FATAL_ERROR "Cannot find gtest library installation path.")
  endif()
endif()
# Integer element types for which collective kernels are instantiated.
set(DATATYPES_INT
"int8_t"
"uint8_t"
"int32_t"
"uint32_t"
"int64_t"
"uint64_t"
)
# Floating-point element types for which collective kernels are instantiated.
# (expand_collectives skips these for the integer-only SumPostDiv reduction.)
set(DATATYPES_FLOAT
"half"
"float"
"double"
"rccl_bfloat16"
)
# Generate one .cpp instantiation file per (reduction op, datatype) pair for
# the collective given by FILE/FUNC, and append every generated file to
# HIP_SOURCES (propagated to the parent scope). Expects HIPIFY_DIR,
# DATATYPES_INT, DATATYPES_FLOAT, and optionally KERNELNAME to be set by
# the caller.
function(expand_collectives FILE FUNC)
  # FIX: the original if(FUNC STREQUAL "MscclKernel") branch set
  # REDOP_FILTERED to exactly the same list as the else() branch, so the
  # condition was redundant and has been removed.
  set(REDOP_FILTERED Sum Prod Min Max PreMulSum SumPostDiv)
  foreach(REDOP_CURRENT IN LISTS REDOP_FILTERED)
    foreach(DATA_TYPE ${DATATYPES_INT} ${DATATYPES_FLOAT})
      # SumPostDiv is integer-only: skip floating-point instantiations.
      if (REDOP_CURRENT STREQUAL "SumPostDiv" AND DATA_TYPE IN_LIST DATATYPES_FLOAT)
        continue()
      endif()
      set(FILE_NAME "${HIPIFY_DIR}/src/collectives/device/${FILE}_${REDOP_CURRENT}_${DATA_TYPE}.cpp")
      message(STATUS "Generating ${FILE_NAME}")
      if (FUNC STREQUAL "MscclKernel")
        file(WRITE ${FILE_NAME}
          "#include \"${FILE}_impl.h\"\n"
          "#include \"primitives.h\"\n"
          "#include \"collectives.h\"\n"
          "#include \"devcomm.h\"\n"
          "MSCCL_IMPL_KERNEL_ENTRY_FUNC_DEVREDOP_TYPE(${REDOP_CURRENT}, ${DATA_TYPE}, false);")
      else()
        file(WRITE ${FILE_NAME}
          "#include \"${FILE}.h\"\n"
          "#include \"common.h\"\n"
          "#include \"collectives.h\"\n"
          "IMPL_COLL3(${FUNC}, ${REDOP_CURRENT}, ${DATA_TYPE});")
      endif()
      list(APPEND HIP_SOURCES ${FILE_NAME})
    endforeach()
  endforeach()
  # Emit the host-side kernel entry file once per collective (not for MSCCL).
  if (KERNELNAME)
    if (NOT FUNC STREQUAL "MscclKernel")
      set(FILE_NAME "${HIPIFY_DIR}/src/collectives/device/${FILE}.cpp")
      file(WRITE ${FILE_NAME}
        "#include \"${FILE}.h\"\n"
        "#include \"common.h\"\n"
        "#include \"collectives.h\"\n"
        "IMPL_COLL_KERN(${FUNC}, RING, SIMPLE, Sum, int8_t, FUNC_INDEX_P2P);")
      list(APPEND HIP_SOURCES ${FILE_NAME})
    endif()
  endif()
  # Make the accumulated list visible to the caller.
  set(HIP_SOURCES ${HIP_SOURCES} PARENT_SCOPE)
endfunction()
# Find or download/install the rocm-cmake project. If no installed ROCM
# config is found, download the requested tag from GitHub, unpack it into
# the extern/ directory, build/install it there, and find it again.
set( PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern )
find_package(ROCM QUIET CONFIG PATHS /opt/rocm)
if(NOT ROCM_FOUND)
  set(rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download")
  file(
    DOWNLOAD https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip
    ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip
    STATUS rocm_cmake_download_status LOG rocm_cmake_download_log
  )
  list(GET rocm_cmake_download_status 0 rocm_cmake_download_error_code)
  if(rocm_cmake_download_error_code)
    message(FATAL_ERROR "Error: downloading "
      "https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip failed "
      "error_code: ${rocm_cmake_download_error_code} "
      "log: ${rocm_cmake_download_log} "
    )
  endif()
  execute_process(
    COMMAND ${CMAKE_COMMAND} -E tar xzf ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip
    WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}
    RESULT_VARIABLE rocm_cmake_unpack_error_code
  )
  # FIX: check the unpack result immediately. The original only tested it
  # after the configure/build steps below had already run against the
  # (possibly missing) unpacked tree. The error message also pointed at
  # CMAKE_CURRENT_BINARY_DIR instead of the actual archive location.
  if(rocm_cmake_unpack_error_code)
    message(FATAL_ERROR "Error: unpacking ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip failed")
  endif()
  # Configure and install rocm-cmake into extern/rocm-cmake.
  execute_process( COMMAND ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_EXTERN_DIR}/rocm-cmake .
                   WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag} )
  execute_process( COMMAND ${CMAKE_COMMAND} --build rocm-cmake-${rocm_cmake_tag} --target install
                   WORKING_DIRECTORY ${PROJECT_EXTERN_DIR})
  find_package( ROCM 0.7.3 REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake )
endif()
# Find available local ROCM targets
# NOTE: This will eventually be part of ROCm-CMake and should be removed at that time
# Detect the GPU architectures present on the local machine by running
# rocm_agent_enumerator, and store the de-duplicated list (e.g. "gfx906")
# in VARIABLE in the caller's scope; VARIABLE is "NOTFOUND" if the tool is
# missing, fails, or reports no GPU agents.
function(rocm_local_targets VARIABLE)
set(${VARIABLE} "NOTFOUND" PARENT_SCOPE)
# Locate the enumerator binary; result is cached by find_program.
find_program(_rocm_agent_enumerator rocm_agent_enumerator HINTS /opt/rocm/bin ENV ROCM_PATH)
if(NOT _rocm_agent_enumerator STREQUAL "_rocm_agent_enumerator-NOTFOUND")
execute_process(
COMMAND "${_rocm_agent_enumerator}"
RESULT_VARIABLE _found_agents
OUTPUT_VARIABLE _rocm_agents
ERROR_QUIET
)
if (_found_agents EQUAL 0)
# Output is one agent name per line; turn it into a CMake list.
string(REPLACE "\n" ";" _rocm_agents "${_rocm_agents}")
unset(result)
foreach (agent IN LISTS _rocm_agents)
# gfx000 is the CPU agent; it is not a compilable GPU target.
if (NOT agent STREQUAL "gfx000")
list(APPEND result "${agent}")
endif()
endforeach()
if(result)
list(REMOVE_DUPLICATES result)
set(${VARIABLE} "${result}" PARENT_SCOPE)
endif()
endif()
endif()
endfunction()
include(ROCMSetupVersion)
include(ROCMCreatePackage)
include(ROCMInstallTargets)
include(ROCMPackageConfigHelpers)
include(ROCMInstallSymlinks)
include(ROCMCheckTargetIds)
include(ROCMClients)
include(ROCMHeaderWrapper)
# Distributed under the OSI-approved MIT License. See accompanying
# file LICENSE or https://github.com/Crascit/DownloadProject for details.
# Template CMakeLists for the throw-away download project: DownloadProject.cmake
# runs configure_file() on this file, substituting the DL_ARGS_* variables,
# then configures and builds the result to perform the actual download.
cmake_minimum_required(VERSION 2.8.2)
project(${DL_ARGS_PROJ}-download NONE)
include(ExternalProject)
# Only the download/update steps matter; build and test are no-ops here.
ExternalProject_Add(${DL_ARGS_PROJ}-download
${DL_ARGS_UNPARSED_ARGUMENTS}
SOURCE_DIR "${DL_ARGS_SOURCE_DIR}"
BUILD_IN_SOURCE TRUE
TEST_COMMAND ""
)
\ No newline at end of file
# Distributed under the OSI-approved MIT License. See accompanying
# file LICENSE or https://github.com/Crascit/DownloadProject for details.
#
# MODULE: DownloadProject
#
# PROVIDES:
# download_project( PROJ projectName
# [PREFIX prefixDir]
# [DOWNLOAD_DIR downloadDir]
# [SOURCE_DIR srcDir]
# [BINARY_DIR binDir]
# [QUIET]
# ...
# )
#
# Provides the ability to download and unpack a tarball, zip file, git repository,
# etc. at configure time (i.e. when the cmake command is run). How the downloaded
# and unpacked contents are used is up to the caller, but the motivating case is
# to download source code which can then be included directly in the build with
# add_subdirectory() after the call to download_project(). Source and build
# directories are set up with this in mind.
#
# The PROJ argument is required. The projectName value will be used to construct
# the following variables upon exit (obviously replace projectName with its actual
# value):
#
# projectName_SOURCE_DIR
# projectName_BINARY_DIR
#
# The SOURCE_DIR and BINARY_DIR arguments are optional and would not typically
# need to be provided. They can be specified if you want the downloaded source
# and build directories to be located in a specific place. The contents of
# projectName_SOURCE_DIR and projectName_BINARY_DIR will be populated with the
# locations used whether you provide SOURCE_DIR/BINARY_DIR or not.
#
# The DOWNLOAD_DIR argument does not normally need to be set. It controls the
# location of the temporary CMake build used to perform the download.
#
# The PREFIX argument can be provided to change the base location of the default
# values of DOWNLOAD_DIR, SOURCE_DIR and BINARY_DIR. If all of those three arguments
# are provided, then PREFIX will have no effect. The default value for PREFIX is
# CMAKE_BINARY_DIR.
#
# The QUIET option can be given if you do not want to show the output associated
# with downloading the specified project.
#
# In addition to the above, any other options are passed through unmodified to
# ExternalProject_Add() to perform the actual download, patch and update steps.
#
# Only those ExternalProject_Add() arguments which relate to downloading, patching
# and updating of the project sources are intended to be used. Also note that at
# least one set of download-related arguments are required.
#
# If using CMake 3.2 or later, the UPDATE_DISCONNECTED option can be used to
# prevent a check at the remote end for changes every time CMake is run
# after the first successful download. See the documentation of the ExternalProject
# module for more information. It is likely you will want to use this option if it
# is available to you. Note, however, that the ExternalProject implementation contains
# bugs which result in incorrect handling of the UPDATE_DISCONNECTED option when
# using the URL download method or when specifying a SOURCE_DIR with no download
# method. Fixes for these have been created, the last of which is scheduled for
# inclusion in CMake 3.8.0. Details can be found here:
#
# https://gitlab.kitware.com/cmake/cmake/commit/bdca68388bd57f8302d3c1d83d691034b7ffa70c
# https://gitlab.kitware.com/cmake/cmake/issues/16428
#
# If you experience build errors related to the update step, consider avoiding
# the use of UPDATE_DISCONNECTED.
#
# EXAMPLE USAGE:
#
# include(DownloadProject)
# download_project(PROJ googletest
# GIT_REPOSITORY https://github.com/google/googletest.git
# GIT_TAG master
# UPDATE_DISCONNECTED 1
# QUIET
# )
#
# add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR})
#
#========================================================================================
# Remember where this module lives so the CMakeLists template next to it
# can be found from within the function below.
set(_DownloadProjectDir "${CMAKE_CURRENT_LIST_DIR}")
include(CMakeParseArguments)
# Download a project at configure time (see the module header above for the
# full argument documentation). On return, <PROJ>_SOURCE_DIR and
# <PROJ>_BINARY_DIR are set in the caller's scope.
function(download_project)
set(options QUIET)
set(oneValueArgs
PROJ
PREFIX
DOWNLOAD_DIR
SOURCE_DIR
BINARY_DIR
)
set(multiValueArgs "")
# Unrecognized arguments land in DL_ARGS_UNPARSED_ARGUMENTS and are passed
# straight through to ExternalProject_Add by the generated CMakeLists.
cmake_parse_arguments(DL_ARGS "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
# Hide output if requested
if (DL_ARGS_QUIET)
set(OUTPUT_QUIET "OUTPUT_QUIET")
else()
unset(OUTPUT_QUIET)
message(STATUS "Downloading/updating ${DL_ARGS_PROJ}")
endif()
# Set up where we will put our temporary CMakeLists.txt file and also
# the base point below which the default source and binary dirs will be.
# The prefix must always be an absolute path.
if (NOT DL_ARGS_PREFIX)
set(DL_ARGS_PREFIX "${CMAKE_BINARY_DIR}")
else()
get_filename_component(DL_ARGS_PREFIX "${DL_ARGS_PREFIX}" ABSOLUTE
BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
endif()
if (NOT DL_ARGS_DOWNLOAD_DIR)
set(DL_ARGS_DOWNLOAD_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-download")
endif()
# Ensure the caller can know where to find the source and build directories
if (NOT DL_ARGS_SOURCE_DIR)
set(DL_ARGS_SOURCE_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-src")
endif()
if (NOT DL_ARGS_BINARY_DIR)
set(DL_ARGS_BINARY_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-build")
endif()
set(${DL_ARGS_PROJ}_SOURCE_DIR "${DL_ARGS_SOURCE_DIR}" PARENT_SCOPE)
set(${DL_ARGS_PROJ}_BINARY_DIR "${DL_ARGS_BINARY_DIR}" PARENT_SCOPE)
# The way that CLion manages multiple configurations, it causes a copy of
# the CMakeCache.txt to be copied across due to it not expecting there to
# be a project within a project. This causes the hard-coded paths in the
# cache to be copied and builds to fail. To mitigate this, we simply
# remove the cache if it exists before we configure the new project. It
# is safe to do so because it will be re-generated. Since this is only
# executed at the configure step, it should not cause additional builds or
# downloads.
file(REMOVE "${DL_ARGS_DOWNLOAD_DIR}/CMakeCache.txt")
# Create and build a separate CMake project to carry out the download.
# If we've already previously done these steps, they will not cause
# anything to be updated, so extra rebuilds of the project won't occur.
# Make sure to pass through CMAKE_MAKE_PROGRAM in case the main project
# has this set to something not findable on the PATH.
configure_file("${_DownloadProjectDir}/DownloadProject.CMakeLists.cmake.in"
"${DL_ARGS_DOWNLOAD_DIR}/CMakeLists.txt")
execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}"
-D "CMAKE_MAKE_PROGRAM:FILE=${CMAKE_MAKE_PROGRAM}"
.
RESULT_VARIABLE result
${OUTPUT_QUIET}
WORKING_DIRECTORY "${DL_ARGS_DOWNLOAD_DIR}"
)
if(result)
message(FATAL_ERROR "CMake step for ${DL_ARGS_PROJ} failed: ${result}")
endif()
# Building the helper project triggers ExternalProject's download step.
execute_process(COMMAND ${CMAKE_COMMAND} --build .
RESULT_VARIABLE result
${OUTPUT_QUIET}
WORKING_DIRECTORY "${DL_ARGS_DOWNLOAD_DIR}"
)
if(result)
message(FATAL_ERROR "Build step for ${DL_ARGS_PROJ} failed: ${result}")
endif()
endfunction()
\ No newline at end of file
# Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Attempt to collect the latest git hash
# Generates/refreshes git_version.cpp in the build tree containing a single
# rcclGitHash string of the form "<branch>:<short-hash>[+]", where '+'
# marks a dirty working tree. The file is only rewritten when the content
# actually changes, so dependent targets are not rebuilt needlessly.
execute_process(COMMAND git log --pretty=format:'%h' -n 1
OUTPUT_VARIABLE GIT_REV
ERROR_QUIET)
# Check if git information was found
if ("${GIT_REV}" STREQUAL "")
set(CURR_GIT_VERSION "const char *rcclGitHash =\"Unknown \";")
else()
# Check for changes (denote with a '+') after hash
execute_process(
COMMAND bash -c "git diff --quiet --exit-code || echo +"
OUTPUT_VARIABLE GIT_DIFF)
# Collect branch information
execute_process(
COMMAND git rev-parse --abbrev-ref HEAD
OUTPUT_VARIABLE GIT_BRANCH)
string(STRIP "${GIT_REV}" GIT_REV)
# NOTE(review): the quotes in format:'%h' are passed literally to git, so
# GIT_REV looks like 'abcdef1'; this SUBSTRING appears to skip the leading
# quote and keep the 7-character short hash -- confirm before changing.
string(SUBSTRING "${GIT_REV}" 1 7 GIT_REV)
string(STRIP "${GIT_DIFF}" GIT_DIFF)
string(STRIP "${GIT_BRANCH}" GIT_BRANCH)
set(CURR_GIT_VERSION "const char *rcclGitHash =\"${GIT_BRANCH}:${GIT_REV}${GIT_DIFF}\";")
endif()
# Compare file with older git version file (git_version.cpp)
if (EXISTS ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp)
#MESSAGE(STATUS "Found ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp")
file(READ ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp PREV_GIT_VERSION)
#message(STATUS "CURR GIT version: ${CURR_GIT_VERSION}")
#message(STATUS "PREV GIT version: ${PREV_GIT_VERSION}")
if (NOT "${CURR_GIT_VERSION}" STREQUAL "${PREV_GIT_VERSION}")
message(STATUS "Updating git_version.cpp")
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp "${CURR_GIT_VERSION}")
else()
message(STATUS "No changes to git_version.cpp required")
endif()
else()
# Create git_version.cpp if it doesn't exist yet
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/git_version.cpp "${CURR_GIT_VERSION}")
endif()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment