Commit d8ca0a9e authored by jerrrrry

Initial commit
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "hip/hip_runtime.h"
#include "common.h"
void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count/nranks;
*recvcount = (count/nranks)*nranks;
*sendInplaceOffset = count/nranks;
*recvInplaceOffset = 0;
*paramcount = count/nranks;
}
testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
int k=0;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
if (rank == root) {
for (int j=0; j<nranks; j++) {
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, j*sendcount, type, ncclSum, rep, 1, 0));
}
}
k++;
}
HIPCHECK(hipDeviceSynchronize());
}
return testSuccess;
}
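// Bandwidth model: algBw counts the full gathered payload (count elements from each of
// nranks ranks); busBw applies the (nranks-1)/nranks factor since the root's own slice
// never has to cross a link.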
void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec;
*algBw = baseBw;
double factor = ((double)(nranks-1))/((double)(nranks));
*busBw = baseBw * factor;
}
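// Gather is emulated with point-to-point calls: every rank posts a send to the root and
// the root posts one receive per rank, all inside a single ncclGroupStart/ncclGroupEnd so
// the operations are issued together as one aggregate.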
testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
size_t rankOffset = count * wordSize(type);
if (count == 0) return testSuccess;
NCCLCHECK(ncclGroupStart());
NCCLCHECK(ncclSend(sendbuff, count, type, root, comm, stream));
if (rank == root) {
for (int r=0; r<nRanks; r++) {
NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, count, type, r, comm, stream));
}
}
NCCLCHECK(ncclGroupEnd());
return testSuccess;
}
struct testColl gatherTest = {
"Gather",
GatherGetCollByteCount,
GatherInitData,
GatherGetBw,
GatherRunColl
};
void GatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
GatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
args->collTest = &gatherTest;
ncclDataType_t *run_types;
const char **run_typenames;
int type_count;
int begin_root, end_root;
if ((int)type != -1) {
type_count = 1;
run_types = &type;
run_typenames = &typeName;
} else {
type_count = test_typenum;
run_types = test_types;
run_typenames = test_typenames;
}
if (root != -1) {
begin_root = end_root = root;
} else {
begin_root = 0;
end_root = args->nProcs*args->nThreads*args->nGpus-1;
}
for (int i=0; i<type_count; i++) {
for (int j=begin_root; j<=end_root; j++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
}
}
return testSuccess;
}
struct testEngine ncclTestEngine = {
GatherGetBuffSize,
GatherRunTest
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "hip/hip_runtime.h"
#include "common.h"
#define ALIGN 4
void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
size_t base = (count/(ALIGN*nranks))*ALIGN;
*sendcount = base;
*recvcount = base*nranks;
*sendInplaceOffset = base;
*recvInplaceOffset = 0;
*paramcount = base;
}
testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
int k=0;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
for (int j=0; j<nranks; j++) {
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
}
k++;
}
HIPCHECK(hipDeviceSynchronize());
}
return testSuccess;
}
void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec;
*algBw = baseBw;
double factor = 1;
*busBw = baseBw * factor;
}
testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
char* sbuff = (char*)sendbuff;
char* rbuff = (char*)recvbuff;
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
size_t rankSize = count * wordSize(type);
if (rbuff+rank*rankSize != sbuff) HIPCHECK(hipMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, hipMemcpyDeviceToDevice, stream));
// Hypercube AllGather
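// Recursive doubling: at step 'mask' each rank holds the contiguous block of 'mask' slots
// starting at s = rank & ~(mask-1); it swaps that block with partner rank^mask, whose
// block starts at r = s ^ mask. After log2(nRanks) steps every rank holds all slots.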
for (int mask=1; mask<nRanks; mask<<=1) {
NCCLCHECK(ncclGroupStart());
int s = rank & ~(mask-1);
int r = s ^ mask;
NCCLCHECK(ncclSend(rbuff+s*rankSize, count*mask, type, rank^mask, comm, stream));
NCCLCHECK(ncclRecv(rbuff+r*rankSize, count*mask, type, rank^mask, comm, stream));
NCCLCHECK(ncclGroupEnd());
}
return testSuccess;
}
struct testColl hyperCubeTest = {
"HyperCube",
HyperCubeGetCollByteCount,
HyperCubeInitData,
HyperCubeGetBw,
HyperCubeRunColl
};
void HyperCubeGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
HyperCubeGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
args->collTest = &hyperCubeTest;
ncclDataType_t *run_types;
const char **run_typenames;
int type_count;
if ((int)type != -1) {
type_count = 1;
run_types = &type;
run_typenames = &typeName;
} else {
type_count = test_typenum;
run_types = test_types;
run_typenames = test_typenames;
}
// Check if this is a power of 2
int nRanks = args->nProcs*args->nThreads*args->nGpus;
if (nRanks && !(nRanks & (nRanks - 1))) {
for (int i=0; i<type_count; i++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
}
} else {
printf("nRanks %d is not a power of 2, skipping\n", nRanks);
}
return testSuccess;
}
struct testEngine ncclTestEngine = {
HyperCubeGetBuffSize,
HyperCubeRunTest
};
/*************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL1_COMPAT_H
#define NCCL1_COMPAT_H
#ifndef NCCL_MAJOR // NCCL 1.x
#define NCCL_MAJOR 1
#define NCCL_MINOR 0
#define ncclNumOps nccl_NUM_OPS
#define ncclNumTypes nccl_NUM_TYPES
static ncclResult_t ncclGroupStart() { return ncclSuccess; }
static ncclResult_t ncclGroupEnd() { return ncclSuccess; }
#define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument;
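// The wrappers below take size_t counts and forward to the NCCL 1.x entry points, which
// take int counts (and, for ncclAllGather, a different argument order); the (int) cast in
// the forwarding call makes overload resolution pick the original NCCL 1.x function, so
// there is no self-recursion.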
static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
CHECKCOUNT(count);
return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream);
}
static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream) {
CHECKCOUNT(count);
return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream);
}
static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
ncclComm_t comm, hipStream_t stream) {
CHECKCOUNT(count);
return ncclBcast(buff, (int)count, datatype, root, comm, stream);
}
static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
hipStream_t stream) {
CHECKCOUNT(recvcount);
return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream);
}
static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
CHECKCOUNT(sendcount);
return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream);
}
#endif
#endif
/**
* MIT License
*
* Copyright 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/*!\file
* \brief rccl_bfloat16.h provides struct for rccl_bfloat16 typedef
*/
#ifndef _RCCL_BFLOAT16_H_
#define _RCCL_BFLOAT16_H_
#if __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
// If this is a C compiler, C++ compiler below C++11, or a host-only compiler, we only
// include a minimal definition of rccl_bfloat16
#include <stdint.h>
/*! \brief Struct to represent a 16 bit brain floating point number. */
typedef struct
{
uint16_t data;
} rccl_bfloat16;
#else // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <hip/hip_runtime.h>
#include <ostream>
#include <type_traits>
struct rccl_bfloat16
{
uint16_t data;
enum truncate_t
{
truncate
};
__host__ __device__ rccl_bfloat16() = default;
// round upper 16 bits of IEEE float to convert to bfloat16
explicit __host__ __device__ rccl_bfloat16(float f)
: data(float_to_bfloat16(f))
{
}
explicit __host__ __device__ rccl_bfloat16(float f, truncate_t)
: data(truncate_float_to_bfloat16(f))
{
}
// zero extend lower 16 bits of bfloat16 to convert to IEEE float
__host__ __device__ operator float() const
{
union
{
uint32_t int32;
float fp32;
} u = {uint32_t(data) << 16};
return u.fp32;
}
private:
static __host__ __device__ uint16_t float_to_bfloat16(float f)
{
union
{
float fp32;
uint32_t int32;
} u = {f};
if(~u.int32 & 0x7f800000)
{
// When the exponent bits are not all 1s, then the value is zero, normal,
// or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
// 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
// This causes the bfloat16's mantissa to be incremented by 1 if the 16
// least significant bits of the float mantissa are greater than 0x8000,
// or if they are equal to 0x8000 and the least significant bit of the
// bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
// the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
// has the value 0x7f, then incrementing it causes it to become 0x00 and
// the exponent is incremented by one, which is the next higher FP value
// to the unrounded bfloat16 value. When the bfloat16 value is subnormal
// with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
// to a normal value with an exponent of 0x01 and a mantissa of 0x00.
// When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
// incrementing it causes it to become an exponent of 0xFF and a mantissa
// of 0x00, which is Inf, the next higher value to the unrounded value.
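// Worked example (illustrative): 0x3f808000 has bfloat16 mantissa LSB 0 and lower bits
// exactly 0x8000, so adding 0x7fff leaves the upper half at 0x3f80 (round to even);
// 0x3f818000 has LSB 1, so adding 0x7fff + 1 carries into the upper half, giving 0x3f82.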
u.int32 += 0x7fff + ((u.int32 >> 16) & 1); // Round to nearest, round to even
}
else if(u.int32 & 0xffff)
{
// When all of the exponent bits are 1, the value is Inf or NaN.
// Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
// mantissa bit. Quiet NaN is indicated by the most significant mantissa
// bit being 1. Signaling NaN is indicated by the most significant
// mantissa bit being 0 but some other bit(s) being 1. If any of the
// lower 16 bits of the mantissa are 1, we set the least significant bit
// of the bfloat16 mantissa, in order to preserve signaling NaN in case
// the bloat16's mantissa bits are all 0.
u.int32 |= 0x10000; // Preserve signaling NaN
}
return uint16_t(u.int32 >> 16);
}
// Truncate instead of rounding, preserving SNaN
static __host__ __device__ uint16_t truncate_float_to_bfloat16(float f)
{
union
{
float fp32;
uint32_t int32;
} u = {f};
return uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
}
};
typedef struct
{
uint16_t data;
} rccl_bfloat16_public;
static_assert(std::is_standard_layout<rccl_bfloat16>{},
"rccl_bfloat16 is not a standard layout type, and thus is "
"incompatible with C.");
static_assert(std::is_trivial<rccl_bfloat16>{},
"rccl_bfloat16 is not a trivial type, and thus is "
"incompatible with C.");
static_assert(sizeof(rccl_bfloat16) == sizeof(rccl_bfloat16_public)
&& offsetof(rccl_bfloat16, data) == offsetof(rccl_bfloat16_public, data),
"internal rccl_bfloat16 does not match public rccl_bfloat16");
inline std::ostream& operator<<(std::ostream& os, const rccl_bfloat16& bf16)
{
return os << float(bf16);
}
inline __host__ __device__ rccl_bfloat16 operator+(rccl_bfloat16 a)
{
return a;
}
inline __host__ __device__ rccl_bfloat16 operator-(rccl_bfloat16 a)
{
a.data ^= 0x8000;
return a;
}
inline __host__ __device__ rccl_bfloat16 operator+(rccl_bfloat16 a, rccl_bfloat16 b)
{
return rccl_bfloat16(float(a) + float(b));
}
inline __host__ __device__ rccl_bfloat16 operator-(rccl_bfloat16 a, rccl_bfloat16 b)
{
return rccl_bfloat16(float(a) - float(b));
}
inline __host__ __device__ rccl_bfloat16 operator*(rccl_bfloat16 a, rccl_bfloat16 b)
{
return rccl_bfloat16(float(a) * float(b));
}
inline __host__ __device__ rccl_bfloat16 operator/(rccl_bfloat16 a, rccl_bfloat16 b)
{
return rccl_bfloat16(float(a) / float(b));
}
inline __host__ __device__ bool operator<(rccl_bfloat16 a, rccl_bfloat16 b)
{
return float(a) < float(b);
}
inline __host__ __device__ bool operator==(rccl_bfloat16 a, rccl_bfloat16 b)
{
return float(a) == float(b);
}
inline __host__ __device__ bool operator>(rccl_bfloat16 a, rccl_bfloat16 b)
{
return b < a;
}
inline __host__ __device__ bool operator<=(rccl_bfloat16 a, rccl_bfloat16 b)
{
return !(a > b);
}
inline __host__ __device__ bool operator!=(rccl_bfloat16 a, rccl_bfloat16 b)
{
return !(a == b);
}
inline __host__ __device__ bool operator>=(rccl_bfloat16 a, rccl_bfloat16 b)
{
return !(a < b);
}
inline __host__ __device__ rccl_bfloat16& operator+=(rccl_bfloat16& a, rccl_bfloat16 b)
{
return a = a + b;
}
inline __host__ __device__ rccl_bfloat16& operator-=(rccl_bfloat16& a, rccl_bfloat16 b)
{
return a = a - b;
}
inline __host__ __device__ rccl_bfloat16& operator*=(rccl_bfloat16& a, rccl_bfloat16 b)
{
return a = a * b;
}
inline __host__ __device__ rccl_bfloat16& operator/=(rccl_bfloat16& a, rccl_bfloat16 b)
{
return a = a / b;
}
inline __host__ __device__ rccl_bfloat16& operator++(rccl_bfloat16& a)
{
return a += rccl_bfloat16(1.0f);
}
inline __host__ __device__ rccl_bfloat16& operator--(rccl_bfloat16& a)
{
return a -= rccl_bfloat16(1.0f);
}
inline __host__ __device__ rccl_bfloat16 operator++(rccl_bfloat16& a, int)
{
rccl_bfloat16 orig = a;
++a;
return orig;
}
inline __host__ __device__ rccl_bfloat16 operator--(rccl_bfloat16& a, int)
{
rccl_bfloat16 orig = a;
--a;
return orig;
}
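// Usage sketch (illustrative): rccl_bfloat16 x(1.5f) converts with round-to-nearest-even,
// rccl_bfloat16 y(1.5f, rccl_bfloat16::truncate) drops the low 16 bits instead, and
// float(x) converts back by zero-extending the stored upper half.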
namespace std
{
constexpr __host__ __device__ bool isinf(rccl_bfloat16 a)
{
return !(~a.data & 0x7f80) && !(a.data & 0x7f);
}
constexpr __host__ __device__ bool isnan(rccl_bfloat16 a)
{
return !(~a.data & 0x7f80) && +(a.data & 0x7f);
}
constexpr __host__ __device__ bool iszero(rccl_bfloat16 a)
{
return !(a.data & 0x7fff);
}
inline rccl_bfloat16 sin(rccl_bfloat16 a)
{
return rccl_bfloat16(sinf(float(a)));
}
inline rccl_bfloat16 cos(rccl_bfloat16 a)
{
return rccl_bfloat16(cosf(float(a)));
}
}
#endif // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
#endif // _RCCL_BFLOAT16_H_
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count;
*recvcount = count;
*sendInplaceOffset = 0;
*recvInplaceOffset = 0;
*paramcount = *sendcount;
}
testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
int k=0;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
if (rank == root) TESTCHECK(InitDataReduce(args->expected[k], recvcount, 0, type, op, rep, nranks));
k++;
}
HIPCHECK(hipDeviceSynchronize());
}
return testSuccess;
}
void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * typesize) / 1.0E9 / sec;
*algBw = baseBw;
*busBw = baseBw;
}
testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream));
return testSuccess;
}
struct testColl reduceTest = {
"Reduce",
ReduceGetCollByteCount,
ReduceInitData,
ReduceGetBw,
ReduceRunColl
};
void ReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
ReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
args->collTest = &reduceTest;
ncclDataType_t *run_types;
ncclRedOp_t *run_ops;
const char **run_typenames, **run_opnames;
int type_count, op_count;
int begin_root, end_root;
if ((int)type != -1) {
type_count = 1;
run_types = &type;
run_typenames = &typeName;
} else {
type_count = test_typenum;
run_types = test_types;
run_typenames = test_typenames;
}
if ((int)op != -1) {
op_count = 1;
run_ops = &op;
run_opnames = &opName;
} else {
op_count = test_opnum;
run_ops = test_ops;
run_opnames = test_opnames;
}
if (root != -1) {
begin_root = end_root = root;
} else {
begin_root = 0;
end_root = args->nProcs*args->nThreads*args->nGpus-1;
}
for (int i=0; i<type_count; i++) {
for (int j=0; j<op_count; j++) {
for (int k=begin_root; k<=end_root; k++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], k));
}
}
}
return testSuccess;
}
struct testEngine ncclTestEngine = {
ReduceGetBuffSize,
ReduceRunTest
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
#define ALIGN 4
void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
size_t base = (count/(ALIGN*nranks))*ALIGN;
*sendcount = base*nranks;
*recvcount = base;
*sendInplaceOffset = 0;
*recvInplaceOffset = base;
*paramcount = base;
}
testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
int k=0;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
TESTCHECK(InitDataReduce(args->expected[k], recvcount, rank*recvcount, type, op, rep, nranks));
k++;
}
HIPCHECK(hipDeviceSynchronize());
}
return testSuccess;
}
void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec;
*algBw = baseBw;
double factor = ((double)(nranks - 1))/((double)nranks);
*busBw = baseBw * factor;
}
testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream));
return testSuccess;
}
struct testColl reduceScatterTest = {
"ReduceScatter",
ReduceScatterGetCollByteCount,
ReduceScatterInitData,
ReduceScatterGetBw,
ReduceScatterRunColl
};
void ReduceScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
ReduceScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
args->collTest = &reduceScatterTest;
ncclDataType_t *run_types;
ncclRedOp_t *run_ops;
const char **run_typenames, **run_opnames;
int type_count, op_count;
if ((int)type != -1) {
type_count = 1;
run_types = &type;
run_typenames = &typeName;
} else {
type_count = test_typenum;
run_types = test_types;
run_typenames = test_typenames;
}
if ((int)op != -1) {
run_ops = &op;
run_opnames = &opName;
op_count = 1;
} else {
op_count = test_opnum;
run_ops = test_ops;
run_opnames = test_opnames;
}
for (int i=0; i<type_count; i++) {
for (int j=0; j<op_count; j++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
}
}
return testSuccess;
}
struct testEngine ncclTestEngine = {
ReduceScatterGetBuffSize,
ReduceScatterRunTest
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = (count/nranks)*nranks;
*recvcount = count/nranks;
*sendInplaceOffset = 0;
*recvInplaceOffset = count/nranks;
*paramcount = count/nranks;
}
testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
int k=0;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
TESTCHECK(InitData(args->expected[k], recvcount, rank*recvcount, type, ncclSum, rep, 1, 0));
k++;
}
HIPCHECK(hipDeviceSynchronize());
}
return testSuccess;
}
void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec;
*algBw = baseBw;
double factor = ((double)(nranks-1))/((double)(nranks));
*busBw = baseBw * factor;
}
testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
size_t rankOffset = count * wordSize(type);
if (count == 0) return testSuccess;
NCCLCHECK(ncclGroupStart());
if (rank == root) {
for (int r=0; r<nRanks; r++) {
NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, type, r, comm, stream));
}
}
NCCLCHECK(ncclRecv(recvbuff, count, type, root, comm, stream));
NCCLCHECK(ncclGroupEnd());
return testSuccess;
}
struct testColl scatterTest = {
"Scatter",
ScatterGetCollByteCount,
ScatterInitData,
ScatterGetBw,
ScatterRunColl
};
void ScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
ScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
args->collTest = &scatterTest;
ncclDataType_t *run_types;
const char **run_typenames;
int type_count;
int begin_root, end_root;
if ((int)type != -1) {
type_count = 1;
run_types = &type;
run_typenames = &typeName;
} else {
type_count = test_typenum;
run_types = test_types;
run_typenames = test_typenames;
}
if (root != -1) {
begin_root = end_root = root;
} else {
begin_root = 0;
end_root = args->nProcs*args->nThreads*args->nGpus-1;
}
for (int i=0; i<type_count; i++) {
for (int j=begin_root; j<=end_root; j++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
}
}
return testSuccess;
}
struct testEngine ncclTestEngine = {
ScatterGetBuffSize,
ScatterRunTest
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count;
*recvcount = count;
*sendInplaceOffset = 0;
*recvInplaceOffset = 0;
*paramcount = *sendcount;
}
testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
int k=0;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
int peer = (rank-1+nranks)%nranks;
TESTCHECK(InitData(args->expected[k], recvcount, peer*recvcount, type, ncclSum, rep, 1, 0));
k++;
}
HIPCHECK(hipDeviceSynchronize());
}
// We don't support in-place sendrecv
args->reportErrors = in_place ? 0 : 1;
return testSuccess;
}
void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * typesize) / 1.0E9 / sec;
*algBw = baseBw;
double factor = 1;
*busBw = baseBw * factor;
}
testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
int nRanks;
NCCLCHECK(ncclCommCount(comm, &nRanks));
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
int recvPeer = (rank-1+nRanks) % nRanks;
int sendPeer = (rank+1) % nRanks;
NCCLCHECK(ncclGroupStart());
NCCLCHECK(ncclSend(sendbuff, count, type, sendPeer, comm, stream));
NCCLCHECK(ncclRecv(recvbuff, count, type, recvPeer, comm, stream));
NCCLCHECK(ncclGroupEnd());
return testSuccess;
}
struct testColl sendRecvTest = {
"SendRecv",
SendRecvGetCollByteCount,
SendRecvInitData,
SendRecvGetBw,
SendRecvRunColl
};
void SendRecvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
SendRecvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
args->collTest = &sendRecvTest;
ncclDataType_t *run_types;
ncclRedOp_t *run_ops;
const char **run_typenames, **run_opnames;
int type_count, op_count;
if ((int)type != -1) {
type_count = 1;
run_types = &type;
run_typenames = &typeName;
} else {
type_count = test_typenum;
run_types = test_types;
run_typenames = test_typenames;
}
if ((int)op != -1) {
op_count = 1;
run_ops = &op;
run_opnames = &opName;
} else {
op_count = test_opnum;
run_ops = test_ops;
run_opnames = test_opnames;
}
for (int i=0; i<type_count; i++) {
for (int j=0; j<op_count; j++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
}
}
return testSuccess;
}
struct testEngine ncclTestEngine = {
SendRecvGetBuffSize,
SendRecvRunTest
};
#include "timer.h"
// Make sure to compile this translation unit with the host compiler and not
// nvcc, lest you hit an internal compiler error (ICE) with GCC 10.3.0
#include <chrono>
namespace {
std::uint64_t now() {
using clock = std::chrono::steady_clock;
return std::chrono::duration_cast<std::chrono::nanoseconds>(clock::now().time_since_epoch()).count();
}
}
timer::timer() {
t0 = now();
}
double timer::elapsed() const {
std::uint64_t t1 = now();
return 1.e-9*(t1 - t0);
}
double timer::reset() {
std::uint64_t t1 = now();
double ans = 1.e-9*(t1 - t0);
t0 = t1;
return ans;
}
#ifndef _408319ecdd5b47b28bf8f511c4fdf816
#define _408319ecdd5b47b28bf8f511c4fdf816
#include <cstdint>
// Can't include <chrono> because of bug with gcc 10.3.0
class timer {
std::uint64_t t0;
public:
timer();
double elapsed() const;
double reset();
};
#endif
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
def pytest_addoption(parser):
parser.addoption("--hostfile", action="store", default="", help="specify MPI hostfile")
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
nthreads = ["1"]
nprocs = ["2"]
ngpus_single = ["1","2","4"]
ngpus_mpi = ["1","2"]
byte_range = [("4", "128M")]
op = ["sum", "prod", "min", "max"]
step_factor = ["2"]
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
memory_type = ["coarse","fine", "host"]
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/all_gather_perf"
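# Note: for the "fine" memory type the tests below prepend HSA_FORCE_FINE_GRAIN_PCIE=1 to
# the command line; since the joined string is executed with shell=True, that prefix sets
# the environment variable for the benchmark process.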
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_AllGatherSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
try:
args = [executable,
"-t", nthreads,
"-g", ngpus_single,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype,
"-y", memory_type]
if memory_type == "fine":
args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
args_str = " ".join(args)
rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
except subprocess.CalledProcessError as err:
print(rccl_test.stdout)
pytest.fail("AllGather test error(s) detected.")
assert rccl_test.returncode == 0
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_AllGatherMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
try:
mpi_hostfile = request.config.getoption('--hostfile')
if not mpi_hostfile:
args = ["mpirun -np", nprocs,
executable,
"-p 1",
"-t", nthreads,
"-g", ngpus_mpi,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype]
else:
args = ["mpirun -np", nprocs,
"-host", mpi_hostfile,
executable,
"-p 1",
"-t", nthreads,
"-g", ngpus_mpi,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype,
"-y", memory_type]
if memory_type == "fine":
args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
args_str = " ".join(args)
print(args_str)
rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
except subprocess.CalledProcessError as err:
print(rccl_test.stdout)
pytest.fail("AllGather test error(s) detected.")
assert rccl_test.returncode == 0
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
nthreads = ["1"]
nprocs = ["2"]
ngpus_single = ["1","2","4"]
ngpus_mpi = ["1","2"]
byte_range = [("4", "128M")]
op = ["sum", "prod", "min", "max"]
step_factor = ["2"]
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
memory_type = ["coarse","fine", "host"]
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/all_reduce_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_AllReduceSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
try:
args = [executable,
"-t", nthreads,
"-g", ngpus_single,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype,
"-y", memory_type]
if memory_type == "fine":
args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
args_str = " ".join(args)
rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
except subprocess.CalledProcessError as err:
print(rccl_test.stdout)
pytest.fail("AllReduce test error(s) detected.")
assert rccl_test.returncode == 0
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_AllReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
try:
mpi_hostfile = request.config.getoption('--hostfile')
if not mpi_hostfile:
args = ["mpirun -np", nprocs,
executable,
"-p 1",
"-t", nthreads,
"-g", ngpus_mpi,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype]
else:
args = ["mpirun -np", nprocs,
"-host", mpi_hostfile,
executable,
"-p 1",
"-t", nthreads,
"-g", ngpus_mpi,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype,
"-y", memory_type]
if memory_type == "fine":
args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
args_str = " ".join(args)
print(args_str)
rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
except subprocess.CalledProcessError as err:
print(rccl_test.stdout)
pytest.fail("AllReduce test error(s) detected.")
assert rccl_test.returncode == 0
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
nthreads = ["1"]
nprocs = ["2"]
ngpus_single = ["1","2","4"]
ngpus_mpi = ["1","2"]
byte_range = [("4", "128M")]
op = ["sum", "prod", "min", "max"]
step_factor = ["2"]
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
memory_type = ["coarse","fine", "host"]
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/broadcast_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_BroadcastSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
try:
args = [executable,
"-t", nthreads,
"-g", ngpus_single,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype,
"-y", memory_type]
if memory_type == "fine":
args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
args_str = " ".join(args)
rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
except subprocess.CalledProcessError as err:
print(rccl_test.stdout)
pytest.fail("Broadcast test error(s) detected.")
assert rccl_test.returncode == 0
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_BroadcastMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
try:
mpi_hostfile = request.config.getoption('--hostfile')
if not mpi_hostfile:
args = ["mpirun -np", nprocs,
executable,
"-p 1",
"-t", nthreads,
"-g", ngpus_mpi,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype]
else:
args = ["mpirun -np", nprocs,
"-host", mpi_hostfile,
executable,
"-p 1",
"-t", nthreads,
"-g", ngpus_mpi,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype,
"-y", memory_type]
if memory_type == "fine":
args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
args_str = " ".join(args)
print(args_str)
rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
except subprocess.CalledProcessError as err:
print(rccl_test.stdout)
pytest.fail("Broadcast test error(s) detected.")
assert rccl_test.returncode == 0
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
nthreads = ["1"]
nprocs = ["2"]
ngpus_single = ["1","2","4"]
ngpus_mpi = ["1","2"]
byte_range = [("4", "128M")]
op = ["sum", "prod", "min", "max"]
step_factor = ["2"]
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
memory_type = ["coarse","fine", "host"]
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/reduce_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_ReduceSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
try:
args = [executable,
"-t", nthreads,
"-g", ngpus_single,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype,
"-y", memory_type]
if memory_type == "fine":
args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
args_str = " ".join(args)
rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
except subprocess.CalledProcessError as err:
print(rccl_test.stdout)
pytest.fail("Reduce test error(s) detected.")
assert rccl_test.returncode == 0
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_ReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
try:
mpi_hostfile = request.config.getoption('--hostfile')
if not mpi_hostfile:
args = ["mpirun -np", nprocs,
executable,
"-p 1",
"-t", nthreads,
"-g", ngpus_mpi,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype]
else:
args = ["mpirun -np", nprocs,
"-host", mpi_hostfile,
executable,
"-p 1",
"-t", nthreads,
"-g", ngpus_mpi,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype,
"-y", memory_type]
if memory_type == "fine":
args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
args_str = " ".join(args)
print(args_str)
rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
except subprocess.CalledProcessError as err:
print(rccl_test.stdout)
pytest.fail("Reduce test error(s) detected.")
assert rccl_test.returncode == 0
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
nthreads = ["1"]
nprocs = ["2"]
ngpus_single = ["1","2","4"]
ngpus_mpi = ["1","2"]
byte_range = [("4", "128M")]
op = ["sum", "prod", "min", "max"]
step_factor = ["2"]
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
memory_type = ["coarse","fine", "host"]
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/reduce_scatter_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_ReduceScatterSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
try:
args = [executable,
"-t", nthreads,
"-g", ngpus_single,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype,
"-y", memory_type]
if memory_type == "fine":
args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
args_str = " ".join(args)
rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
except subprocess.CalledProcessError as err:
print(rccl_test.stdout)
pytest.fail("ReduceScatter test error(s) detected.")
assert rccl_test.returncode == 0
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_ReduceScatterMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
try:
mpi_hostfile = request.config.getoption('--hostfile')
if not mpi_hostfile:
args = ["mpirun -np", nprocs,
executable,
"-p 1",
"-t", nthreads,
"-g", ngpus_mpi,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype]
else:
args = ["mpirun -np", nprocs,
"-host", mpi_hostfile,
executable,
"-p 1",
"-t", nthreads,
"-g", ngpus_mpi,
"-b", byte_range[0],
"-e", byte_range[1],
"-o", op,
"-f", step_factor,
"-d", datatype,
"-y", memory_type]
if memory_type == "fine":
args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
args_str = " ".join(args)
print(args_str)
rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
except subprocess.CalledProcessError as err:
print(rccl_test.stdout)
pytest.fail("ReduceScatter test error(s) detected.")
assert rccl_test.returncode == 0
#
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE.txt for license information
#
#include ../../makefiles/common.mk
.PHONY: all clean
BUILDDIR := $(abspath ../../build)
DST_DIR := $(BUILDDIR)/test/verifiable
ROCM_PATH ?= /opt/rocm
MPI_HOME ?= /usr/lib/openmpi
PREFIX ?= /usr/local
VERBOSE ?= 0
DEBUG ?= 0
NCCL_HOME ?= ""
HIPCC = $(ROCM_PATH)/bin/hipcc
CXX = $(HIPCC)
HIPCUFLAGS := -std=c++14
LDFLAGS :=
HIPLDFLAGS :=
ifneq ($(NCCL_HOME), "")
HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME)
endif
HIPCUFLAGS += -I$(ROCM_PATH)/include
HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
LDFLAGS += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
ifeq ($(DEBUG), 0)
HIPCUFLAGS += -O3
else
HIPCUFLAGS += -O0 -g -ggdb3
endif
ifeq ($(VERBOSE), 0)
.SILENT:
endif
ifeq ($(MPI), 1)
HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi
else ifeq ($(MPICH), 1)
HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich -I/usr/include/x86_64-linux-gnu/mpich
HIPLDFLAGS += -L/usr/lib -lmpich
endif
LIBRARIES += rccl
HIPLDFLAGS += $(LIBRARIES:%=-l%)
all: $(DST_DIR)/verifiable.o $(DST_DIR)/self_test
clean:
rm -rf $(DST_DIR)
TEST_VERIFIABLE_SRCDIR := .
TEST_VERIFIABLE_BUILDDIR := $(DST_DIR)
include verifiable.mk
self_test: $(DST_DIR)/self_test
$(DST_DIR)/self_test: verifiable.cu verifiable.h
@printf "Linking %s\n" $@
@mkdir -p $(DST_DIR)
$(HIPCC) -o $@ $(HIPCUFLAGS) -DSELF_TEST=1 verifiable.cu $(HIPLDFLAGS)
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
/* Generate parameters for our error bound model of floating point average
* (sum of scaled values) by sampling sums of random sequences for each
* floating point type.
*
* The model has parameters "coef" and "power", where for two floats a & b,
* they are close enough if and only if:
* abs(intBits(a) - intBits(b)) <= 1 + coef*pow(rank_n, power);
*
* Where intBits(x) is the reinterpretation of the float bitpattern as an integer.
*
* Compile with hipcc (this file uses the HIP headers); the original CUDA build
* line was: nvcc -gencode=arch=compute_80,code=sm_80
*/
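/* Illustrative example of intBits (not produced by this program): consecutive
* floats map to consecutive integers, e.g. intBits(1.0f) == 0x3f800000 and the
* next representable float has intBits == 0x3f800001, so an integer difference
* of k means the two values are roughly k ULPs apart. */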
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdint>
#include <hip/hip_bfloat16.h>
#include <hip/hip_fp16.h>
using std::uint64_t;
using std::uint32_t;
using std::uint16_t;
using bfloat16 = hip_bfloat16;
template<typename T>
struct float_traits;
template<>
struct float_traits<float> {
static constexpr int mantissa_bits = 23;
static constexpr int exponent_bits = 8;
using uint_t = uint32_t;
__device__ static float make(double x) { return (float)x; }
__device__ static float make(uint64_t x) { return (float)x; }
__device__ static double todouble(float x) { return x; }
__device__ static float add(float a, float b) { return a+b; }
__device__ static float mul(float a, float b) { return a*b; }
};
template<>
struct float_traits<double> {
static constexpr int mantissa_bits = 52;
static constexpr int exponent_bits = 11;
using uint_t = uint64_t;
__device__ static double make(double x) { return x; }
__device__ static double make(uint64_t x) { return (double)x; }
__device__ static double todouble(double x) { return x; }
__device__ static double add(double a, double b) { return a+b; }
__device__ static double mul(double a, double b) { return a*b; }
};
template<>
struct float_traits<__half> {
static constexpr int mantissa_bits = 10;
static constexpr int exponent_bits = 5;
using uint_t = uint16_t;
__device__ static __half make(double x) { return __float2half((float)x); }
__device__ static __half make(uint64_t x) { return __int2half_rn(x); }
__device__ static double todouble(__half x) { return __half2float(x); }
__device__ static __half add(__half a, __half b) { return __hadd(a, b); }
__device__ static __half mul(__half a, __half b) { return __hmul(a, b); }
};
template<>
struct float_traits<bfloat16> {
static constexpr int mantissa_bits = 7;
static constexpr int exponent_bits = 8;
using uint_t = uint16_t;
__device__ static bfloat16 make(double x) { return bfloat16(x); }
__device__ static bfloat16 make(uint64_t x) { return bfloat16(x); }
__device__ static double todouble(bfloat16 x) { return double(x); }
__device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return bfloat16(__hadd((float)a, (float)b)); }
__device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return bfloat16(__hmul((float)a, (float)b)); }
};
template<typename F>
__device__ int compare(F a, F b) {
union { typename float_traits<F>::uint_t ua; F fa; };
union { typename float_traits<F>::uint_t ub; F fb; };
ua=0; ub=0;
fa=a; fb=b;
//std::printf("bits(%1.10f)=%x bits(%1.10f)=%x\n", fa, ua, fb, ub);
return ua < ub ? ub-ua : ua-ub;
}
struct xoshiro256ss {
uint64_t s[4];
__device__ xoshiro256ss(int seed) {
constexpr uint64_t src[4] = {0xbb99e851d1f545cc, 0xbfc4022389ca40cb, 0xe84aff5cb1914af5, 0x845999858284de77};
for(int i=0; i < 4; i++)
s[i] = src[i] + (seed + i)*0xb45de8a52fdb65d3;
}
__device__ uint64_t operator()() {
auto rol64 = [](uint64_t x, int k) {
return (x << k) | (x >> (64 - k));
};
uint64_t const result = rol64(s[1] * 5, 7) * 9;
uint64_t const t = s[1] << 17;
s[2] ^= s[0];
s[3] ^= s[1];
s[1] ^= s[2];
s[0] ^= s[3];
s[2] ^= t;
s[3] = rol64(s[3], 45);
return result;
}
};
static __device__ int __reduce_max_sync(unsigned int mask, int value)
{
//We ignore the mask, since all bits are set wherever this helper is
//called in the test code below.
int width = warpSize;
for (unsigned int i = warpSize; i; i >>= 1) {
value = max(__shfl_down(value, i, width), value);
}
return value;
}
template<typename F>
__global__ void kernel() {
using traits = float_traits<F>;
constexpr int samps = 4<<10;
__shared__ F accf[samps];
__shared__ double accd[samps];
xoshiro256ss rng(threadIdx.x);
float expo_avg = 1;
for(int pass=0; pass < 2; pass++) {
F scalar = traits::make(1.0/(3.14159 + .5*threadIdx.x));
int err_max = 0;
float coef = 0;
double expo_sum = 0;
int expo_n = 0;
int max_ranks = std::is_same<F,float>::value ? 16<<10 : 1<<traits::mantissa_bits;
for(int round=0; round < 1 + (16<<10)/max_ranks; round++) {
//for(int round=0; round < 2; round++) {
for(int i=threadIdx.x; i < samps; i += blockDim.x) {
accf[i] = (F)0;
accd[i] = 0;
}
__syncthreads();
for(int r=0; r < max_ranks; r++) {
int err = 0;
for(int i=threadIdx.x; i < samps; i+=blockDim.x) {
constexpr uint64_t m = (1ll<<traits::mantissa_bits)-1;
double d = std::is_same<F,float>::value ? double(rng() & m) : 1.0;
F f = traits::make(d);
accf[i] = traits::add(accf[i], traits::mul(scalar, f));
accd[i] += traits::todouble(f);
//if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d f=%f\n", r, traits::todouble(accf[i]));
int e = compare(accf[i], traits::mul(scalar, traits::make(accd[i])));
err = err > e ? err : e;
}
err = __reduce_max_sync(-1u, err);
err_max = err_max > err ? err_max : err;
if (r >= 2) {
// err = 1 + coef*pow(r,expo)
float c = float(err-1)/powf(float(r), expo_avg);
coef = coef > c ? coef : c;
}
if (r >= 2) {
double expo = log2f(1+err_max)/log2f(r);
expo_sum += expo;
expo_n++;
//if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d err=%d errmax=%d expo=%f sum=%f n=%d\n", r, err, err_max, expo, expo_sum, expo_n);
}
}
}
if(pass==0)
expo_avg = expo_sum/expo_n;
else if(threadIdx.x == 0)
printf(" coef=%1.10f expo=%1.10f\n", coef, expo_avg);
}
}
int main() {
std::printf("type=float:\n");
kernel<float><<<1,32>>>();
hipDeviceSynchronize();
std::printf("\ntype=half:\n");
kernel<half><<<1,32>>>();
hipDeviceSynchronize();
std::printf("\ntype=bfloat16:\n");
kernel<bfloat16><<<1,32>>>();
hipDeviceSynchronize();
return 0;
}
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
//#pragma nv_diag_suppress declared_but_not_referenced
#include "verifiable.h"
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <hip/hip_bfloat16.h>
#include "rccl/rccl.h"
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && RCCL_BFLOAT16 ==1
#define HAVE_ncclBfloat16 1
#else
#define HAVE_ncclBfloat16 0
#endif
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
#define HAVE_ncclAvg 1
#else
#define HAVE_ncclAvg 0
#endif
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
#define HAVE_ncclPreMulSum 1
#else
#define HAVE_ncclPreMulSum 0
#endif
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdint>
#include <cmath>
#include <unistd.h>
using std::size_t;
using std::int8_t;
using std::int16_t;
using std::int32_t;
using std::int64_t;
using std::uint8_t;
using std::uint16_t;
using std::uint32_t;
using std::uint64_t;
////////////////////////////////////////////////////////////////////////////////
namespace {
template<typename T>
__device__ unsigned long long bitsOf(T x) {
union { unsigned long long ull; T val; } u;
u.ull = 0;
u.val = x;
return u.ull;
}
__host__ __device__ uint64_t mixBits(uint64_t x) {
union { uint32_t u32[2]; uint64_t u64; };
u64 = x;
u32[1] += 1;
u32[0] ^= u32[1];
u64 *= 0x9e3779b97f4a7c13u;
u32[0] ^= u32[1]<<16 ^ u32[1]>>16;
return u64;
}
__host__ __device__ uint64_t hashOf(uint64_t a, uint64_t b=0) {
a += uint64_t(1)<<32;
a += b;
a ^= a>>32;
a *= 0x9e3779b97f4a7c13u;
a += b>>16 ^ b<<48;
a ^= a>>32;
a *= 0xc4ceb9fe1a85ec53u;
return a;
}
}
////////////////////////////////////////////////////////////////////////////////
namespace {
template<typename T>
struct IsIntegral: std::is_integral<T> {};
template<>
struct IsIntegral<__half>: std::false_type {};
#if RCCL_BFLOAT16 == 1
template<>
struct IsIntegral<hip_bfloat16>: std::false_type {};
#endif
}
////////////////////////////////////////////////////////////////////////////////
// Hide a value from arithmetic optimizations. Hopefully compiler cannot detect
// that this is equivalent to the identity function.
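// Note: 0x0000000100000001 * 0xffffffff00000001 == 2^96 + 1 == 1 (mod 2^64), so
// the two integer multiplies below leave the bit pattern unchanged; they only
// route the value through integer arithmetic that the optimizer is unlikely to
// see through.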
template<typename T>
__host__ __device__ T inhibit(T x) {
union { uint64_t u64; T val; };
u64 = 0;
val = x;
u64 *= 0x0000000100000001u;
u64 *= 0xffffffff00000001u;
return val;
}
////////////////////////////////////////////////////////////////////////////////
namespace {
template<typename Y, typename X>
__host__ __device__ Y castTo(X x) {
return Y(x);
}
template<typename Y>
__host__ __device__ Y castTo(float x) {
return Y(x);
}
template<>
__host__ __device__ __half castTo<__half>(float x) {
return __float2half(x);
}
#if RCCL_BFLOAT16 == 1
template<>
__host__ __device__ hip_bfloat16 castTo<hip_bfloat16>(float x) {
return hip_bfloat16(x);
}
#endif
}
////////////////////////////////////////////////////////////////////////////////
// The reduction functions
namespace {
struct ReduceNil {
template<typename T>
__host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
template<typename T>
__host__ __device__ T operator()(T a, T /*b*/) const { return a; }
template<typename T>
__host__ __device__ T postOp(T x) const { return x; }
};
struct ReduceSum {
template<typename T>
__host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
template<typename T, typename=decltype(T()+T())>
__host__ __device__ T operator()(T a, T b) const { return a + b; }
__host__ __device__ __half operator()(__half a, __half b) const {
return __float2half(__half2float(a) + __half2float(b));
}
#if RCCL_BFLOAT16 == 1
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
return hip_bfloat16(static_cast<float>(a) + static_cast<float>(b));
}
#endif
template<typename T>
__host__ __device__ T postOp(T x) const { return x; }
};
struct ReduceProd {
template<typename T>
__host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
template<typename T, typename=decltype(T()*T())>
__host__ __device__ T operator()(T a, T b) const { return a * b; }
__host__ __device__ __half operator()(__half a, __half b) const {
return __float2half(__half2float(a) * __half2float(b));
}
#if RCCL_BFLOAT16 == 1
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
return hip_bfloat16(static_cast<float>(a) * static_cast<float>(b));
}
#endif
template<typename T>
__host__ __device__ T postOp(T x) const { return x; }
};
struct ReduceMin {
template<typename T>
__host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
template<typename T, typename=decltype(T()<T())>
__host__ __device__ T operator()(T a, T b) const { return a < b ? a : b; }
__host__ __device__ __half operator()(__half a, __half b) const {
return __half2float(a) < __half2float(b) ? a : b;
}
#if RCCL_BFLOAT16 == 1
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
return static_cast<float>(a) < static_cast<float>(b) ? a : b;
}
#endif
template<typename T>
__host__ __device__ T postOp(T x) const { return x; }
};
struct ReduceMax {
template<typename T>
__host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
template<typename T, typename=decltype(T()>T())>
__host__ __device__ T operator()(T a, T b) const { return a > b ? a : b; }
__host__ __device__ __half operator()(__half a, __half b) const {
return __half2float(a) > __half2float(b) ? a : b;
}
#if RCCL_BFLOAT16 == 1
__host__ __device__ hip_bfloat16 operator()(hip_bfloat16 a, hip_bfloat16 b) const {
return static_cast<float>(a) > static_cast<float>(b) ? a : b;
}
#endif
template<typename T>
__host__ __device__ T postOp(T x) const { return x; }
};
struct ReducePreMulSum {
template<typename T>
__host__ __device__ T preOp(T x, int rank_me) const {
return ReduceProd()(x, ncclVerifiablePremulScalar<T>(rank_me));
}
template<typename T>
__host__ __device__ T operator()(T a, T b) const { return ReduceSum()(a, b); }
template<typename T>
__host__ __device__ T postOp(T x) const { return x; }
};
template<typename T, bool integral = IsIntegral<T>::value>
struct ReduceAvg_Base;
template<typename T>
struct ReduceAvg_Base<T, /*integral=*/true> {
int rank_n;
__host__ __device__ T preOp(T x, int /*rank_me*/) const { return x; }
__host__ __device__ T operator()(T a, T b) const { return ReduceSum()(a, b); }
__host__ __device__ T postOp(T x) const { return x/rank_n; }
};
template<typename T>
struct ReduceAvg_Base<T, /*integral=*/false> {
int rank_n;
__host__ __device__ T preOp(T x, int /*rank_me*/) const {
using T1 = typename std::conditional<(sizeof(T)<sizeof(double)), float, double>::type;
return ReduceProd()(inhibit(castTo<T>(T1(1)/T1(rank_n))), inhibit(x));
}
__host__ __device__ T operator()(T a, T b) const { return ReduceSum()(a, b); }
__host__ __device__ T postOp(T x) const { return x; }
};
struct ReduceAvg {
int rank_n;
template<typename T>
__host__ __device__ T preOp(T x, int rank_me) const {
return ReduceAvg_Base<T>{rank_n}.preOp(x, rank_me);
}
template<typename T>
__host__ __device__ T operator()(T a, T b) const {
return ReduceAvg_Base<T>{rank_n}(a, b);
}
template<typename T>
__host__ __device__ T postOp(T x) const {
return ReduceAvg_Base<T>{rank_n}.postOp(x);
}
};
}
////////////////////////////////////////////////////////////////////////////////
namespace {
template<typename T>
struct FloatLayout;
template<>
struct FloatLayout<float> {
static constexpr int exponent_bits = 8, mantissa_bits = 23;
static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
};
template<>
struct FloatLayout<double> {
static constexpr int exponent_bits = 11, mantissa_bits = 52;
static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
};
template<>
struct FloatLayout<__half> {
static constexpr int exponent_bits = 5, mantissa_bits = 10;
static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
};
#if RCCL_BFLOAT16 == 1
template<>
struct FloatLayout<hip_bfloat16> {
static constexpr int exponent_bits = 8, mantissa_bits = 7;
static constexpr int exponent_bias = (1<<(exponent_bits-1))-1;
};
#endif
template<typename T>
__host__ __device__ T makeFloat(int sign, int exp, uint64_t mant) {
union { T ans; uint64_t bits; };
bits = sign;
bits <<= FloatLayout<T>::exponent_bits;
bits |= exp;
bits <<= FloatLayout<T>::mantissa_bits;
bits |= mant;
return ans;
}
}
////////////////////////////////////////////////////////////////////////////////
namespace {
// High bits of multiplication are useful for generating bounded random values
// from unbounded random values. For instance, given X a totally random 32-bit
// integer, `umul32hi(X,n)` will be totally random within [0,n).
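// For example, umul32hi(0x80000000u, 10) == 5, since 0x80000000 is half of 2^32.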
__host__ __device__ uint64_t umul32hi(uint32_t a, uint32_t b) {
#if HIP_VERSION > 50200000
return __umulhi(a, b);
#else
return uint64_t(a)*b >> 32;
#endif
}
__host__ __device__ uint64_t umul64hi(uint64_t a, uint64_t b) {
#if HIP_VERSION > 50200000
return __umul64hi(a, b);
#else
return uint64_t(__uint128_t(a)*__uint128_t(b) >> 64);
#endif
}
__host__ __device__ int clz32(int x) {
#if HIP_VERSION > 50200000
return __clz(x);
#else
return x==0 ? 32 : __builtin_clz(x);
#endif
}
__host__ __device__ int clz64(long long x) {
#if HIP_VERSION > 50200000
return __clzll(x);
#else
return x==0 ? 64 : __builtin_clzll(x);
#endif
}
}
////////////////////////////////////////////////////////////////////////////////
namespace {
// Returns a wildly permuted rank index. Useful when we know we want exactly N
// random ranks to exhibit some behavior, we can just test if:
// `shuffleRank(rank_n, rank_me, rng) < N`. Note that rank_n > 0 must be true
// for well defined results. This mixes the bits of rng.
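// Illustrative: because the result is a permutation of 0..rank_n-1, exactly N of
// the rank_n ranks satisfy `shuffleRank(rank_n, rank_me, rng) < N`, provided every
// rank calls with the same rng value (each call also advances rng via mixBits).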
__host__ __device__ int shuffleRank(int rank_n, int rank_me, uint64_t &rng) {
uint32_t a = uint32_t(rng);
uint32_t b = uint32_t(rng>>32);
rng = mixBits(rng);
uint32_t r = rank_me;
// round down rank_n to largest pow2, then subtract 1
uint32_t n2 = (~uint32_t(0)>>1) >> clz32(rank_n);
// These are 1:1 functions modulo 2^n:
// f(x) = x*a + b : for odd a, any b
// f(x) = (x*x + x)/2
// So we apply both to the bottom n2+1 ranks, then rotate the top
// (rank_n-n2-1) to the bottom and apply both again.
if(r <= n2) {
// shuffle bottom n2+1 ranks
r = (r*(a|1) + b) & n2;
r = (r*r + r)/2 & n2;
// rotate top to bottom
r += rank_n - (n2+1);
}
else
r -= n2+1; // rotate top to bottom
if(r <= n2) {
// shuffle bottom n2+1 again
r = (r*(b|1) + a) & n2;
r = (r*r + r)/2 & n2;
}
return r;
}
}
namespace {
// Generate wild integers x and y such that if every rank submits its x into a
// summation the result will be y with y <= y_max. Ranks should be shuffled
// before calling.
template<typename Uint>
__host__ __device__ void genSumXY(
int rank_n, int rank_me, uint64_t &rng, Uint y_max, Uint &x, Uint &y,
bool avoid_y=false // if true then returned y will not equal given y
) {
static_assert(std::is_unsigned<Uint>::value, "Type must be unsigned integral.");
{ // Pick y as a random value in [y_max/2, y_max]
Uint d, y_min = (y_max+1)/2;
if(8*sizeof(Uint) > 32)
d = umul64hi(rng, y_max/2 + (avoid_y ? 0 : 1));
else
d = umul32hi(uint32_t(rng), y_max/2 + (avoid_y ? 0 : 1));
Uint y1 = (avoid_y ? y+1 : y_min) + d;
y = y1 - (avoid_y && (y1 < y_min || y_max < y1) ? y_max/2 : 0);
}
rng = mixBits(rng);
unsigned r = unsigned(rank_me);
unsigned rn = unsigned(rank_n);
// Partition our rn ranks into pn distinct subsets each of size rn/pn. If each
// rank submits 1+p (where p is the 0-based partition index) then the sum will be:
// (rn/pn) * pn*(pn+1)/2
// So set this equal to our desired sum y and solve for pn.
// (rn/pn) * pn*(pn+1)/2 = y
// rn*(pn+1)/2 = y
// pn = 2*(y/rn)-1
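// Worked example (illustrative): rn=8, y=20 gives pn = 2*(20/8)-1 = 3 and
// p_sum = 3*4/2 = 6, so s = min(y/p_sum, rn/pn) = min(3,2) = 2; ranks 0..5
// submit 1,1,2,2,3,3, rank 6 submits 0, and the last rank adds the
// discrepancy 20 - 2*6 = 8, making the eight contributions sum to exactly 20.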
Uint pn = rn == 1 ? 1 : 2*(y/rn) - 1;
// In the case where rn is huge (compared to y) use only one partition meaning
// that all rn ranks will submit 1 (since p=0).
pn = pn == 0 ? 1 : pn;
// Can't have more partitions than ranks.
pn = rn < pn ? rn : pn;
// Compute sum of contribution from pn partitions where each submits p+1.
Uint p_sum;
if(y_max <= ~uint32_t(0)>>1) // compile time known
p_sum = Uint(uint32_t(pn)*uint32_t(pn+1)/2);
else
p_sum = Uint(uint64_t(pn)*uint64_t(pn+1)/2);
// Let s be the number of ranks per partition. This is either rn/pn as we
// intended, or y/p_sum if that's smaller to prevent overshooting our target y.
uint32_t s = y/p_sum < rn/pn ? y/p_sum : rn/pn;
x = (s != 0 && r/s < pn) ? 1 + r/s : 0; // First s*pn ranks contribute partition index +1.
x += r == rn-1 ? y - s*p_sum : 0; // Last rank contributes discrepancy.
}
}
namespace {
template<typename T>
__host__ __device__ T genInOutFloatSum(
bool input_not_output, int rank_n, int rank_me, uint64_t seed, intptr_t index,
bool same_sign
) {
constexpr int exp_lo = 1 + FloatLayout<T>::mantissa_bits;
constexpr int exp_hi = (1<<FloatLayout<T>::exponent_bits)-1;
using uintmant_t = typename std::conditional<(8*sizeof(T) > 32), uint64_t, uint32_t>::type;
constexpr uintmant_t mant_mask = (uintmant_t(1) << FloatLayout<T>::mantissa_bits)-1;
constexpr uintmant_t max_mant = 2*mant_mask + 1; // add implicit leading 1
uint64_t rng = hashOf(seed, index);
int y_sign = rng & 1;
int x_sign = y_sign;
int xy_exp = exp_lo + umul32hi(uint32_t(rng>>32), exp_hi-exp_lo);
rng = mixBits(rng);
rank_me = shuffleRank(rank_n, rank_me, rng);
// If we're using mixed signs then partition into evens and odds.
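// (Illustrative: the even half of the shuffled ranks jointly sums to some y0 and
// the odd half, whose signs are flipped below, to a different y1; the expected
// result is then the signed difference y0 - y1, and the avoid_y flag in the
// second genSumXY call guarantees y1 != y0, so it can never be exactly zero.)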
int subrank_n = same_sign ? rank_n : (rank_n+1)/2;
int subrank_me = same_sign ? rank_me : rank_me/2;
uintmant_t x0_mant, y0_mant;
genSumXY(subrank_n, subrank_me, rng, max_mant, x0_mant, y0_mant);
if (!same_sign && (rank_n+0)/2 != 0) {
uintmant_t x1_mant, y1_mant = y0_mant;
// Avoid generating y1_mant == y0_mant so we don't have to worry about
// signed zero as the result.
genSumXY((rank_n+0)/2, rank_me/2, rng, max_mant, x1_mant, y1_mant, /*avoid_y=*/true);
y_sign ^= y0_mant < y1_mant ? 1 : 0;
y0_mant = (y0_mant < y1_mant ? -1 : 1)*(y0_mant - y1_mant);
x_sign ^= rank_me%2;
x0_mant = rank_me%2 == 0 ? x0_mant : x1_mant;
}
uintmant_t ans_mant = input_not_output ? x0_mant : y0_mant;
if(ans_mant == 0)
return T(0.0f);
else {
int shift = clz64(ans_mant) - (64-FloatLayout<T>::mantissa_bits-1);
int ans_sign = input_not_output ? x_sign : y_sign;
int ans_exp = xy_exp - shift;
ans_mant <<= shift;
return makeFloat<T>(ans_sign, ans_exp, ans_mant & mant_mask);
}
}
}
namespace {
template<typename T>
__host__ __device__ T genInOutFloatPreMulSum(
bool input_not_output, int rank_n, int rank_me, uint64_t seed, intptr_t index
) {
constexpr int exp_lo = 1 + FloatLayout<T>::mantissa_bits;
constexpr int exp_hi = (1<<FloatLayout<T>::exponent_bits)-1;
using uintmant_t = typename std::conditional<(8*sizeof(T) > 32), uint64_t, uint32_t>::type;
constexpr uintmant_t mant_mask = (uintmant_t(1) << FloatLayout<T>::mantissa_bits)-1;
constexpr uintmant_t max_mant = 2*mant_mask + 1; // add implicit leading 1
uint64_t rng = hashOf(seed, index);
int y_sign = rng & 1;
int y_exp = exp_lo + umul32hi(uint32_t(rng>>32), exp_hi-exp_lo);
rng = mixBits(rng);
int subrank_me0 = shuffleRank((rank_n+1)/2, rank_me/2, rng);
int subrank_me1 = shuffleRank((rank_n+0)/2, rank_me/2, rng);
// when ncclVerifiablePremulScalar() = 1.0 (rank_me%2 == 0)
uintmant_t x0_mant, y0_mant;
genSumXY((rank_n+1)/2, subrank_me0, rng, max_mant>>1, x0_mant, y0_mant);
// when ncclVerifiablePremulScalar() = 2.0 (rank_me%2 == 1)
uintmant_t x1_mant=0, y1_mant=0;
if((rank_n+0)/2 != 0)
genSumXY((rank_n+0)/2, subrank_me1, rng, max_mant>>2, x1_mant, y1_mant);
uintmant_t x_mant = rank_me%2 == 0 ? x0_mant : x1_mant;
uintmant_t y_mant = y0_mant + 2*y1_mant;
uintmant_t ans_mant = input_not_output ? x_mant : y_mant;
if(ans_mant == 0)
return T(0.0f);
else {
int shift = clz64(ans_mant) - (64-FloatLayout<T>::mantissa_bits-1);
int ans_sign = y_sign;
int ans_exp = y_exp - shift;
ans_mant <<= shift;
return makeFloat<T>(ans_sign, ans_exp, ans_mant & mant_mask);
}
}
}
namespace {
template<typename T>
__host__ __device__ T genInOutFloatProd(
bool input_not_output, int rank_n, int rank_me, uint64_t seed, intptr_t index
) {
// Three kinds of contributions (values for x):
// 1) x = random value: only one rank does this
// 2) x = 2^n: random positive n
// 3) x = 1
// Since only one rank submits a random value, the result of the product
// will have the same mantissa as that value but with an exponent incorporating
// the sum of the exponents from case (2)
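// Illustrative composition for 4 ranks: inputs are (+-)m*2^e0, (+-)2^e1, (+-)2^e2,
// (+-)2^e3 where m is the random mantissa held by the shuffled rank 0 and genSumXY
// arranges e0+e1+e2+e3 to hit the target exponent, so the exact product is
// (+-)m*2^target and genOutput can reproduce it without multiplying anything.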
uint64_t rng = hashOf(seed, index);
rank_me = shuffleRank(rank_n, rank_me, rng);
int y_sign = (rank_n/2)%2;
int x_sign = rank_me%2;
constexpr unsigned max_exp = -1 + (1<<(FloatLayout<T>::exponent_bits-1));
unsigned x_exp=0, y_exp=0;
genSumXY(rank_n, rank_me, rng, max_exp, x_exp, y_exp);
x_exp += FloatLayout<T>::exponent_bias;
y_exp += FloatLayout<T>::exponent_bias;
constexpr uint64_t mant_mask = (uint64_t(1)<<FloatLayout<T>::mantissa_bits)-1;
uint64_t y_mant = rng & mant_mask;
if (y_mant == 0) y_mant = 1;
return makeFloat<T>(
input_not_output ? x_sign : y_sign,
input_not_output ? x_exp : y_exp,
!input_not_output || rank_me==0 ? y_mant : 0
);
}
}
////////////////////////////////////////////////////////////////////////////////
// What follows is lots of overloads for genInput/genOutput to generate data
namespace {
// General case for integral data for all ops but ReduceNil/premulsum
template<typename T, typename ReduceFn,
typename = typename std::enable_if<
!std::is_same<ReduceFn, ReduceNil>::value
>::type>
__host__ __device__ void genInput(
T &ans, ReduceFn, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::true_type /*integral*/
) {
(void)rank_n; // silence unused warnings
union { uint64_t bits; T tmp; };
bits = uint64_t(-1)>>(64 - 8*sizeof(T));
bits &= hashOf(index ^ index<<16 ^ rank_me, seed);
// make sure we never return 0 in products
ans = std::is_same<ReduceFn, ReduceProd>::value && bits == 0 ? T(1) : tmp;
}
}
////////////////////////////////////////////////////////////////////////////////
// Dumb/generic case for genOutput just reduces results of genInput
namespace {
template<typename T, typename ReduceFn, bool IsIntegral>
__host__ __device__ void genOutput(
T &ans, ReduceFn op, int rank_n, uint64_t seed, intptr_t index,
std::integral_constant<bool, IsIntegral>
) {
T acc = genInput<T>(op, rank_n, 0, seed, index);
acc = op.preOp(acc, 0);
for(int r=1; r < rank_n; r++)
acc = op(acc, op.preOp(genInput<T>(op, rank_n, r, seed, index), r));
ans = op.postOp(acc);
}
}
////////////////////////////////////////////////////////////////////////////////
// Nil reduction (byte copy functions). Optimized to assume rank_n=1
namespace {
template<typename T, bool IsIntegral>
__host__ __device__ void genInput(
T &ans, ReduceNil, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::integral_constant<bool, IsIntegral>
) {
(void)rank_n, (void)rank_me; // silence unused warnings
union { uint64_t bits; T tmp; };
bits = mixBits(seed ^ index);
bits >>= 64 - 8*sizeof(T);
bits &= uint64_t(-1)>>(64 - 8*sizeof(T));
ans = tmp;
}
template<typename T, typename ReduceFn, bool IsIntegral>
__host__ __device__ void genOutput(
T &ans, ReduceNil op, int rank_n, uint64_t seed, intptr_t index,
std::integral_constant<bool, IsIntegral>
) {
ans = genInput<T>(op, rank_n, 0, seed, index);
}
}
////////////////////////////////////////////////////////////////////////////////
// Sum of float
namespace {
template<typename T>
__host__ __device__ void genInput(
T &ans, ReduceSum, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatSum<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index, /*same_sign=*/false);
}
template<typename T>
__host__ __device__ void genOutput(
T &ans, ReduceSum, int rank_n, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatSum<T>(/*input_not_output=*/false, rank_n, 0, seed, index, /*same_sign=*/false);
}
}
////////////////////////////////////////////////////////////////////////////////
// Product of float
namespace {
template<typename T>
__host__ __device__ void genInput(
T &ans, ReduceProd, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatProd<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index);
}
template<typename T>
__host__ __device__ void genOutput(
T &ans, ReduceProd, int rank_n, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatProd<T>(/*input_not_output=*/false, rank_n, 0, seed, index);
}
}
////////////////////////////////////////////////////////////////////////////////
// PreMulSum of int/float
namespace {
template<typename T>
__host__ __device__ void genInput(
T &ans, ReducePreMulSum, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::true_type integral
) {
genInput(ans, ReduceSum(), rank_n, rank_me, seed, index, integral);
}
// No genOutput overload specific to premulsum(int), just use generic case.
template<typename T>
__host__ __device__ void genInput(
T &ans, ReducePreMulSum, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatPreMulSum<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index);
}
template<typename T>
__host__ __device__ void genOutput(
T &ans, ReducePreMulSum, int rank_n, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatPreMulSum<T>(/*input_not_output=*/false, rank_n, 0, seed, index);
}
}
/////////////////////////////////////////////////////////////////////////////////
// Average of float
namespace {
template<typename T>
__host__ __device__ void genInput(
T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatSum<T>(/*input_not_output=*/true, rank_n, rank_me, seed, index, /*same_sign=*/true);
}
template<typename T>
__host__ __device__ void genOutput(
T &ans, ReduceAvg, int rank_n, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
ans = genInOutFloatSum<T>(/*input_not_output=*/false, rank_n, 0, seed, index, /*same_sign=*/true);
using T1 = typename std::conditional<(sizeof(T)<sizeof(double)), float, double>::type;
//ans = ReduceProd()(ans, T1(1)/T1(rank_n));
ans = ReduceProd()(ans, inhibit(castTo<T>(T1(1)/T1(rank_n))));
}
}
/////////////////////////////////////////////////////////////////////////////////
// min/max of float
namespace {
template<typename T>
__host__ __device__ void genInput(
T &ans, ReduceMin, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::false_type integral
) {
genInput<T>(ans, ReduceMax(), rank_n, rank_me, seed, index, integral);
}
template<typename T>
__host__ __device__ void genInput(
T &ans, ReduceMax, int rank_n, int rank_me, uint64_t seed, intptr_t index,
std::false_type /*integral*/
) {
(void)rank_n; // silence unused warnings
constexpr uint64_t mant_mask = (uint64_t(1) << FloatLayout<T>::mantissa_bits)-1;
uint64_t rng = hashOf(index ^ index<<16 ^ rank_me, seed);
int sign = rng & 1;
rng ^= rng>>1;
int exp = rng & ((1<<(FloatLayout<T>::exponent_bits-1))-1);
exp += 1<<(FloatLayout<T>::exponent_bits-2);
rng ^= rng >> FloatLayout<T>::exponent_bits;
uint64_t mant = rng & mant_mask;
ans = makeFloat<T>(sign, exp, mant);
}
// No genOutput overload specific to floating point min/max, just use generic case.
}
///////////////////////////////////////////////////////////////////////////////
// Entry API for genInput/genOutput
namespace {
template<typename T, typename ReduceFn>
__host__ __device__ T genInput(
ReduceFn op, int rank_n, int rank_me, uint64_t seed, intptr_t index
) {
T ans;
genInput(ans, op, rank_n, rank_me, seed, index,
std::integral_constant<bool, IsIntegral<T>::value>());
return ans;
}
template<typename T, typename ReduceFn>
__host__ __device__ T genOutput(
ReduceFn op, int rank_n, uint64_t seed, intptr_t index
) {
T ans;
genOutput(ans, op, rank_n, seed, index,
std::integral_constant<bool, IsIntegral<T>::value>());
return ans;
}
}
////////////////////////////////////////////////////////////////////////////////
#if !SELF_TEST
namespace {
template<typename T, typename ReduceFn>
__global__ void prepareInput2(
T *elts, intptr_t elt_n, ReduceFn op, int rank_n, int rank_me,
uint64_t seed, intptr_t elt_ix0
) {
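// Split the elt_n elements evenly across gridDim.x blocks: the first
// elt_n % gridDim.x blocks take one extra element, and each thread then strides
// through its block's [i0, i1) slice by blockDim.x.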
intptr_t i0 = blockIdx.x*(elt_n/gridDim.x);
i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x;
intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x);
i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x;
intptr_t i = i0 + threadIdx.x;
while(i < i1) {
elts[i] = genInput<T>(op, rank_n, rank_me, seed, elt_ix0+i);
#if 0
T output = genOutput<T>(op, rank_n, seed, elt_ix0+i);
printf("prepareInput2 T=%d seed=0x%llx r=%d ix=%lld x=%g output=%g elts=%p\n",
std::is_same<T,int>::value, (long long)seed, int(rank_me), (long long)i, (float)elts[i], (float)output, elts);
#endif
i += blockDim.x;
}
}
template<typename ReduceOp>
void prepareInput1(
void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n, int rank_me,
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
) {
int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
#define CASE_TY(T) prepareInput2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, rank_me, seed, elt_ix0); break;
switch(elt_ty) {
case ncclInt8: CASE_TY(int8_t)
case ncclUint8: CASE_TY(uint8_t)
case ncclInt32: CASE_TY(int32_t)
case ncclUint32: CASE_TY(uint32_t)
case ncclInt64: CASE_TY(int64_t)
case ncclUint64: CASE_TY(uint64_t)
case ncclFloat16: CASE_TY(__half)
#if HAVE_ncclBfloat16
case ncclBfloat16: CASE_TY(hip_bfloat16)
#endif
case ncclFloat32: CASE_TY(float)
case ncclFloat64: CASE_TY(double)
default: assert(0);
}
#undef CASE_TY
}
}
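// Illustrative usage sketch from a test harness (assumed caller, not defined here):
//   ncclVerifiablePrepareInput(sendbuf, count, ncclFloat32, ncclSum, nranks, rank, seed, 0, stream);
//   /* ... run the collective, e.g. ncclAllReduce ... */
//   ncclVerifiableVerify(recvbuf, /*expected=*/nullptr, count, ncclFloat32, ncclSum,
//                        nranks, seed, 0, bad_count, stream);
// where bad_count is an int64_t* visible to both host and device (e.g. from
// hipHostMalloc): ncclVerifiableVerify zeroes it on the host and the kernels
// atomicAdd the number of mismatching elements into it.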
void ncclVerifiablePrepareInput(
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
) {
#define CASE_OP(op) \
if(rank_n == 1) \
prepareInput1(elts, elt_n, elt_ty, ReduceNil(), rank_n, rank_me, seed, elt_ix0, stream); \
else \
prepareInput1(elts, elt_n, elt_ty, op, rank_n, rank_me, seed, elt_ix0, stream); \
break;
switch(red_op) {
case ncclSum: CASE_OP(ReduceSum())
case ncclMin: CASE_OP(ReduceMin())
case ncclMax: CASE_OP(ReduceMax())
case ncclProd: CASE_OP(ReduceProd())
#if HAVE_ncclAvg
case ncclAvg: CASE_OP(ReduceAvg{rank_n})
#endif
#if HAVE_ncclPreMulSum
default: CASE_OP(ReducePreMulSum())
#endif
}
#undef CASE_OP
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if !SELF_TEST
namespace {
template<typename T, typename ReduceFn>
__global__ void prepareExpected2(
T *elts, intptr_t elt_n, ReduceFn op, int rank_n,
uint64_t seed, intptr_t elt_ix0
) {
intptr_t i0 = blockIdx.x*(elt_n/gridDim.x);
i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x;
intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x);
i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x;
intptr_t i = i0 + threadIdx.x;
while(i < i1) {
elts[i] = genOutput<T>(op, rank_n, seed, elt_ix0+i);
#if 0
printf("prepareExpected2 seed=0x%llx ix=%lld x=%g elts=%p\n",
(long long)seed, (long long)(elt_ix0+i), (float)elts[i], elts);
#endif
i += blockDim.x;
}
}
template<typename ReduceOp>
void prepareExpected1(
void *elts, intptr_t elt_n, int elt_ty, ReduceOp op, int rank_n,
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
) {
int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
#define CASE_TY(T) prepareExpected2<<<block_n, 512, 0, stream>>>((T*)elts, elt_n, op, rank_n, seed, elt_ix0); break;
switch(elt_ty) {
case ncclInt8: CASE_TY(int8_t)
case ncclUint8: CASE_TY(uint8_t)
case ncclInt32: CASE_TY(int32_t)
case ncclUint32: CASE_TY(uint32_t)
case ncclInt64: CASE_TY(int64_t)
case ncclUint64: CASE_TY(uint64_t)
case ncclFloat16: CASE_TY(__half)
#if HAVE_ncclBfloat16
case ncclBfloat16: CASE_TY(hip_bfloat16)
#endif
case ncclFloat32: CASE_TY(float)
case ncclFloat64: CASE_TY(double)
default: assert(0);
}
#undef CASE_TY
}
}
void ncclVerifiablePrepareExpected(
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
) {
#define CASE_OP(op) \
if(rank_n == 1) \
prepareExpected1(elts, elt_n, elt_ty, ReduceNil(), rank_n, seed, elt_ix0, stream); \
else \
prepareExpected1(elts, elt_n, elt_ty, op, rank_n, seed, elt_ix0, stream); \
break;
switch(red_op) {
case ncclSum: CASE_OP(ReduceSum())
case ncclMin: CASE_OP(ReduceMin())
case ncclMax: CASE_OP(ReduceMax())
case ncclProd: CASE_OP(ReduceProd())
#if HAVE_ncclAvg
case ncclAvg: CASE_OP(ReduceAvg{rank_n})
#endif
#if HAVE_ncclPreMulSum
default: CASE_OP(ReducePreMulSum())
#endif
}
#undef CASE_OP
}
#endif
////////////////////////////////////////////////////////////////////////////////
namespace {
/* How we compare floating point values when exactness is impossible is interesting.
* First, we take note that simply reinterpreting integer bits as floating point
* gives us a monotonic mapping which exponentially spaces out floats. Thus
* consecutive integers encode consecutive floats. In general, using integer
* subtraction on the bitpatterns of two floats gives us an integer which is the
* logarithm of their relative difference. But, if the floats always have similar
* exponents, then the integer difference is actually proportional to the
* relative error (this is because we are counting hops in the mantissa bits only,
* not the exponent bits). So a cheap way to compare if two floats are relatively
* close is: abs(intBits(a) - intBits(b)) <= tolerance. The following formula
* calculates such a tolerance for a summation of n floats. This formula
* was derived by inspecting the maximum observed integer difference over many
* random runs of summation. The parameter values were computed by the
* companion program "inexact_regress.cu".
*/
__host__ __device__ unsigned calcSumFloatTolerance(int rank_n, int elt_ty) {
float power, coef;
switch(elt_ty) {
case ncclFloat32:
case ncclFloat64:
power = .51f;
coef = 1.25f;
break;
case ncclFloat16:
power = .91f;
coef = .75f;
break;
#if HAVE_ncclBfloat16
case ncclBfloat16:
power = .91f;
coef = .66f;
break;
#endif
}
#if __CUDA_ARCH__
return 1 + unsigned(coef*powf(float(rank_n), power));
#else
return 1 + unsigned(coef*std::pow(float(rank_n), power));
#endif
}
template<typename T>
__host__ __device__ uint64_t calcDelta(T a, T b) {
union { T t; uint8_t i1; uint16_t i2; uint32_t i4; uint64_t i8; } x, y;
x.t = a;
y.t = b;
switch(sizeof(T)) {
case 1: return x.i1 < y.i1 ? y.i1 - x.i1 : x.i1 - y.i1;
case 2: return x.i2 < y.i2 ? y.i2 - x.i2 : x.i2 - y.i2;
case 4: return x.i4 < y.i4 ? y.i4 - x.i4 : x.i4 - y.i4;
default: return x.i8 < y.i8 ? y.i8 - x.i8 : x.i8 - y.i8;
}
}
}
////////////////////////////////////////////////////////////////////////////////
#if !SELF_TEST
namespace {
template<typename T>
__global__ void verifyPrepared(
T const *results, T const *expected, intptr_t elt_n, unsigned tolerance, int64_t *bad_elt_n
) {
intptr_t i0 = blockIdx.x*(elt_n/gridDim.x);
i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x;
intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x);
i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x;
intptr_t i = i0 + threadIdx.x;
int64_t bad = 0;
while(i < i1) {
T a = results[i], b = expected[i];
T delta = a < b ? b - a : a - b;
bad += tolerance < delta ? 1 : 0;
#if 0
if(tolerance < delta) {
printf("verifyPrepared ix=%lld got=%g exp=%g\n", (long long)i, (float)results[i], (float)expected[i]);
}
#endif
i += blockDim.x;
}
//asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
atomicAdd((unsigned long *)bad_elt_n, (unsigned long)bad);
}
template<typename T, typename Uint, typename ReduceFn>
__global__ void verifyInline2(
T const *results, intptr_t elt_n, ReduceFn op, int rank_n, uint64_t seed,
intptr_t elt_ix0, unsigned tolerance, int64_t *bad_elt_n
) {
intptr_t i0 = blockIdx.x*(elt_n/gridDim.x);
i0 += blockIdx.x < elt_n%gridDim.x ? blockIdx.x : elt_n%gridDim.x;
intptr_t i1 = (blockIdx.x+1)*(elt_n/gridDim.x);
i1 += blockIdx.x+1 < elt_n%gridDim.x ? blockIdx.x+1 : elt_n%gridDim.x;
intptr_t i = i0 + threadIdx.x;
int64_t bad = 0;
while(i < i1) {
union { T t; Uint u; } a, b;
a.t = results[i];
b.t = genOutput<T>(op, rank_n, seed, elt_ix0+i);
Uint delta = a.u < b.u ? b.u - a.u : a.u - b.u;
bad += tolerance < delta ? 1 : 0;
#if 0
T input = genInput<T>(op, rank_n, 0, seed, elt_ix0+i);
if(tolerance < delta) {
printf("verifyInline2 fail T=%d ix=%lld got=%g exp=%g input=%g\n",
std::is_same<T,int>::value, (long long)i, (float)a.t, (float)b.t, (float)input);
} else {
printf("verifyInline2 pass T=%d ix=%lld got=%g exp=%g input=%g\n",
std::is_same<T,int>::value, (long long)i, (float)a.t, (float)b.t, (float)input);
}
#endif
i += blockDim.x;
}
//asm volatile("red.global.add.u64 [%0],%1;" :: "l"(bad_elt_n), "l"(bad));
atomicAdd((unsigned long*)bad_elt_n, (unsigned long)bad);
}
template<typename T, typename Uint>
void verifyInline1(
T const *results, intptr_t elt_n, int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
unsigned tolerance, int64_t *bad_elt_n, hipStream_t stream, int block_n
) {
#define CASE_OP(op) \
if(rank_n == 1) \
verifyInline2<T, Uint><<<block_n, 512, 0, stream>>> \
((T const*)results, elt_n, ReduceNil(), rank_n, seed, elt_ix0, tolerance, bad_elt_n); \
else \
verifyInline2<T, Uint><<<block_n, 512, 0, stream>>> \
((T const*)results, elt_n, op, rank_n, seed, elt_ix0, tolerance, bad_elt_n); \
break;
switch(red_op) {
case ncclSum: CASE_OP(ReduceSum())
case ncclMin: CASE_OP(ReduceMin())
case ncclMax: CASE_OP(ReduceMax())
case ncclProd: CASE_OP(ReduceProd())
#if HAVE_ncclAvg
case ncclAvg: CASE_OP(ReduceAvg{rank_n})
#endif
#if HAVE_ncclPreMulSum
default: CASE_OP(ReducePreMulSum())
#endif
}
#undef CASE_OP
}
}
void ncclVerifiableVerify(
void const *results, void const *expected, intptr_t elt_n, int elt_ty,
int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
int64_t *bad_elt_n, hipStream_t stream
) {
bool floating = elt_ty == ncclFloat16 || elt_ty == ncclFloat32 || elt_ty == ncclFloat64;
#if HAVE_ncclBfloat16
floating |= elt_ty == ncclBfloat16;
#endif
unsigned tolerance = 0;
#if HAVE_ncclAvg
if (floating && red_op == ncclAvg)
tolerance = calcSumFloatTolerance(rank_n, elt_ty);
#endif
int block_n = std::min<intptr_t>(32, (elt_n + 4*512-1)/(4*512));
*bad_elt_n = 0;
#define CASE_TY(T, Uint) { \
if(expected != nullptr) { \
verifyPrepared<<<block_n, 512, 0, stream>>>((Uint const*)results, (Uint const*)expected, elt_n, tolerance, bad_elt_n); \
} else { \
verifyInline1<T, Uint>((T const*)results, elt_n, red_op, rank_n, seed, elt_ix0, tolerance, bad_elt_n, stream, block_n); \
} \
} break;
switch(elt_ty) {
case ncclInt8: CASE_TY(int8_t, uint8_t)
case ncclUint8: CASE_TY(uint8_t, uint8_t)
case ncclInt32: CASE_TY(int32_t, uint32_t)
case ncclUint32: CASE_TY(uint32_t, uint32_t)
case ncclInt64: CASE_TY(int64_t, uint64_t)
case ncclUint64: CASE_TY(uint64_t, uint64_t)
case ncclFloat16: CASE_TY(__half, uint16_t)
#if HAVE_ncclBfloat16
case ncclBfloat16: CASE_TY(hip_bfloat16, uint16_t)
#endif
case ncclFloat32: CASE_TY(float, uint32_t)
case ncclFloat64: CASE_TY(double, uint64_t)
default: assert(0);
}
#undef CASE_TY
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if SELF_TEST
#include <iostream>
template<typename T, typename Op>
__device__ void sweep2(int ty, char const *tyname, Op op, char const *opname, int rank_n) {
//if(!std::is_same<T,half>::value) return;
//if(!std::is_same<Op,ReduceProd>::value) return;
//if(rank_n!=3) return;
unsigned tolerance = !IsIntegral<T>::value && std::is_same<Op,ReduceAvg>::value ? calcSumFloatTolerance(rank_n, ty) : 0;
uint64_t seed = 0xc8e2bed69766d533;
for(int ix=threadIdx.x; ix < 10000; ix+=blockDim.x) {
//if(ix!=387) continue;
T y = genOutput<T>(op, rank_n, seed, ix);
T sum;
for(int r=0; r < rank_n; r++) {
T x = genInput<T>(op, rank_n, r, seed, ix);
x = op.preOp(x, r);
sum = r==0 ? x : op(sum, inhibit(x));
//std::printf("x = %llx, sum = %llx\n", bitsOf(x), bitsOf(sum));
}
sum = op.postOp(sum);
if(tolerance < calcDelta(sum, y)) {
printf(
//"%10g != %10g : T=%-8s op=%-9s rank_n=%-1d ix=%-1d\n",
"%llx != %llx : T=%-8s op=%-9s rank_n=%-1d ix=%-1d\n",
*(long long*)&sum, *(long long*)&y, tyname, opname, rank_n, ix
);
}
}
}
template<typename T>
__device__ void sweep1(int ty, char const *tyname) {
for(int i=0; i < 10; i++) {
int rank_n = (1<<i) + i;
sweep2<T>(ty, tyname, ReduceSum(), "sum", rank_n);
sweep2<T>(ty, tyname, ReduceProd(), "prod", rank_n);
sweep2<T>(ty, tyname, ReduceMin(), "min", rank_n);
sweep2<T>(ty, tyname, ReduceMax(), "max", rank_n);
sweep2<T>(ty, tyname, ReducePreMulSum(), "premulsum", rank_n);
sweep2<T>(ty, tyname, ReduceAvg{rank_n}, "avg", rank_n);
}
}
__global__ void sweep() {
sweep1<int8_t>(ncclInt8, "int8");
sweep1<uint8_t>(ncclUint8, "uint8");
sweep1<int32_t>(ncclInt32, "int32");
sweep1<uint32_t>(ncclUint32, "uint32");
sweep1<int64_t>(ncclInt64, "int64");
sweep1<uint64_t>(ncclUint64, "uint64");
sweep1<__half>(ncclFloat16, "half");
#if HAVE_ncclBfloat16
sweep1<hip_bfloat16>(ncclBfloat16, "bfloat16");
#endif
sweep1<float>(ncclFloat32, "float");
sweep1<double>(ncclFloat64, "double");
}
int main(int arg_n, char **args) {
std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
hipSetDevice(0);
sweep<<<1,512>>>();
hipDeviceSynchronize();
return 0;
}
#endif