Commit d8ca0a9e authored by jerrrrry's avatar jerrrrry
Browse files

Initial commit

parents
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "hip/hip_runtime.h"
#include "common.h"
/* Per-rank element accounting for Gather: every rank contributes count/nranks
 * elements and the root collects that contribution from each rank. */
void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  const size_t perRank = count / nranks;
  *sendcount = perRank;              // what each rank ships
  *recvcount = perRank * nranks;     // root receives one chunk per rank
  *sendInplaceOffset = perRank;      // in place, rank r sends from its own slot
  *recvInplaceOffset = 0;
  *paramcount = perRank;             // count handed to the collective call
}
// Initializes send data and the expected result for the Gather test.
// Rank r's payload is InitData at element offset r*sendcount; the root's
// expected buffer is therefore the rank-ordered concatenation of all payloads.
// Non-root ranks expect their recv buffer untouched (zeroed, captured via the
// hipMemcpy below).
testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
  size_t sendcount = args->sendBytes / wordSize(type);
  size_t recvcount = args->expectedBytes / wordSize(type);
  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
  int k=0;
  for (int i=0; i<args->nGpus; i++) {
    HIPCHECK(hipSetDevice(args->gpus[i]));
    for (int l=0; l<args->nRanks; l++) {
      // Global rank of local slot (proc, thread, gpu i, local rank l).
      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
      // In place, rank r sends from its own slot inside the recv buffer.
      void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
      TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
      // Snapshot the (zeroed / in-place-seeded) recv buffer as the baseline.
      HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
      if (rank == root) {
        // Rank j's chunk sits at element offset j*sendcount, so the whole
        // expected buffer is one contiguous InitData over nranks*sendcount
        // elements starting at offset 0. The previous code looped j=0..nranks-1
        // re-running this identical full-buffer call every iteration.
        TESTCHECK(InitData(((char*)args->expected[k]), nranks*sendcount, 0, type, ncclSum, rep, 1, 0));
      }
      k++;
    }
    HIPCHECK(hipDeviceSynchronize());
  }
  return testSuccess;
}
// Bandwidth model for Gather. algBw counts all bytes landing at the root;
// busBw discounts the root's own chunk with the usual (n-1)/n factor.
void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  const double base = (double)(count * nranks * typesize) / 1.0E9 / sec;
  *algBw = base;
  const double scale = ((double)(nranks-1))/((double)(nranks));
  *busBw = base * scale;
}
// Launches one Gather. RCCL has no native gather primitive, so it is built
// from point-to-point calls inside one group: every rank sends its chunk to
// root; root posts a matching receive from each rank into contiguous slots.
testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  int nRanks;
  NCCLCHECK(ncclCommCount(comm, &nRanks));
  int rank;
  NCCLCHECK(ncclCommUserRank(comm, &rank));
  // Byte stride of one rank's chunk in recvbuff.
  size_t rankOffset = count * wordSize(type);
  // Nothing to exchange for a zero-element chunk.
  if (count == 0) return testSuccess;
  NCCLCHECK(ncclGroupStart());
  NCCLCHECK(ncclSend(sendbuff, count, type, root, comm, stream));
  if (rank == root) {
    for (int r=0; r<nRanks; r++) {
      // Rank r's contribution lands at byte offset r*rankOffset.
      NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, count, type, r, comm, stream));
    }
  }
  NCCLCHECK(ncclGroupEnd());
  return testSuccess;
}
// vtable of Gather hooks consumed by the common test driver.
struct testColl gatherTest = {
  "Gather",
  GatherGetCollByteCount,
  GatherInitData,
  GatherGetBw,
  GatherRunColl
};

// Driver hook: only the send/recv counts are needed; scratch outputs discarded.
void GatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
  GatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}

// Runs the Gather test for the requested (or all) data types and roots.
// op/opName are unused: gather performs no reduction.
testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &gatherTest;
  ncclDataType_t *run_types;
  const char **run_typenames;
  int type_count;
  int begin_root, end_root;
  // type == -1 means "sweep every registered type".
  if ((int)type != -1) {
    type_count = 1;
    run_types = &type;
    run_typenames = &typeName;
  } else {
    type_count = test_typenum;
    run_types = test_types;
    run_typenames = test_typenames;
  }
  if (root != -1) {
    begin_root = end_root = root;
  } else {
    // NOTE(review): sweep bound omits args->nRanks, unlike the global-rank
    // formula in GatherInitData — confirm intended for multi-rank-per-GPU runs.
    begin_root = 0;
    end_root = args->nProcs*args->nThreads*args->nGpus-1;
  }
  for (int i=0; i<type_count; i++) {
    for (int j=begin_root; j<=end_root; j++) {
      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
    }
  }
  return testSuccess;
}

// Exported engine symbol the test harness links against.
struct testEngine ncclTestEngine = {
  GatherGetBuffSize,
  GatherRunTest
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "hip/hip_runtime.h"
#include "common.h"
#define ALIGN 4
/* AllGather-style accounting for the hypercube test: per-rank chunk is the
 * largest ALIGN-multiple fitting count/nranks; the result holds all chunks. */
void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  const size_t chunk = (count/(ALIGN*nranks))*ALIGN;  // aligned per-rank count
  *sendcount = chunk;
  *recvcount = chunk*nranks;          // everyone ends up with all chunks
  *sendInplaceOffset = chunk;         // in place, rank r sends from slot r
  *recvInplaceOffset = 0;
  *paramcount = chunk;
}
// Initializes send data and the expected AllGather result for the hypercube
// test. Rank r's payload is seeded with 33*rep + r, so the expected buffer on
// every rank is each rank's seeded chunk laid out in rank order.
testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
  size_t sendcount = args->sendBytes / wordSize(type);
  size_t recvcount = args->expectedBytes / wordSize(type);  // computed but unused below
  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
  int k=0;
  for (int i=0; i<args->nGpus; i++) {
    HIPCHECK(hipSetDevice(args->gpus[i]));
    for (int l=0; l<args->nRanks; l++) {
      // Global rank of local slot (proc, thread, gpu i, local rank l).
      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
      // In place, rank r sends from its own slot inside the recv buffer.
      void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
      TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
      // Expected: chunk j carries rank j's seed, at byte offset j*sendBytes.
      for (int j=0; j<nranks; j++) {
        TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
      }
      k++;
    }
    HIPCHECK(hipDeviceSynchronize());
  }
  return testSuccess;
}
// Bandwidth model for the hypercube AllGather: each rank receives (nranks-1)
// foreign chunks; bus bandwidth equals algorithm bandwidth (factor 1).
void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  const double bw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec;
  *algBw = bw;
  *busBw = bw;
}
// AllGather over a hypercube exchange in log2(nRanks) steps. At step `mask`,
// each rank owns the contiguous block of `mask` chunks starting at
// rank & ~(mask-1); it swaps that block with partner rank^mask, which owns the
// adjacent block starting at s^mask. Requires nRanks to be a power of two
// (enforced by HyperCubeRunTest).
testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  char* sbuff = (char*)sendbuff;
  char* rbuff = (char*)recvbuff;
  int nRanks;
  NCCLCHECK(ncclCommCount(comm, &nRanks));
  int rank;
  NCCLCHECK(ncclCommUserRank(comm, &rank));
  size_t rankSize = count * wordSize(type);
  // Seed own slot in recvbuff (skip when already in place).
  if (rbuff+rank*rankSize != sbuff) HIPCHECK(hipMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, hipMemcpyDeviceToDevice, stream));
  // Hypercube AllGather
  for (int mask=1; mask<nRanks; mask<<=1) {
    NCCLCHECK(ncclGroupStart());
    int s = rank & ~(mask-1);  // start of the block this rank currently owns
    int r = s ^ mask;          // start of the block the partner owns
    NCCLCHECK(ncclSend(rbuff+s*rankSize, count*mask, type, rank^mask, comm, stream));
    NCCLCHECK(ncclRecv(rbuff+r*rankSize, count*mask, type, rank^mask, comm, stream));
    NCCLCHECK(ncclGroupEnd());
  }
  return testSuccess;
}
// vtable of HyperCube hooks consumed by the common test driver.
struct testColl hyperCubeTest = {
  "HyperCube",
  HyperCubeGetCollByteCount,
  HyperCubeInitData,
  HyperCubeGetBw,
  HyperCubeRunColl
};

// Driver hook: only the send/recv counts are needed; scratch outputs discarded.
void HyperCubeGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
  HyperCubeGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}

// Runs the hypercube test over the requested (or all) data types. The
// exchange pattern only works when the rank count is a power of two;
// otherwise the test is skipped with a message.
testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &hyperCubeTest;
  ncclDataType_t *run_types;
  const char **run_typenames;
  int type_count;
  // type == -1 means "sweep every registered type".
  if ((int)type != -1) {
    type_count = 1;
    run_types = &type;
    run_typenames = &typeName;
  } else {
    type_count = test_typenum;
    run_types = test_types;
    run_typenames = test_typenames;
  }
  // Check if this is a power of 2
  // NOTE(review): rank count omits args->nRanks, unlike HyperCubeInitData —
  // confirm intended for multi-rank-per-GPU runs.
  int nRanks = args->nProcs*args->nThreads*args->nGpus;
  if (nRanks && !(nRanks & (nRanks - 1))) {
    for (int i=0; i<type_count; i++) {
      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
    }
  } else {
    printf("nRanks %d is not a power of 2, skipping\n", nRanks);
  }
  return testSuccess;
}

// Exported engine symbol the test harness links against.
struct testEngine ncclTestEngine = {
  HyperCubeGetBuffSize,
  HyperCubeRunTest
};
/*************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
/* Compatibility shims for building against NCCL 1.x (detected by the absence
 * of NCCL_MAJOR). Each wrapper takes the modern size_t count and forwards to
 * the v1 int-count entry point; the inner call resolves to the library's
 * overload by parameter type, not recursively to the wrapper. */
#ifndef NCCL1_COMPAT_H
#define NCCL1_COMPAT_H

#ifndef NCCL_MAJOR // NCCL 1.x
#define NCCL_MAJOR 1
#define NCCL_MINOR 0

// v1 names for the op/type counts.
#define ncclNumOps nccl_NUM_OPS
#define ncclNumTypes nccl_NUM_TYPES

// Grouped-call API did not exist in NCCL 1.x; make the calls no-ops.
static ncclResult_t ncclGroupStart() { return ncclSuccess; }
static ncclResult_t ncclGroupEnd() { return ncclSuccess; }

// v1 takes int counts; reject values that do not fit.
// NOTE(review): the unbraced `if` makes this macro a dangling-else hazard at
// call sites — confirm no caller uses it inside an if/else.
#define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument;

static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
    ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  CHECKCOUNT(count);
  return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream);
}
static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream) {
  CHECKCOUNT(count);
  return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream);
}
static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, hipStream_t stream) {
  CHECKCOUNT(count);
  return ncclBcast(buff, (int)count, datatype, root, comm, stream);
}
static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
    hipStream_t stream) {
  CHECKCOUNT(recvcount);
  return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream);
}
// Note the v1 AllGather argument order: recvbuff comes after datatype.
static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
  CHECKCOUNT(sendcount);
  return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream);
}
#endif

#endif
/**
* MIT License
*
* Copyright 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/*!\file
* \brief rccl_bfloat16.h provides struct for rccl_bfloat16 typedef
*/
#ifndef _RCCL_BFLOAT16_H_
#define _RCCL_BFLOAT16_H_
#if __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
// If this is a C compiler, C++ compiler below C++11, or a host-only compiler, we only
// include a minimal definition of rccl_bfloat16
#include <stdint.h>
/*! \brief Struct to represent a 16 bit brain floating point number. */
typedef struct
{
    uint16_t data; // raw bfloat16 bits: 1 sign, 8 exponent, 7 mantissa
} rccl_bfloat16;
#else // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <hip/hip_runtime.h>
#include <ostream>
#include <type_traits>
// bfloat16 value type: the upper 16 bits of an IEEE-754 binary32
// (1 sign, 8 exponent, 7 mantissa bits). All arithmetic goes through float.
struct rccl_bfloat16
{
    uint16_t data;

    // Tag type selecting the truncating (non-rounding) constructor below.
    enum truncate_t
    {
        truncate
    };

    __host__ __device__ rccl_bfloat16() = default;

    // round upper 16 bits of IEEE float to convert to bfloat16
    explicit __host__ __device__ rccl_bfloat16(float f)
        : data(float_to_bfloat16(f))
    {
    }

    explicit __host__ __device__ rccl_bfloat16(float f, truncate_t)
        : data(truncate_float_to_bfloat16(f))
    {
    }

    // zero extend lower 16 bits of bfloat16 to convert to IEEE float
    __host__ __device__ operator float() const
    {
        union
        {
            uint32_t int32;
            float fp32;
        } u = {uint32_t(data) << 16};
        return u.fp32;
    }

private:
    static __host__ __device__ uint16_t float_to_bfloat16(float f)
    {
        union
        {
            float fp32;
            uint32_t int32;
        } u = {f};
        if(~u.int32 & 0x7f800000)
        {
            // When the exponent bits are not all 1s, then the value is zero, normal,
            // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
            // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
            // This causes the bfloat16's mantissa to be incremented by 1 if the 16
            // least significant bits of the float mantissa are greater than 0x8000,
            // or if they are equal to 0x8000 and the least significant bit of the
            // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
            // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
            // has the value 0x7f, then incrementing it causes it to become 0x00 and
            // the exponent is incremented by one, which is the next higher FP value
            // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
            // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
            // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
            // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
            // incrementing it causes it to become an exponent of 0xFF and a mantissa
            // of 0x00, which is Inf, the next higher value to the unrounded value.
            u.int32 += 0x7fff + ((u.int32 >> 16) & 1); // Round to nearest, round to even
        }
        else if(u.int32 & 0xffff)
        {
            // When all of the exponent bits are 1, the value is Inf or NaN.
            // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
            // mantissa bit. Quiet NaN is indicated by the most significant mantissa
            // bit being 1. Signaling NaN is indicated by the most significant
            // mantissa bit being 0 but some other bit(s) being 1. If any of the
            // lower 16 bits of the mantissa are 1, we set the least significant bit
            // of the bfloat16 mantissa, in order to preserve signaling NaN in case
            // the bloat16's mantissa bits are all 0.
            u.int32 |= 0x10000; // Preserve signaling NaN
        }
        return uint16_t(u.int32 >> 16);
    }

    // Truncate instead of rounding, preserving SNaN
    static __host__ __device__ uint16_t truncate_float_to_bfloat16(float f)
    {
        union
        {
            float fp32;
            uint32_t int32;
        } u = {f};
        // Keep the top 16 bits; OR in 1 when all exponent bits are set and the
        // dropped mantissa bits were nonzero, so a NaN cannot truncate to Inf.
        return uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
    }
};
// C-visible mirror of rccl_bfloat16; the static_asserts below pin the two
// layouts together so the types remain interchangeable across the C boundary.
typedef struct
{
    uint16_t data;
} rccl_bfloat16_public;

static_assert(std::is_standard_layout<rccl_bfloat16>{},
              "rccl_bfloat16 is not a standard layout type, and thus is "
              "incompatible with C.");

static_assert(std::is_trivial<rccl_bfloat16>{},
              "rccl_bfloat16 is not a trivial type, and thus is "
              "incompatible with C.");

static_assert(sizeof(rccl_bfloat16) == sizeof(rccl_bfloat16_public)
                  && offsetof(rccl_bfloat16, data) == offsetof(rccl_bfloat16_public, data),
              "internal rccl_bfloat16 does not match public rccl_bfloat16");

// Stream output prints the value converted to float.
inline std::ostream& operator<<(std::ostream& os, const rccl_bfloat16& bf16)
{
    return os << float(bf16);
}
// ---- Unary operators --------------------------------------------------------
inline __host__ __device__ rccl_bfloat16 operator+(rccl_bfloat16 a)
{
    return a;
}
// Negation just flips the sign bit.
inline __host__ __device__ rccl_bfloat16 operator-(rccl_bfloat16 a)
{
    a.data ^= 0x8000;
    return a;
}

// ---- Binary arithmetic: computed in float, rounded back to bfloat16 ---------
inline __host__ __device__ rccl_bfloat16 operator+(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return rccl_bfloat16(float(a) + float(b));
}
inline __host__ __device__ rccl_bfloat16 operator-(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return rccl_bfloat16(float(a) - float(b));
}
inline __host__ __device__ rccl_bfloat16 operator*(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return rccl_bfloat16(float(a) * float(b));
}
inline __host__ __device__ rccl_bfloat16 operator/(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return rccl_bfloat16(float(a) / float(b));
}

// ---- Comparisons: delegate to float; derived forms reuse < and == ----------
inline __host__ __device__ bool operator<(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return float(a) < float(b);
}
inline __host__ __device__ bool operator==(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return float(a) == float(b);
}
inline __host__ __device__ bool operator>(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return b < a;
}
inline __host__ __device__ bool operator<=(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return !(a > b);
}
inline __host__ __device__ bool operator!=(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return !(a == b);
}
inline __host__ __device__ bool operator>=(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return !(a < b);
}

// ---- Compound assignment ----------------------------------------------------
inline __host__ __device__ rccl_bfloat16& operator+=(rccl_bfloat16& a, rccl_bfloat16 b)
{
    return a = a + b;
}
inline __host__ __device__ rccl_bfloat16& operator-=(rccl_bfloat16& a, rccl_bfloat16 b)
{
    return a = a - b;
}
inline __host__ __device__ rccl_bfloat16& operator*=(rccl_bfloat16& a, rccl_bfloat16 b)
{
    return a = a * b;
}
inline __host__ __device__ rccl_bfloat16& operator/=(rccl_bfloat16& a, rccl_bfloat16 b)
{
    return a = a / b;
}

// ---- Increment / decrement (by 1.0) -----------------------------------------
inline __host__ __device__ rccl_bfloat16& operator++(rccl_bfloat16& a)
{
    return a += rccl_bfloat16(1.0f);
}
inline __host__ __device__ rccl_bfloat16& operator--(rccl_bfloat16& a)
{
    return a -= rccl_bfloat16(1.0f);
}
inline __host__ __device__ rccl_bfloat16 operator++(rccl_bfloat16& a, int)
{
    rccl_bfloat16 orig = a;
    ++a;
    return orig;
}
inline __host__ __device__ rccl_bfloat16 operator--(rccl_bfloat16& a, int)
{
    rccl_bfloat16 orig = a;
    --a;
    return orig;
}
// Classification and math overloads for rccl_bfloat16.
// NOTE(review): adding new overloads to namespace std is not sanctioned by the
// C++ standard — confirm this is acceptable on all supported toolchains.
namespace std
{
    // Bit layout: exponent = bits 14..7 (0x7f80), mantissa = bits 6..0 (0x7f).
    constexpr __host__ __device__ bool isinf(rccl_bfloat16 a)
    {
        // All-ones exponent with zero mantissa.
        return !(~a.data & 0x7f80) && !(a.data & 0x7f);
    }
    constexpr __host__ __device__ bool isnan(rccl_bfloat16 a)
    {
        // All-ones exponent with nonzero mantissa.
        return !(~a.data & 0x7f80) && +(a.data & 0x7f);
    }
    constexpr __host__ __device__ bool iszero(rccl_bfloat16 a)
    {
        // +0 or -0: everything except the sign bit clear.
        return !(a.data & 0x7fff);
    }
    // Trig helpers: computed in float, rounded back to bfloat16.
    inline rccl_bfloat16 sin(rccl_bfloat16 a)
    {
        return rccl_bfloat16(sinf(float(a)));
    }
    inline rccl_bfloat16 cos(rccl_bfloat16 a)
    {
        return rccl_bfloat16(cosf(float(a)));
    }
}

#endif // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
#endif // _RCCL_BFLOAT16_H_
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
/* Element accounting for Reduce: every rank sends the full count and the root
 * receives the same count; in-place operation uses identical buffers. */
void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  *sendcount = count;
  *recvcount = count;
  *paramcount = count;          // same as the send count
  *sendInplaceOffset = 0;       // in place: send buffer == recv buffer
  *recvInplaceOffset = 0;
}
// Initializes per-rank send data and the expected Reduce result. Each rank's
// contribution is seeded by (rep, rank); only the root's expected buffer holds
// the reduced value — all other ranks expect an untouched (zeroed) recv buffer.
testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
  size_t sendcount = args->sendBytes / wordSize(type);
  size_t recvcount = args->expectedBytes / wordSize(type);
  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
  int k=0;
  for (int i=0; i<args->nGpus; i++) {
    HIPCHECK(hipSetDevice(args->gpus[i]));
    for (int l=0; l<args->nRanks; l++) {
      // Global rank of local slot (proc, thread, gpu i, local rank l).
      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
      void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
      TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
      // Snapshot the recv buffer as the baseline expectation for non-roots.
      HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
      // Root expects the reduction of all nranks contributions.
      if (rank == root) TESTCHECK(InitDataReduce(args->expected[k], recvcount, 0, type, op, rep, nranks));
      k++;
    }
    HIPCHECK(hipDeviceSynchronize());
  }
  return testSuccess;
}
// Bandwidth model for Reduce: bus bandwidth equals algorithm bandwidth.
void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  const double bw = (double)(count * typesize) / 1.0E9 / sec;
  *algBw = bw;
  *busBw = bw;
}
// Launches a single ncclReduce: elementwise op across all ranks, result on root.
testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream));
  return testSuccess;
}
// vtable of Reduce hooks consumed by the common test driver.
struct testColl reduceTest = {
  "Reduce",
  ReduceGetCollByteCount,
  ReduceInitData,
  ReduceGetBw,
  ReduceRunColl
};

// Driver hook: only the send/recv counts are needed; scratch outputs discarded.
void ReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
  ReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}

// Runs the Reduce test over the requested (or all) types, ops and roots.
// A value of -1 for type/op/root means "sweep all".
testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &reduceTest;
  ncclDataType_t *run_types;
  ncclRedOp_t *run_ops;
  const char **run_typenames, **run_opnames;
  int type_count, op_count;
  int begin_root, end_root;
  if ((int)type != -1) {
    type_count = 1;
    run_types = &type;
    run_typenames = &typeName;
  } else {
    type_count = test_typenum;
    run_types = test_types;
    run_typenames = test_typenames;
  }
  if ((int)op != -1) {
    op_count = 1;
    run_ops = &op;
    run_opnames = &opName;
  } else {
    op_count = test_opnum;
    run_ops = test_ops;
    run_opnames = test_opnames;
  }
  if (root != -1) {
    begin_root = end_root = root;
  } else {
    // NOTE(review): sweep bound omits args->nRanks, unlike the global-rank
    // formula in ReduceInitData — confirm intended for multi-rank-per-GPU runs.
    begin_root = 0;
    end_root = args->nProcs*args->nThreads*args->nGpus-1;
  }
  for (int i=0; i<type_count; i++) {
    for (int j=0; j<op_count; j++) {
      for (int k=begin_root; k<=end_root; k++) {
        TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], k));
      }
    }
  }
  return testSuccess;
}

// Exported engine symbol the test harness links against.
struct testEngine ncclTestEngine = {
  ReduceGetBuffSize,
  ReduceRunTest
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
#define ALIGN 4
/* Element accounting for ReduceScatter: per-rank output is the largest
 * ALIGN-multiple fitting count/nranks; each rank sends nranks such chunks. */
void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  const size_t chunk = (count/(ALIGN*nranks))*ALIGN;  // aligned per-rank count
  *sendcount = chunk*nranks;     // full input on every rank
  *recvcount = chunk;            // each rank keeps one reduced chunk
  *sendInplaceOffset = 0;
  *recvInplaceOffset = chunk;    // in place, rank r's result is at slot r
  *paramcount = chunk;
}
// Initializes send data and the expected ReduceScatter result. Each rank's
// input is seeded by (rep, rank); rank r expects the reduction of all inputs
// restricted to chunk r (element offset rank*recvcount).
testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
  size_t sendcount = args->sendBytes / wordSize(type);
  size_t recvcount = args->expectedBytes / wordSize(type);
  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
  int k=0;
  for (int i=0; i<args->nGpus; i++) {
    HIPCHECK(hipSetDevice(args->gpus[i]));
    for (int l=0; l<args->nRanks; l++) {
      // Global rank of local slot (proc, thread, gpu i, local rank l).
      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
      void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
      TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
      HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
      // Expected output: reduced values for this rank's chunk of the input.
      TESTCHECK(InitDataReduce(args->expected[k], recvcount, rank*recvcount, type, op, rep, nranks));
      k++;
    }
    HIPCHECK(hipDeviceSynchronize());
  }
  return testSuccess;
}
// Bandwidth model for ReduceScatter. algBw counts the full input on each
// rank; busBw applies the standard (n-1)/n link-traffic correction.
void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  const double base = (double)(count * typesize * nranks) / 1.0E9 / sec;
  *algBw = base;
  const double scale = ((double)(nranks - 1))/((double)nranks);
  *busBw = base * scale;
}
// Launches a single ncclReduceScatter; count is the per-rank receive count.
testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream));
  return testSuccess;
}
// vtable of ReduceScatter hooks consumed by the common test driver.
struct testColl reduceScatterTest = {
  "ReduceScatter",
  ReduceScatterGetCollByteCount,
  ReduceScatterInitData,
  ReduceScatterGetBw,
  ReduceScatterRunColl
};

// Driver hook: only the send/recv counts are needed; scratch outputs discarded.
void ReduceScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
  ReduceScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}

// Runs the ReduceScatter test over the requested (or all) types and ops.
// ReduceScatter has no root, so no root sweep is needed.
testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &reduceScatterTest;
  ncclDataType_t *run_types;
  ncclRedOp_t *run_ops;
  const char **run_typenames, **run_opnames;
  int type_count, op_count;
  // -1 means "sweep all"; otherwise run exactly the requested value.
  if ((int)type != -1) {
    type_count = 1;
    run_types = &type;
    run_typenames = &typeName;
  } else {
    type_count = test_typenum;
    run_types = test_types;
    run_typenames = test_typenames;
  }
  if ((int)op != -1) {
    run_ops = &op;
    run_opnames = &opName;
    op_count = 1;
  } else {
    op_count = test_opnum;
    run_ops = test_ops;
    run_opnames = test_opnames;
  }
  for (int i=0; i<type_count; i++) {
    for (int j=0; j<op_count; j++) {
      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
    }
  }
  return testSuccess;
}

// Exported engine symbol the test harness links against.
struct testEngine ncclTestEngine = {
  ReduceScatterGetBuffSize,
  ReduceScatterRunTest
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
/* Element accounting for Scatter: the root sends one count/nranks chunk to
 * each rank; every rank receives exactly one chunk. */
void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  const size_t perRank = count / nranks;
  *sendcount = perRank * nranks;     // root holds one chunk per rank
  *recvcount = perRank;
  *sendInplaceOffset = 0;
  *recvInplaceOffset = perRank;      // in place, rank r receives into slot r
  *paramcount = perRank;             // count handed to the collective call
}
// Initializes send data and the expected Scatter result. Only the root seeds
// a send buffer (the full rank-ordered payload); rank r expects the chunk at
// element offset r*recvcount of that payload.
testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
  size_t sendcount = args->sendBytes / wordSize(type);
  size_t recvcount = args->expectedBytes / wordSize(type);
  int k=0;
  for (int i=0; i<args->nGpus; i++) {
    HIPCHECK(hipSetDevice(args->gpus[i]));
    for (int l=0; l<args->nRanks; l++) {
      // Global rank of local slot (proc, thread, gpu i, local rank l).
      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
      void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
      // Only the root's send data matters for this collective.
      if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
      TESTCHECK(InitData(args->expected[k], recvcount, rank*recvcount, type, ncclSum, rep, 1, 0));
      k++;
    }
    HIPCHECK(hipDeviceSynchronize());
  }
  return testSuccess;
}
// Bandwidth model for Scatter. algBw counts all bytes leaving the root;
// busBw discounts the root's own chunk with the usual (n-1)/n factor.
void ScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  const double base = (double)(count * nranks * typesize) / 1.0E9 / sec;
  *algBw = base;
  const double scale = ((double)(nranks-1))/((double)(nranks));
  *busBw = base * scale;
}
// Launches one Scatter built from point-to-point calls in a single group:
// the root sends chunk r (at byte offset r*rankOffset) to rank r, and every
// rank posts one receive from the root.
testResult_t ScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  int nRanks;
  NCCLCHECK(ncclCommCount(comm, &nRanks));
  int rank;
  NCCLCHECK(ncclCommUserRank(comm, &rank));
  // Byte stride of one rank's chunk in sendbuff.
  size_t rankOffset = count * wordSize(type);
  // Nothing to exchange for a zero-element chunk.
  if (count == 0) return testSuccess;
  NCCLCHECK(ncclGroupStart());
  if (rank == root) {
    for (int r=0; r<nRanks; r++) {
      NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, type, r, comm, stream));
    }
  }
  NCCLCHECK(ncclRecv(recvbuff, count, type, root, comm, stream));
  NCCLCHECK(ncclGroupEnd());
  return testSuccess;
}
// vtable of Scatter hooks consumed by the common test driver.
struct testColl scatterTest = {
  "Scatter",
  ScatterGetCollByteCount,
  ScatterInitData,
  ScatterGetBw,
  ScatterRunColl
};

// Driver hook: only the send/recv counts are needed; scratch outputs discarded.
void ScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
  ScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}

// Runs the Scatter test for the requested (or all) data types and roots.
// op/opName are unused: scatter performs no reduction.
testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &scatterTest;
  ncclDataType_t *run_types;
  const char **run_typenames;
  int type_count;
  int begin_root, end_root;
  // type == -1 means "sweep every registered type".
  if ((int)type != -1) {
    type_count = 1;
    run_types = &type;
    run_typenames = &typeName;
  } else {
    type_count = test_typenum;
    run_types = test_types;
    run_typenames = test_typenames;
  }
  if (root != -1) {
    begin_root = end_root = root;
  } else {
    // NOTE(review): sweep bound omits args->nRanks, unlike the global-rank
    // formula in ScatterInitData — confirm intended for multi-rank-per-GPU runs.
    begin_root = 0;
    end_root = args->nProcs*args->nThreads*args->nGpus-1;
  }
  for (int i=0; i<type_count; i++) {
    for (int j=begin_root; j<=end_root; j++) {
      TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
    }
  }
  return testSuccess;
}

// Exported engine symbol the test harness links against.
struct testEngine ncclTestEngine = {
  ScatterGetBuffSize,
  ScatterRunTest
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
/* Element accounting for SendRecv: each rank both sends and receives the full
 * count; no in-place offsets apply. */
void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  *sendcount = count;
  *recvcount = count;
  *paramcount = count;      // same as the send count
  *sendInplaceOffset = 0;
  *recvInplaceOffset = 0;
}
// Initializes send data and the expected SendRecv result for the ring
// exchange: rank r sends data seeded at offset r and expects the payload of
// its predecessor (r-1+nranks)%nranks, matching SendRecvRunColl's peers.
testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
  size_t sendcount = args->sendBytes / wordSize(type);
  size_t recvcount = args->expectedBytes / wordSize(type);
  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
  int k=0;
  for (int i=0; i<args->nGpus; i++) {
    HIPCHECK(hipSetDevice(args->gpus[i]));
    for (int l=0; l<args->nRanks; l++) {
      // Global rank of local slot (proc, thread, gpu i, local rank l).
      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
      void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
      TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
      // Expect the ring predecessor's payload.
      int peer = (rank-1+nranks)%nranks;
      TESTCHECK(InitData(args->expected[k], recvcount, peer*recvcount, type, ncclSum, rep, 1, 0));
      k++;
    }
    HIPCHECK(hipDeviceSynchronize());
  }
  // We don't support in-place sendrecv
  args->reportErrors = in_place ? 0 : 1;
  return testSuccess;
}
// Bandwidth model for SendRecv: each rank moves count*typesize bytes;
// bus bandwidth equals algorithm bandwidth (factor 1).
void SendRecvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  const double bw = (double)(count * typesize) / 1.0E9 / sec;
  *algBw = bw;
  *busBw = bw;
}
// Ring exchange: each rank sends count elements to (rank+1)%n and receives
// from (rank-1+n)%n, with both operations posted inside one group.
testResult_t SendRecvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  int nRanks;
  NCCLCHECK(ncclCommCount(comm, &nRanks));
  int rank;
  NCCLCHECK(ncclCommUserRank(comm, &rank));
  int recvPeer = (rank-1+nRanks) % nRanks;  // ring predecessor
  int sendPeer = (rank+1) % nRanks;         // ring successor
  NCCLCHECK(ncclGroupStart());
  NCCLCHECK(ncclSend(sendbuff, count, type, sendPeer, comm, stream));
  NCCLCHECK(ncclRecv(recvbuff, count, type, recvPeer, comm, stream));
  NCCLCHECK(ncclGroupEnd());
  return testSuccess;
}
struct testColl sendRecvTest = {
"SendRecv",
SendRecvGetCollByteCount,
SendRecvInitData,
SendRecvGetBw,
SendRecvRunColl
};
// Report the send/recv element counts needed for a run of `count` elements
// across `nranks` ranks. The remaining outputs of SendRecvGetCollByteCount
// are computed but discarded here.
void SendRecvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t unusedParamCount, unusedSendOffset, unusedRecvOffset;
  SendRecvGetCollByteCount(sendcount, recvcount, &unusedParamCount, &unusedSendOffset, &unusedRecvOffset, count, nranks);
}
// Drive the SendRecv test across the requested datatype/op combinations.
// A type or op of -1 means "sweep everything the harness knows about";
// otherwise only the single requested value is run. `root` is unused here
// (TimeTest is always invoked with root = -1 for sendrecv).
testResult_t SendRecvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &sendRecvTest;

  ncclDataType_t* typeList;
  const char** typeNameList;
  int numTypes;
  if ((int)type == -1) {
    numTypes = test_typenum;
    typeList = test_types;
    typeNameList = test_typenames;
  } else {
    numTypes = 1;
    typeList = &type;
    typeNameList = &typeName;
  }

  ncclRedOp_t* opList;
  const char** opNameList;
  int numOps;
  if ((int)op == -1) {
    numOps = test_opnum;
    opList = test_ops;
    opNameList = test_opnames;
  } else {
    numOps = 1;
    opList = &op;
    opNameList = &opName;
  }

  for (int t = 0; t < numTypes; t++) {
    for (int o = 0; o < numOps; o++) {
      TESTCHECK(TimeTest(args, typeList[t], typeNameList[t], opList[o], opNameList[o], -1));
    }
  }
  return testSuccess;
}
// Entry points the generic perf driver uses for this (SendRecv) test binary.
struct testEngine ncclTestEngine = {
  SendRecvGetBuffSize,
  SendRecvRunTest
};
#include "timer.h"
// Make sure to compile this translation unit with the host compiler and not
// nvcc, lest you hit an internal compiler error (ICE) with GCC 10.3.0
#include <chrono>
namespace {
// Monotonic timestamp in nanoseconds. steady_clock never goes backwards,
// so differences between two calls are always meaningful durations.
std::uint64_t now() {
  using clock = std::chrono::steady_clock;
  return std::chrono::duration_cast<std::chrono::nanoseconds>(clock::now().time_since_epoch()).count();
}
}
// Start the stopwatch: record the construction time as the reference point.
timer::timer() {
  t0 = now();
}
// Seconds elapsed since construction or the last reset().
double timer::elapsed() const {
  std::uint64_t t1 = now();
  return 1.e-9*(t1 - t0);  // ns -> s
}
double timer::reset() {
std::uint64_t t1 = now();
double ans = 1.e-9*(t1 - t0);
t0 = t1;
return ans;
}
#ifndef _408319ecdd5b47b28bf8f511c4fdf816
#define _408319ecdd5b47b28bf8f511c4fdf816
#include <cstdint>
// Can't include <chrono> because of bug with gcc 10.3.0
// Simple monotonic stopwatch. Starts timing at construction; the clock
// source lives in the .cpp so this header can stay <chrono>-free.
class timer {
  std::uint64_t t0;  // start mark, nanoseconds since an arbitrary epoch
public:
  timer();
  // Seconds since construction or the last reset().
  double elapsed() const;
  // Returns seconds since the last mark and restarts the stopwatch.
  double reset();
};
#endif
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
\ No newline at end of file
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
def pytest_addoption(parser):
    """Register the --hostfile option consumed by the MPI test variants."""
    parser.addoption("--hostfile", action="store", default="", help="specify MPI hostfile")
\ No newline at end of file
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
# Parameter grids swept by the tests below. Values are strings because they
# are passed straight onto the perf binary's command line.
nthreads = ["1"]
nprocs = ["2"]  # MPI ranks for the multi-process variant
ngpus_single = ["1","2","4"]  # GPUs per thread in single-process runs
ngpus_mpi = ["1","2"]  # GPUs per rank in MPI runs
byte_range = [("4", "128M")]  # (min, max) message size
op = ["sum", "prod", "min", "max"]
step_factor = ["2"]  # message-size multiplier between steps
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
memory_type = ["coarse","fine", "host"]
# The perf binary lives in the sibling build directory.
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/all_gather_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
                         itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_AllGatherSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
    """Run all_gather_perf in a single process and fail on a non-zero exit.

    Fix vs. original: subprocess.run() never raises CalledProcessError unless
    check=True, so the old try/except was dead code -- and its handler
    referenced `rccl_test`, which would be unbound had run() actually raised.
    Failure output is now printed from an explicit returncode check instead.
    """
    args = [executable,
            "-t", nthreads,
            "-g", ngpus_single,
            "-b", byte_range[0],
            "-e", byte_range[1],
            "-o", op,
            "-f", step_factor,
            "-d", datatype,
            "-y", memory_type]
    if memory_type == "fine":
        # Environment-variable prefix works because the command runs via shell=True.
        args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
    args_str = " ".join(args)
    rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
    if rccl_test.returncode != 0:
        print(rccl_test.stdout)
        pytest.fail("AllGather test error(s) detected.")
    assert rccl_test.returncode == 0
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
                         itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_AllGatherMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
    """Run all_gather_perf under mpirun (optionally with --hostfile); fail on non-zero exit.

    Fixes vs. original: the hostfile branch appended '-y', memory_type, but
    memory_type is not a parameter of this test -- it resolved to the
    module-level *list*, so " ".join(args) raised TypeError (and
    `memory_type == "fine"` compared a list to a string). The -y flag is now
    omitted in both branches, matching the non-hostfile path. The dead
    try/except (subprocess.run never raises CalledProcessError without
    check=True, and its handler used a possibly-unbound rccl_test) was
    replaced with an explicit returncode check.
    """
    mpi_hostfile = request.config.getoption('--hostfile')
    args = ["mpirun -np", nprocs]
    if mpi_hostfile:
        args += ["-host", mpi_hostfile]
    args += [executable,
             "-p 1",
             "-t", nthreads,
             "-g", ngpus_mpi,
             "-b", byte_range[0],
             "-e", byte_range[1],
             "-o", op,
             "-f", step_factor,
             "-d", datatype]
    args_str = " ".join(args)
    print(args_str)
    rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
    if rccl_test.returncode != 0:
        pytest.fail("AllGather test error(s) detected.")
    assert rccl_test.returncode == 0
\ No newline at end of file
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
# Parameter grids swept by the tests below. Values are strings because they
# are passed straight onto the perf binary's command line.
nthreads = ["1"]
nprocs = ["2"]  # MPI ranks for the multi-process variant
ngpus_single = ["1","2","4"]  # GPUs per thread in single-process runs
ngpus_mpi = ["1","2"]  # GPUs per rank in MPI runs
byte_range = [("4", "128M")]  # (min, max) message size
op = ["sum", "prod", "min", "max"]
step_factor = ["2"]  # message-size multiplier between steps
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
memory_type = ["coarse","fine", "host"]
# The perf binary lives in the sibling build directory.
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/all_reduce_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
                         itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_AllReduceSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
    """Run all_reduce_perf in a single process and fail on a non-zero exit.

    Fix vs. original: subprocess.run() never raises CalledProcessError unless
    check=True, so the old try/except was dead code -- and its handler
    referenced `rccl_test`, which would be unbound had run() actually raised.
    Failure output is now printed from an explicit returncode check instead.
    """
    args = [executable,
            "-t", nthreads,
            "-g", ngpus_single,
            "-b", byte_range[0],
            "-e", byte_range[1],
            "-o", op,
            "-f", step_factor,
            "-d", datatype,
            "-y", memory_type]
    if memory_type == "fine":
        # Environment-variable prefix works because the command runs via shell=True.
        args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
    args_str = " ".join(args)
    rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
    if rccl_test.returncode != 0:
        print(rccl_test.stdout)
        pytest.fail("AllReduce test error(s) detected.")
    assert rccl_test.returncode == 0
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
                         itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_AllReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
    """Run all_reduce_perf under mpirun (optionally with --hostfile); fail on non-zero exit.

    Fixes vs. original: the hostfile branch appended '-y', memory_type, but
    memory_type is not a parameter of this test -- it resolved to the
    module-level *list*, so " ".join(args) raised TypeError (and
    `memory_type == "fine"` compared a list to a string). The -y flag is now
    omitted in both branches, matching the non-hostfile path. The dead
    try/except (subprocess.run never raises CalledProcessError without
    check=True, and its handler used a possibly-unbound rccl_test) was
    replaced with an explicit returncode check.
    """
    mpi_hostfile = request.config.getoption('--hostfile')
    args = ["mpirun -np", nprocs]
    if mpi_hostfile:
        args += ["-host", mpi_hostfile]
    args += [executable,
             "-p 1",
             "-t", nthreads,
             "-g", ngpus_mpi,
             "-b", byte_range[0],
             "-e", byte_range[1],
             "-o", op,
             "-f", step_factor,
             "-d", datatype]
    args_str = " ".join(args)
    print(args_str)
    rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
    if rccl_test.returncode != 0:
        pytest.fail("AllReduce test error(s) detected.")
    assert rccl_test.returncode == 0
\ No newline at end of file
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
# Parameter grids swept by the tests below. Values are strings because they
# are passed straight onto the perf binary's command line.
nthreads = ["1"]
nprocs = ["2"]  # MPI ranks for the multi-process variant
ngpus_single = ["1","2","4"]  # GPUs per thread in single-process runs
ngpus_mpi = ["1","2"]  # GPUs per rank in MPI runs
byte_range = [("4", "128M")]  # (min, max) message size
op = ["sum", "prod", "min", "max"]
step_factor = ["2"]  # message-size multiplier between steps
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
memory_type = ["coarse","fine", "host"]
# The perf binary lives in the sibling build directory.
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/broadcast_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
                         itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_BroadcastSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
    """Run broadcast_perf in a single process and fail on a non-zero exit.

    Fix vs. original: subprocess.run() never raises CalledProcessError unless
    check=True, so the old try/except was dead code -- and its handler
    referenced `rccl_test`, which would be unbound had run() actually raised.
    Failure output is now printed from an explicit returncode check instead.
    """
    args = [executable,
            "-t", nthreads,
            "-g", ngpus_single,
            "-b", byte_range[0],
            "-e", byte_range[1],
            "-o", op,
            "-f", step_factor,
            "-d", datatype,
            "-y", memory_type]
    if memory_type == "fine":
        # Environment-variable prefix works because the command runs via shell=True.
        args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
    args_str = " ".join(args)
    rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
    if rccl_test.returncode != 0:
        print(rccl_test.stdout)
        pytest.fail("Broadcast test error(s) detected.")
    assert rccl_test.returncode == 0
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
                         itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_BroadcastMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
    """Run broadcast_perf under mpirun (optionally with --hostfile); fail on non-zero exit.

    Fixes vs. original: the hostfile branch appended '-y', memory_type, but
    memory_type is not a parameter of this test -- it resolved to the
    module-level *list*, so " ".join(args) raised TypeError (and
    `memory_type == "fine"` compared a list to a string). The -y flag is now
    omitted in both branches, matching the non-hostfile path. The dead
    try/except (subprocess.run never raises CalledProcessError without
    check=True, and its handler used a possibly-unbound rccl_test) was
    replaced with an explicit returncode check.
    """
    mpi_hostfile = request.config.getoption('--hostfile')
    args = ["mpirun -np", nprocs]
    if mpi_hostfile:
        args += ["-host", mpi_hostfile]
    args += [executable,
             "-p 1",
             "-t", nthreads,
             "-g", ngpus_mpi,
             "-b", byte_range[0],
             "-e", byte_range[1],
             "-o", op,
             "-f", step_factor,
             "-d", datatype]
    args_str = " ".join(args)
    print(args_str)
    rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
    if rccl_test.returncode != 0:
        pytest.fail("Broadcast test error(s) detected.")
    assert rccl_test.returncode == 0
\ No newline at end of file
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
# Parameter grids swept by the tests below. Values are strings because they
# are passed straight onto the perf binary's command line.
nthreads = ["1"]
nprocs = ["2"]  # MPI ranks for the multi-process variant
ngpus_single = ["1","2","4"]  # GPUs per thread in single-process runs
ngpus_mpi = ["1","2"]  # GPUs per rank in MPI runs
byte_range = [("4", "128M")]  # (min, max) message size
op = ["sum", "prod", "min", "max"]
step_factor = ["2"]  # message-size multiplier between steps
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
memory_type = ["coarse","fine", "host"]
# The perf binary lives in the sibling build directory.
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/reduce_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
                         itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_ReduceSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
    """Run reduce_perf in a single process and fail on a non-zero exit.

    Fix vs. original: subprocess.run() never raises CalledProcessError unless
    check=True, so the old try/except was dead code -- and its handler
    referenced `rccl_test`, which would be unbound had run() actually raised.
    Failure output is now printed from an explicit returncode check instead.
    """
    args = [executable,
            "-t", nthreads,
            "-g", ngpus_single,
            "-b", byte_range[0],
            "-e", byte_range[1],
            "-o", op,
            "-f", step_factor,
            "-d", datatype,
            "-y", memory_type]
    if memory_type == "fine":
        # Environment-variable prefix works because the command runs via shell=True.
        args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
    args_str = " ".join(args)
    rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
    if rccl_test.returncode != 0:
        print(rccl_test.stdout)
        pytest.fail("Reduce test error(s) detected.")
    assert rccl_test.returncode == 0
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
                         itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_ReduceMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
    """Run reduce_perf under mpirun (optionally with --hostfile); fail on non-zero exit.

    Fixes vs. original: the hostfile branch appended '-y', memory_type, but
    memory_type is not a parameter of this test -- it resolved to the
    module-level *list*, so " ".join(args) raised TypeError (and
    `memory_type == "fine"` compared a list to a string). The -y flag is now
    omitted in both branches, matching the non-hostfile path. The dead
    try/except (subprocess.run never raises CalledProcessError without
    check=True, and its handler used a possibly-unbound rccl_test) was
    replaced with an explicit returncode check.
    """
    mpi_hostfile = request.config.getoption('--hostfile')
    args = ["mpirun -np", nprocs]
    if mpi_hostfile:
        args += ["-host", mpi_hostfile]
    args += [executable,
             "-p 1",
             "-t", nthreads,
             "-g", ngpus_mpi,
             "-b", byte_range[0],
             "-e", byte_range[1],
             "-o", op,
             "-f", step_factor,
             "-d", datatype]
    args_str = " ".join(args)
    print(args_str)
    rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
    if rccl_test.returncode != 0:
        pytest.fail("Reduce test error(s) detected.")
    assert rccl_test.returncode == 0
\ No newline at end of file
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import pytest
# Parameter grids swept by the tests below. Values are strings because they
# are passed straight onto the perf binary's command line.
nthreads = ["1"]
nprocs = ["2"]  # MPI ranks for the multi-process variant
ngpus_single = ["1","2","4"]  # GPUs per thread in single-process runs
ngpus_mpi = ["1","2"]  # GPUs per rank in MPI runs
byte_range = [("4", "128M")]  # (min, max) message size
op = ["sum", "prod", "min", "max"]
step_factor = ["2"]  # message-size multiplier between steps
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
memory_type = ["coarse","fine", "host"]
# The perf binary lives in the sibling build directory.
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/reduce_scatter_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
                         itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_ReduceScatterSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
    """Run reduce_scatter_perf in a single process and fail on a non-zero exit.

    Fix vs. original: subprocess.run() never raises CalledProcessError unless
    check=True, so the old try/except was dead code -- and its handler
    referenced `rccl_test`, which would be unbound had run() actually raised.
    Failure output is now printed from an explicit returncode check instead.
    """
    args = [executable,
            "-t", nthreads,
            "-g", ngpus_single,
            "-b", byte_range[0],
            "-e", byte_range[1],
            "-o", op,
            "-f", step_factor,
            "-d", datatype,
            "-y", memory_type]
    if memory_type == "fine":
        # Environment-variable prefix works because the command runs via shell=True.
        args.insert(0, "HSA_FORCE_FINE_GRAIN_PCIE=1")
    args_str = " ".join(args)
    rccl_test = subprocess.run(args_str, stdout=subprocess.PIPE, universal_newlines=True, shell=True)
    if rccl_test.returncode != 0:
        print(rccl_test.stdout)
        pytest.fail("ReduceScatter test error(s) detected.")
    assert rccl_test.returncode == 0
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
                         itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_ReduceScatterMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
    """Run reduce_scatter_perf under mpirun (optionally with --hostfile); fail on non-zero exit.

    Fixes vs. original: the hostfile branch appended '-y', memory_type, but
    memory_type is not a parameter of this test -- it resolved to the
    module-level *list*, so " ".join(args) raised TypeError (and
    `memory_type == "fine"` compared a list to a string). The -y flag is now
    omitted in both branches, matching the non-hostfile path. The dead
    try/except (subprocess.run never raises CalledProcessError without
    check=True, and its handler used a possibly-unbound rccl_test) was
    replaced with an explicit returncode check.
    """
    mpi_hostfile = request.config.getoption('--hostfile')
    args = ["mpirun -np", nprocs]
    if mpi_hostfile:
        args += ["-host", mpi_hostfile]
    args += [executable,
             "-p 1",
             "-t", nthreads,
             "-g", ngpus_mpi,
             "-b", byte_range[0],
             "-e", byte_range[1],
             "-o", op,
             "-f", step_factor,
             "-d", datatype]
    args_str = " ".join(args)
    print(args_str)
    rccl_test = subprocess.run(args_str, universal_newlines=True, shell=True)
    if rccl_test.returncode != 0:
        pytest.fail("ReduceScatter test error(s) detected.")
    assert rccl_test.returncode == 0
\ No newline at end of file
#
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE.txt for license information
#
# NOTE(review): the leading '#' makes this line a Make comment, so common.mk
# is NOT actually included here -- confirm whether that is intentional.
#include ../../makefiles/common.mk
.PHONY: all clean
# Build layout: outputs land under <repo>/build/test/verifiable.
BUILDDIR := $(abspath ../../build)
DST_DIR := $(BUILDDIR)/test/verifiable
# Toolchain / dependency locations (overridable from the environment).
ROCM_PATH ?= /opt/rocm
MPI_HOME ?= /usr/lib/openmpi
PREFIX ?= /usr/local
VERBOSE ?= 0
DEBUG ?= 0
NCCL_HOME ?= ""
HIPCC = $(ROCM_PATH)/bin/hipcc
CXX = $(HIPCC)
HIPCUFLAGS := -std=c++14
LDFLAGS :=
HIPLDFLAGS :=
# Point compiler and linker at an external NCCL/RCCL install when given.
ifneq ($(NCCL_HOME), "")
HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME)
endif
HIPCUFLAGS += -I$(ROCM_PATH)/include
HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
LDFLAGS += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
# Optimized by default; DEBUG=1 switches to an unoptimized debug build.
ifeq ($(DEBUG), 0)
HIPCUFLAGS += -O3
else
HIPCUFLAGS += -O0 -g -ggdb3
endif
# Suppress command echo unless VERBOSE=1.
ifeq ($(VERBOSE), 0)
.SILENT:
endif
# MPI=1 selects OpenMPI, MPICH=1 selects MPICH; both define MPI_SUPPORT.
ifeq ($(MPI), 1)
HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi
else ifeq ($(MPICH), 1)
HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich -I/usr/include/x86_64-linux-gnu/mpich
HIPLDFLAGS += -L/usr/lib -lmpich
endif
LIBRARIES += rccl
HIPLDFLAGS += $(LIBRARIES:%=-l%)
all: $(DST_DIR)/verifiable.o $(DST_DIR)/self_test
clean:
	rm -rf $(DST_DIR)
# verifiable.mk supplies the $(DST_DIR)/verifiable.o rule.
TEST_VERIFIABLE_SRCDIR := .
TEST_VERIFIABLE_BUILDDIR := $(DST_DIR)
include verifiable.mk
# Convenience alias for the standalone self-test binary.
self_test: $(DST_DIR)/self_test
$(DST_DIR)/self_test: verifiable.cu verifiable.h
	@printf "Linking %s\n" $@
	@mkdir -p $(DST_DIR)
	$(HIPCC) -o $@ $(HIPCUFLAGS) -DSELF_TEST=1 verifiable.cu $(HIPLDFLAGS)
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
/* Generate parameters for our error bound model of floating point average
* (sum of scaled values) by sampling sums of random sequences for each
* floating point type.
*
* The model has parameters "coef" and "power", where for two floats a & b,
* they are close enough if and only if:
* abs(intBits(a) - intBits(b)) <= 1 + coef*pow(rank_n, power);
*
* Where intBits(x) is the reinterpretation of the float bitpattern as an integer.
*
* Compile with:
* nvcc -gencode=arch=compute_80,code=sm_80
*/
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdint>
#include <hip/hip_bfloat16.h>
#include <hip/hip_fp16.h>
using std::uint64_t;
using std::uint32_t;
// hip_bfloat16 stands in for CUDA's bfloat16 type on ROCm.
using bfloat16 = hip_bfloat16;
// Per-type constants and device arithmetic helpers; specialized below for
// float, double, __half and bfloat16.
template<typename T>
struct float_traits;
// float: 23-bit mantissa, 8-bit exponent; bit pattern fits in uint32_t.
template<>
struct float_traits<float> {
  static constexpr int mantissa_bits = 23;
  static constexpr int exponent_bits = 8;
  using uint_t = uint32_t;
  __device__ static float make(double x) { return (float)x; }
  __device__ static float make(uint64_t x) { return (float)x; }
  __device__ static double todouble(float x) { return x; }
  __device__ static float add(float a, float b) { return a+b; }
  __device__ static float mul(float a, float b) { return a*b; }
};
// double: 52-bit mantissa, 11-bit exponent; also serves as the exact
// reference type in the kernel below.
template<>
struct float_traits<double> {
  static constexpr int mantissa_bits = 52;
  static constexpr int exponent_bits = 11;
  using uint_t = uint64_t;
  __device__ static double make(double x) { return x; }
  __device__ static double make(uint64_t x) { return (double)x; }
  __device__ static double todouble(double x) { return x; }
  __device__ static double add(double a, double b) { return a+b; }
  __device__ static double mul(double a, double b) { return a*b; }
};
// half: 10-bit mantissa, 5-bit exponent; arithmetic goes through the device
// half-precision intrinsics.
template<>
struct float_traits<__half> {
  static constexpr int mantissa_bits = 10;
  static constexpr int exponent_bits = 5;
  using uint_t = uint16_t;
  __device__ static __half make(double x) { return __float2half((float)x); }
  // NOTE(review): __int2half_rn takes an int, so x is narrowed from uint64_t
  // here -- confirm callers only pass small values.
  __device__ static __half make(uint64_t x) { return __int2half_rn(x); }
  __device__ static double todouble(__half x) { return __half2float(x); }
  __device__ static __half add(__half a, __half b) { return __hadd(a, b); }
  __device__ static __half mul(__half a, __half b) { return __hmul(a, b); }
};
// bfloat16: 7-bit mantissa, float-sized 8-bit exponent.
// NOTE(review): add/mul cast the operands to float and pass them to
// __hadd/__hmul (half intrinsics) before truncating back to bfloat16 --
// verify this rounding path is the intended one.
template<>
struct float_traits<bfloat16> {
  static constexpr int mantissa_bits = 7;
  static constexpr int exponent_bits = 8;
  using uint_t = uint16_t;
  __device__ static bfloat16 make(double x) { return bfloat16(x); }
  __device__ static bfloat16 make(uint64_t x) { return bfloat16(x); }
  __device__ static double todouble(bfloat16 x) { return double(x); }
  __device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return bfloat16(__hadd((float)a, (float)b)); }
  __device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return bfloat16(__hmul((float)a, (float)b)); }
};
// Distance between a and b measured in integer bit-pattern steps (ULP-style):
// each float is reinterpreted as an unsigned integer via union type punning
// and the absolute difference is returned. NOTE(review): union punning is
// technically UB in ISO C++ (device-code idiom here), and the result can
// overflow int for widely separated values -- assumed small in practice.
template<typename F>
__device__ int compare(F a, F b) {
  union { typename float_traits<F>::uint_t ua; F fa; };
  union { typename float_traits<F>::uint_t ub; F fb; };
  ua=0; ub=0;  // zero-fill so narrower F leaves high bits clear
  fa=a; fb=b;
  //std::printf("bits(%1.10f)=%x bits(%1.10f)=%x\n", fa, ua, fb, ub);
  return ua < ub ? ub-ua : ua-ub;
}
// xoshiro256** pseudo-random generator (Blackman & Vigna), one instance per
// thread, seeded from the thread index.
struct xoshiro256ss {
  uint64_t s[4];  // 256-bit generator state
  // Derive a distinct state from `seed` by perturbing fixed constants.
  __device__ xoshiro256ss(int seed) {
    constexpr uint64_t src[4] = {0xbb99e851d1f545cc, 0xbfc4022389ca40cb, 0xe84aff5cb1914af5, 0x845999858284de77};
    for(int i=0; i < 4; i++)
      s[i] = src[i] + (seed + i)*0xb45de8a52fdb65d3;
  }
  // Advance the state and return the next 64-bit random value.
  __device__ uint64_t operator()() {
    auto rol64 = [](uint64_t x, int k) {
      return (x << k) | (x >> (64 - k));
    };
    uint64_t const result = rol64(s[1] * 5, 7) * 9;  // the "**" scrambler
    uint64_t const t = s[1] << 17;
    s[2] ^= s[0];
    s[3] ^= s[1];
    s[1] ^= s[2];
    s[0] ^= s[3];
    s[2] ^= t;
    s[3] = rol64(s[3], 45);
    return result;
  }
};
// Warp-wide max reduction via shuffle-down, a stand-in for CUDA's
// __reduce_max_sync intrinsic.
// NOTE(review): the first iteration uses delta == warpSize; __shfl_down with
// an out-of-range source lane is target-defined -- confirm it returns the
// caller's own value on this platform, otherwise start at warpSize/2.
static __device__ int __reduce_max_sync(unsigned int mask, int value)
{
  //We ignore mask, since all bits are set when calling them in the
  //test code below.
  int width = warpSize;
  for (unsigned int i = warpSize; i; i >>= 1) {
    value = max(__shfl_down(value, i, width), value);
  }
  return value;
}
// Sample sums of random sequences in type F and fit the error model from the
// file header: err <= 1 + coef*pow(rank_n, power). Pass 0 estimates the
// exponent (averaged over rounds); pass 1 re-runs with that exponent fixed
// to derive `coef`, which thread 0 prints.
template<typename F>
__global__ void kernel() {
  using traits = float_traits<F>;
  constexpr int samps = 4<<10;   // number of independent sample accumulators
  __shared__ F accf[samps];      // running sums in the type under test
  __shared__ double accd[samps]; // exact reference sums in double
  xoshiro256ss rng(threadIdx.x);
  float expo_avg = 1;            // model exponent; refined after pass 0
  for(int pass=0; pass < 2; pass++) {
    // Per-thread scale factor, mimicking the "sum of scaled values" average.
    F scalar = traits::make(1.0/(3.14159 + .5*threadIdx.x));
    int err_max = 0;
    float coef = 0;
    double expo_sum = 0;
    int expo_n = 0;
    // Cap the simulated rank count so narrow types don't saturate.
    int max_ranks = std::is_same<F,float>::value ? 16<<10 : 1<<traits::mantissa_bits;
    for(int round=0; round < 1 + (16<<10)/max_ranks; round++) {
    //for(int round=0; round < 2; round++) {
      // Reset both accumulators before each simulated allreduce.
      for(int i=threadIdx.x; i < samps; i += blockDim.x) {
        accf[i] = (F)0;
        accd[i] = 0;
      }
      __syncthreads();
      // r plays the role of the contributing rank index.
      for(int r=0; r < max_ranks; r++) {
        int err = 0;
        for(int i=threadIdx.x; i < samps; i+=blockDim.x) {
          constexpr uint64_t m = (1ll<<traits::mantissa_bits)-1;
          // float gets random mantissa-sized integers; narrow types add 1.0.
          double d = std::is_same<F,float>::value ? double(rng() & m) : 1.0;
          F f = traits::make(d);
          accf[i] = traits::add(accf[i], traits::mul(scalar, f));
          accd[i] += traits::todouble(f);
          //if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d f=%f\n", r, traits::todouble(accf[i]));
          // ULP distance between the F-typed sum and the rounded exact sum.
          int e = compare(accf[i], traits::mul(scalar, traits::make(accd[i])));
          err = err > e ? err : e;
        }
        err = __reduce_max_sync(-1u, err);
        err_max = err_max > err ? err_max : err;
        if (r >= 2) {
          // err = 1 + coef*pow(r,expo)  =>  solve for coef, keep the max.
          float c = float(err-1)/powf(float(r), expo_avg);
          coef = coef > c ? coef : c;
        }
        if (r >= 2) {
          // Pass-0 exponent estimate from the running max error.
          double expo = log2f(1+err_max)/log2f(r);
          expo_sum += expo;
          expo_n++;
          //if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d err=%d errmax=%d expo=%f sum=%f n=%d\n", r, err, err_max, expo, expo_sum, expo_n);
        }
      }
    }
    if(pass==0)
      expo_avg = expo_sum/expo_n;
    else if(threadIdx.x == 0)
      printf(" coef=%1.10f expo=%1.10f\n", coef, expo_avg);
  }
}
// Launch one 32-thread block of the sampler for each supported type and let
// each kernel print its fitted (coef, expo) pair. double is exercised only
// as the in-kernel reference, so it gets no run of its own.
int main() {
  std::printf("type=float:\n");
  kernel<float><<<1,32>>>();
  hipDeviceSynchronize();
  std::printf("\ntype=half:\n");
  kernel<half><<<1,32>>>();
  hipDeviceSynchronize();
  std::printf("\ntype=bfloat16:\n");
  kernel<bfloat16><<<1,32>>>();
  hipDeviceSynchronize();
  return 0;
}
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment