"vscode:/vscode.git/clone" did not exist on "fdca30739b01a6dfb732f23289701d0e9860d39f"
Commit d8ca0a9e authored by jerrrrry

Initial commit

# ########################################################################
# Copyright 2022 Advanced Micro Devices, Inc.
# ########################################################################
#Adding pthread flag for linking
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
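# check_mpi() probes for an MPI installation and, if found, enables MPI support:
#   mpi_compiler         - MPI C++ compiler wrapper to look for (e.g. mpicxx)
#   mpi_lib_a/mpi_lib_so - static/shared MPI library names
#   mpi_bin_dir          - directory expected to contain the compiler wrapper
#   mpi_base_lib_dir     - base directory searched (with lib, lib64, ... suffixes) for the library
#   mpi_inc_dir          - directory expected to contain mpi.h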
macro(check_mpi mpi_compiler mpi_lib_a mpi_lib_so mpi_bin_dir mpi_base_lib_dir mpi_inc_dir)
find_program(MPI_MPICXX ${mpi_compiler} PATHS ${mpi_bin_dir} NO_DEFAULT_PATH)
if (MPI_MPICXX)
message ("-- ${mpi_compiler} found @ ${MPI_MPICXX}")
find_file(MPI_H mpi.h PATHS ${mpi_inc_dir} NO_DEFAULT_PATH)
message ("-- mpi.h is in ${MPI_H}")
find_file(MPI_LIB NAMES ${mpi_lib_so} ${mpi_lib_a} PATHS ${mpi_base_lib_dir} PATH_SUFFIXES lib lib64 lib/x86_64-linux-gnu NO_DEFAULT_PATH)
message ("-- libmpi is ${MPI_LIB}")
if (NOT MPI_H OR NOT MPI_LIB)
set (MPI_MPICXX "MPI_MPICXX-NOTFOUND")
set (MPI_H "MPI_H-NOTFOUND")
set (MPI_LIB "MPI_LIB-NOTFOUND")
else()
add_definitions(-DMPI_SUPPORT)
include_directories(${mpi_inc_dir})
link_libraries(${MPI_LIB})
endif()
else()
message ("-- ${mpi_compiler} not found")
endif()
endmacro()
cmake_minimum_required(VERSION 3.16.3 FATAL_ERROR)
project(RCCL-tests VERSION 2.12.10 LANGUAGES CXX)
# Get ROCm path from environment if available
if (DEFINED ENV{ROCM_PATH})
set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to ROCm installation")
else()
set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to ROCm installation")
endif()
# Set CMake/CPack variables
list( APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/llvm)
set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "Prefix install path")
set(CPACK_PACKAGING_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "Path to install to when packaged.")
set(CMAKE_CXX_STANDARD 14)
# Get additional packages required
find_package(ROCM 0.7.3 CONFIG REQUIRED PATHS "${ROCM_PATH}")
include(ROCMSetupVersion)
include(ROCMCreatePackage)
include(ROCMInstallTargets)
include(ROCMCheckTargetIds)
include(ROCMClients)
# Build variables
option(NO_MPI "Build RCCL-tests without MPI support.")
option(MPI_PATH "Use MPI in the specified directory.")
# Default GPU architectures to build
#==================================================================================================
set(DEFAULT_GPUS
gfx803
gfx900:xnack-
gfx906:xnack-
gfx908:xnack-
gfx90a:xnack-
gfx90a:xnack+
gfx940
gfx941
gfx942
gfx1030
gfx1100
gfx1101
gfx1102)
set(AMDGPU_TARGETS ${DEFAULT_GPUS} CACHE STRING "Target default GPUs if AMDGPU_TARGETS is not defined.")
## Determine which GPU architectures to build for
if (COMMAND rocm_check_target_ids)
message(STATUS "Checking for ROCm support for GPU targets:")
rocm_check_target_ids(SUPPORTED_GPUS TARGETS "${AMDGPU_TARGETS}")
else()
message(WARNING "Unable to check for supported GPU targets. Falling back to default GPUs")
set(SUPPORTED_GPUS ${DEFAULT_GPUS})
endif()
set(GPU_TARGETS "${SUPPORTED_GPUS}" CACHE STRING "List of specific GPU architectures to build for.")
message(STATUS "Compiling for ${GPU_TARGETS}")
find_package(RCCL HINTS CONFIG REQUIRED PATHS "${ROCM_PATH}")
if (NOT NO_MPI)
# Check for MPI_PATH first, since the user explicitly requested this directory
if (MPI_PATH)
set(mpi_spec_bin_dir "${MPI_PATH}/bin")
set(mpi_spec_inc_dir "${MPI_PATH}/include")
check_mpi(mpicxx libmpi.a libmpi.so ${mpi_spec_bin_dir} ${MPI_PATH} ${mpi_spec_inc_dir})
if (NOT MPI_MPICXX)
# Since the user explicitly requested this directory, abort if something went wrong.
MESSAGE(FATAL_ERROR "Could not find MPI in ${MPI_PATH}")
endif()
endif()
# Check for MPICH Ubuntu installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx.mpich libmpich.a libmpich.so /usr/bin /usr /usr/include/x86_64-linux-gnu/mpich)
endif()
# Check for Open MPI Ubuntu installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx.openmpi libmpi.a libmpi.so /usr/bin /usr/lib/x86_64-linux-gnu/openmpi /usr/lib/x86_64-linux-gnu/openmpi/include)
endif()
# Check for MPICH RHEL installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpich.a libmpich.so /usr/lib64/mpich/bin /usr/lib64/mpich /usr/include/mpich-x86_64)
endif()
# Check for Open MPI RHEL installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/openmpi/bin /usr/lib64/openmpi /usr/include/openmpi-x86_64)
endif()
# Check for MPICH SLES installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpich.a libmpich.so /usr/lib64/mpi/gcc/mpich/bin /usr/lib64/mpi/gcc/mpich /usr/lib64/mpi/gcc/mpich/include)
endif()
# Check for Open MPI v4 SLES installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi4/bin /usr/lib64/mpi/gcc/openmpi4 /usr/lib64/mpi/gcc/openmpi4/include)
endif()
# Check for Open MPI v3 SLES installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi3/bin /usr/lib64/mpi/gcc/openmpi3 /usr/lib64/mpi/gcc/openmpi3/include)
endif()
# Check for Open MPI v2 SLES installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi2/bin /usr/lib64/mpi/gcc/openmpi2 /usr/lib64/mpi/gcc/openmpi2/include)
endif()
if (NOT MPI_MPICXX)
message ("-- no MPI library found")
endif()
else()
message ("-- MPI support explicitely disabled")
endif()
set(ROCM_USE_DEV_COMPONENT OFF) # This repo doesn't have a dev component
# Add all of the tests
add_subdirectory(src)
# Create ROCm standard packages
rocm_create_package(
NAME rccl-tests
DESCRIPTION "Tests for the ROCm Communication Collectives Library"
MAINTAINER "RCCL Maintainer <rccl-maintainer@amd.com>"
)
Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION, nor the names of their
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
#
# See LICENCE.txt for license information
#
BUILDDIR ?= build
override BUILDDIR := $(abspath $(BUILDDIR))
.PHONY: all clean
default: src.build
TARGETS=$(filter-out src/hypercube.cu, $(wildcard src/*))
all: ${TARGETS:%=%.build}
clean: ${TARGETS:%=%.clean}
%.build:
${MAKE} -C $* build BUILDDIR=${BUILDDIR}
%.clean:
${MAKE} -C $* clean BUILDDIR=${BUILDDIR}
Notices and Licenses file
_______________________________________________________________
Dependencies on nvidia-nccl-tests v2.0.0 (BSD3)
Copyright (c) 2016-2017, NVIDIA CORPORATION.
Modifications Copyright (c) 2019 Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
Laboratory, the U.S. Department of Energy, nor the names of their
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The U.S. Department of Energy funded the development of this software
under subcontract 7078610 with Lawrence Berkeley National Laboratory.
nvidia-nccl-tests v2.0.0 (BSD2)
Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
Laboratory, the U.S. Department of Energy, nor the names of their
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The U.S. Department of Energy funded the development of this software
under subcontract 7078610 with Lawrence Berkeley National Laboratory.
# RCCL Tests
These tests check both the performance and the correctness of RCCL operations. They can be compiled against [RCCL](https://github.com/ROCmSoftwarePlatform/rccl).
## Build
To build the tests, just type `make`.
If HIP is not installed in /opt/rocm, you may specify HIP\_HOME. Similarly, if RCCL is not installed in /usr, you may specify NCCL\_HOME and CUSTOM\_RCCL\_LIB.
```shell
$ make HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl CUSTOM_RCCL_LIB=/path/to/rccl/lib/librccl.so
```
RCCL tests rely on MPI to run on multiple processes, and hence on multiple nodes. If you want to compile the tests with MPI support, you need to set MPI=1 and set MPI\_HOME to the path where MPI is installed.
```shell
$ make MPI=1 MPI_HOME=/path/to/mpi HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl
```
RCCL tests can also be built using cmake. A typical sequence will be:
```shell
$ mkdir build
$ cd build
$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/path/to/rccl ..
$ make
```
When using the cmake build procedure, make sure that RCCL has also been built with cmake (i.e. not with the install.sh script), since cmake checks
for the RCCL target and config files that are created during a cmake-based RCCL build.
The cmake method also checks automatically for MPI installations, so it is not necessary to explicitly request an MPI build. A specific MPI library can be
selected with the MPI_PATH variable, and MPI support can be explicitly disabled by adding the -DNO_MPI=1 flag to the cmake command line.
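For example, a configuration that points at a specific MPI installation, or one that disables MPI entirely, could look like this (all paths are placeholders):
```shell
$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/path/to/rccl -DMPI_PATH=/path/to/mpi ..
$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/path/to/rccl -DNO_MPI=1 ..
```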
## Usage
RCCL tests can run on multiple processes, multiple threads, and multiple HIP devices per thread. The number of processes is managed by MPI and is therefore not passed to the tests as an argument. The total number of ranks (i.e. HIP devices) is equal to (number of processes)\*(number of threads)\*(number of GPUs per thread).
### Quick examples
Run on 8 GPUs (`-g 8`), scanning from 8 bytes to 128 MB:
```shell
$ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
```
Run with MPI on 10 processes (potentially on multiple nodes) with 4 GPUs each, for a total of 40 GPUs:
```shell
$ mpirun -np 10 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
```
### Performance
See the [Performance](doc/PERFORMANCE.md) page for an explanation of the reported numbers, in particular the "busbw" column.
### Arguments
All tests support the same set of arguments :
* Number of GPUs
* `-t,--nthreads <num threads>` number of threads per process. Default : 1.
* `-g,--ngpus <GPUs per thread>` number of gpus per thread. Default : 1.
* Sizes to scan
* `-b,--minbytes <min size in bytes>` minimum size to start with. Default : 32M.
* `-e,--maxbytes <max size in bytes>` maximum size to end at. Default : 32M.
* Increments can be either fixed or a multiplication factor. Only one of those should be used
* `-i,--stepbytes <increment size>` fixed increment between sizes. Default : 1M.
* `-f,--stepfactor <increment factor>` multiplication factor between sizes. Default : disabled.
* RCCL operations arguments
* `-o,--op <sum/prod/min/max/avg/all>` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum.
* `-d,--datatype <nccltype/all>` Specify which datatype to use. Default : Float.
* `-r,--root <root/all>` Specify which root to use. Only for operations with a root like broadcast or reduce. Default : 0.
* `-y,--memory_type <coarse/fine/host/managed>` Default: Coarse
* `-s,--stress_cycles <number of cycles>` Default: 1
* `-u,--cumask <d0,d1,d2,d3>` Default: None
* Performance
* `-n,--iters <iteration count>` number of iterations. Default : 20.
* `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default : 5.
* `-m,--agg_iters <aggregation count>` number of operations to aggregate together in each iteration. Default : 1.
* `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1.
* Test operation
* `-p,--parallel_init <0/1>` use threads to initialize RCCL in parallel. Default : 0.
* `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1.
* `-z,--blocking <0/1>` Make RCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
* `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0.
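As an illustration, a hypothetical invocation combining several of these arguments (2 threads with 2 GPUs each, summing floats from 8 bytes to 128 MB, 50 timed iterations) could look like:
```shell
$ ./build/all_reduce_perf -t 2 -g 2 -o sum -d float -b 8 -e 128M -f 2 -n 50
```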
## Unit tests
Unit tests for rccl-tests are implemented with pytest (python3 is also required). Several notes for the unit tests:
1. The LD_LIBRARY_PATH environment variable will need to be set to include /path/to/rccl-install/lib/ in order to run the unit tests.
2. The HSA_FORCE_FINE_GRAIN_PCIE environment variable will need to be set to 1 in order to run the unit tests which use fine-grained memory type.
The unit tests can be invoked within the rccl-tests root, or in the test subfolder. An example call to the unit tests:
```shell
$ LD_LIBRARY_PATH=/path/to/rccl-install/lib/ HSA_FORCE_FINE_GRAIN_PCIE=1 python3 -m pytest
```
## Copyright
RCCL tests are provided under the BSD license.
All source code and accompanying documentation is copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
All modifications are copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
# Performance reported by RCCL tests
RCCL tests report the average operation time in ms, and two bandwidths in GB/s : algorithm bandwidth and bus bandwidth. This page explains what those numbers mean and what you should expect depending on the hardware used.
# Time
Time is useful for small sizes, to measure the constant overhead (or latency) associated with each operation.
For large sizes, the time grows linearly with the size (it is roughly equal to overhead + size / bw), so it no longer measures just the latency but mostly the size divided by the bandwidth.
Therefore, for large sizes, it makes more sense to look at the bandwidth.
# Bandwidth
## Algorithm bandwidth
Algorithm bandwidth uses the most common formula for bandwidth: size (_S_) / time (_t_). It is useful for estimating how long any large operation would take: simply divide the size of the operation by the algorithm bandwidth.
`algbw = S/t`
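For example, if an operation on _S_ = 4 GB of data completes in _t_ = 0.5 s, then `algbw = 4 GB / 0.5 s = 8 GB/s` (numbers chosen purely for illustration).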
## Bus bandwidth
While algorithm bandwidth makes sense for point-to-point operations like Send/Receive, it is not always helpful for measuring the speed of collective operations, since the theoretical peak algorithm bandwidth is not equal to the hardware peak bandwidth and usually depends on the number of ranks.
Most benchmarks only provide time measurements, which are hard to interpret for large sizes. Some also provide algorithm bandwidth, but that bandwidth varies with the number of ranks (and decreases as the number of ranks increases).
To provide a number which reflects how optimally the hardware is used, RCCL tests introduce the notion of "Bus Bandwidth" ("busbw" column in the tests output).
This number is obtained by applying a formula to the algorithm bandwidth to reflect the speed of the inter-GPU communication.
The resulting bus bandwidth can be compared with the hardware peak bandwidth, independently of the number of ranks used.
The formula depends on the collective operation.
### AllReduce
An allreduce operation performs, for each element of the n arrays (input i_X and output o_X, each located on rank X), the following operation:
`o_0 = o_1 = o_2 = ... = o_{n-1} = i_0 + i_1 + i_2 + ... + i_{n-1}`
**Note : this is independent of the algorithm used (ring, tree, or other) as long as they use point-to-point operations (send/receive).**
A ring would do that operation in an order which follows the ring :
`i_0 + i_1 + ... + i_{n-1} -> o_{n-1} -> o_0 -> o_1 -> .. -> o_{n-2}`
A tree would do it hierarchically :
`(((i_{n-1} + i_{n-2}) + (i_{n-3} + i_{n-4})) + ... + (i_1 + i_0)) -> o_0 -> (o_{n/2} -> (o_{3n/4} ...))`
In all cases, we need n-1 additions and n assignments for each element. Since every step is on a different rank except potentially one (the last input and the first output),
we need 2(n-1) data transfers (times the number of elements) to perform an allReduce operation.
Considering that each rank has a bandwidth to the outside world of _B_, the time to perform an allReduce operation of _S_ elements is at best :
`t = (S*2*(n-1)) / (n*B)`
Indeed, we have _S_ elements, 2*(n-1) operations per element, and _n_ links of bandwidth _B_ to perform them.
Reordering the equation, we find that
`t = (S/B) * (2*(n-1)/n)`
Therefore, to get an AllReduce bandwidth measurement which we can compare to the hardware peak bandwidth, we compute :
`B = S/t * (2*(n-1)/n) = algbw * (2*(n-1)/n)`
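For example, with _n_ = 8 ranks the correction factor is `2*(8-1)/8 = 1.75`, so an allReduce measured at `algbw = 10 GB/s` corresponds to `busbw = 17.5 GB/s` (illustrative numbers).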
### ReduceScatter
The ReduceScatter operation requires only to perform the addition part of the allReduce operation :
`o_K = i_0 + i_1 + i_2 + ... + i_{n-1}`
With K being the rank that gets the final result (K = offset/recvsize).
The perfect reduceScatter time with a rank bandwidth of B would therefore be :
`t = S*(n-1) / (B*n)`
And the Bus Bandwidth is therefore computed as :
`B = S/t * (n-1)/n = algbw * (n-1)/n`
Note that here, S is the size in bytes of the total array, which for RCCL is equal to `recvcount*sizeof(datatype)*n` as the `recvcount` argument is the count per rank.
### AllGather
The AllGather operation requires only to perform the assignment part of the allReduce operation :
`o_0 = o_1 = o_2 = ... = o_{n-1} = i_K`
With K being the rank the data originates from (K = offset/sendsize).
The perfect allGather time with a rank bandwidth of B would therefore be :
`t = S*(n-1) / (B*n)`
And the Bus Bandwidth is therefore computed as :
`B = S/t * (n-1)/n = algbw * (n-1)/n`
Note that here, S is the size in bytes of the total array, which for RCCL is equal to `sendcount*sizeof(datatype)*n` as the `sendcount` argument is the count per rank.
### Broadcast
The broadcast operation representation is similar to allGather :
`o_0 = o_1 = o_2 = ... = o_{n-1} = i_R`
R being the root of the operation.
However, in this case, since the i_R input is not evenly distributed across the ranks, we cannot use all n links to perform the transfer operations.
Indeed, *all* data has to leave the root rank, so the bottleneck is the root rank, which only has a bandwidth of B to send data out:
`t = S/B`
And :
`B = S/t`
### Reduce
The reduce operation performs :
`o_R = i_0 + i_1 + i_2 + ... + i_{n-1}`
R being the root of the operation.
Similarly to broadcast, all data needs to be sent to the root, hence:
`t = S/B`
And :
`B = S/t`
### Summary
To obtain a bus bandwidth which should be independent of the number of ranks _n_, we apply a correction factor to the algorithm bandwidth :
* AllReduce : 2*(_n_-1)/_n_
* ReduceScatter : (_n_-1)/_n_
* AllGather : (_n_-1)/_n_
* Broadcast : 1
* Reduce : 1
The bus bandwidth should reflect the speed of the hardware bottleneck : NVLink, PCI, QPI, or network.
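As an illustration, with _n_ = 8 ranks the factors are 1.75 for AllReduce, 0.875 for ReduceScatter and AllGather, and 1 for Broadcast and Reduce; as _n_ grows, the first approaches 2 and the others approach or stay at 1.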
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
# This Dockerfile provides a starting point for a ROCm installation of rccl
# Parameters related to building rccl
ARG base_image
FROM ${base_image}
LABEL maintainer="rccl-maintainer@amd.com"
USER root
ARG user_uid
# Install dependent packages
RUN yum install -y --nogpgcheck \
sudo \
chrpath \
rock-dkms \
rocm-cmake \
centos-release-scl \
devtoolset-7 \
ca-certificates \
git \
cmake3 \
make \
libgomp \
clang \
clang-devel \
gcc-c++ \
pkgconfig \
numactl-libs
RUN echo '#!/bin/bash' | tee /etc/profile.d/devtoolset7.sh && echo \
'source scl_source enable devtoolset-7' >>/etc/profile.d/devtoolset7.sh
# The docker pipeline runs containers with a particular uid;
# create a jenkins user with this specific uid so it can use sudo privileges.
# Grant any member of the video group password-less sudo privileges
RUN useradd --create-home -u ${user_uid} -o -G video --shell /bin/bash jenkins && \
echo '%video ALL=(ALL) NOPASSWD:ALL' | tee /etc/sudoers.d/sudo-nopasswd && \
chmod 400 /etc/sudoers.d/sudo-nopasswd
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
# Parameters related to building rccl
ARG base_image
FROM ${base_image}
LABEL maintainer="rccl-maintainer@amd.com"
ARG user_uid
# Install dependent packages
# Dependencies:
# * hcc-config.cmake: pkg-config
# * tensile: python2.7, python-yaml
# * rocblas-test: gfortran, googletest
# * rocblas-bench: libboost-program-options-dev
# * libhsakmt.so: libnuma1
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
rock-dkms \
sudo \
ca-certificates \
chrpath \
git \
make \
cmake \
pkg-config \
python2.7 \
python-yaml \
python3-pytest \
rocm-cmake \
libboost-program-options-dev \
libnuma1 \
libomp-dev \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# The docker pipeline runs containers with a particular uid;
# create a jenkins user with this specific uid so it can use sudo privileges.
# Grant any member of the video group password-less sudo privileges
RUN useradd --create-home -u ${user_uid} -o -G video --shell /bin/bash jenkins && \
mkdir -p /etc/sudoers.d/ && \
echo '%video ALL=(ALL) NOPASSWD:ALL' | tee /etc/sudoers.d/sudo-nopasswd
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
# Parameters related to building rccl
ARG base_image
FROM ${base_image}
LABEL maintainer="rccl-maintainer@amd.com"
#empty for now
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
# Parameters related to building rccl
ARG base_image
FROM ${base_image}
LABEL maintainer="rccl-maintainer@amd.com"
#empty for now
#!/bin/bash
# Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
# #################################################
# helper functions
# #################################################
function display_help()
{
echo "RCCL-tests build & installation helper script"
echo "./install [-h|--help] "
echo " [-h|--help] Prints this help message."
echo " [-m|--mpi] Build RCCL-tests with MPI support. (see --mpi_home below.)"
echo " [--rccl_home] Specify custom path for RCCL installation (default: /opt/rocm/rccl)"
echo " [--mpi_home] Specify path to your MPI installation."
}
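# Example invocations (paths are illustrative):
#   ./install -h
#   ./install --mpi --mpi_home=/path/to/mpi --rccl_home=/opt/rocm/rccl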
# #################################################
# global variables
# #################################################
run_tests=false
build_release=true
mpi_enabled=false
rccl_dir=/opt/rocm/rccl
mpi_dir=""
# #################################################
# Parameter parsing
# #################################################
# check if we have a modern version of getopt that can handle whitespace and long parameters
getopt -T
if [[ $? -eq 4 ]]; then
GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,mpi,test,rccl_home:,mpi_home: --options hmt -- "$@")
else
echo "Need a new version of getopt"
exit 1
fi
if [[ $? -ne 0 ]]; then
echo "getopt invocation failed; could not parse the command line";
exit 1
fi
eval set -- "${GETOPT_PARSE}"
while true; do
case "${1}" in
-h|--help)
display_help
exit 0
;;
-m|--mpi)
mpi_enabled=true
shift ;;
-t|--test)
run_tests=true
shift ;;
--rccl_home)
rccl_dir=${2}
shift 2 ;;
--mpi_home)
mpi_dir=${2}
shift 2 ;;
--) shift ; break ;;
*) echo "Unexpected command line parameter received; aborting";
exit 1
;;
esac
done
# exit with the returned error code if a command in the install script failed
check_exit_code( )
{
if (( $1 != 0 )); then
exit $1
fi
}
# Install the pre-commit hook
#bash ./githooks/install
build_dir=./build
# #################################################
# prep
# #################################################
# ensure a clean build environment
rm -rf ${build_dir}
if ($mpi_enabled); then
if [[ ${mpi_dir} == "" ]]; then
echo "MPI flag enabled but path to MPI installation not specified. See --mpi_home command line argument."
exit 1
else
make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so MPI=1 MPI_HOME=${mpi_dir} -j$(nproc)
fi
else
make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so -j$(nproc)
fi
check_exit_code "$?"
# Optionally, run tests if they're enabled.
if ($run_tests); then
if ($mpi_enabled); then
cd test; LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${rccl_dir}/lib:${mpi_dir}/lib PATH=$PATH:${mpi_dir}/bin python3 -m pytest
else
cd test; LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${rccl_dir}/lib python3 -m pytest
fi
fi
# ########################################################################
# Copyright 2022 Advanced Micro Devices, Inc.
# ########################################################################
# Compile common object library
set_property(SOURCE common.cu timer.cc ../verifiable/verifiable.cu PROPERTY LANGUAGE CXX)
add_library(rccl_common OBJECT common.cu timer.cc ../verifiable/verifiable.cu)
target_link_libraries(rccl_common roc::rccl hip::device)
if(USE_MPI)
target_link_libraries(rccl_common MPI::MPI_CXX)
endif()
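# add_relative_test registers a CTest entry for the given target using a path relative
# to the current binary directory, resolving the target's runtime output directory and
# output name if they were customized.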
function(add_relative_test test_name test_target)
get_target_property(EXE_PATH ${test_target} RUNTIME_OUTPUT_DIRECTORY)
if(EXE_PATH STREQUAL "EXE_PATH-NOTFOUND")
set(EXE_PATH ".")
endif()
get_filename_component(EXE_PATH "${EXE_PATH}" ABSOLUTE BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
get_target_property(EXE_NAME ${test_target} RUNTIME_OUTPUT_NAME)
if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
get_target_property(EXE_NAME ${test_target} OUTPUT_NAME)
if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
set(EXE_NAME "${test_target}")
endif()
endif()
file(RELATIVE_PATH rel_path "${CMAKE_CURRENT_BINARY_DIR}" "${EXE_PATH}/${EXE_NAME}")
add_test(NAME "${test_name}" COMMAND "./${rel_path}")
endfunction()
function(add_rccl_test TEST)
set(TEST_SOURCE "${TEST}.cu")
set_property(SOURCE ${TEST_SOURCE} PROPERTY LANGUAGE CXX)
set(TEST_TARGET "${TEST}_perf")
add_executable(${TEST_TARGET} ${TEST_SOURCE})
target_link_libraries(
${TEST_TARGET}
PRIVATE
rccl_common
)
set_target_properties(
${TEST_TARGET}
PROPERTIES
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
# LINKER_LANGUAGE CXX
)
add_relative_test(${TEST} ${TEST_TARGET})
rocm_install(TARGETS ${TEST_TARGET})
# TODO: copy/install DLLs on Windows
set_target_properties(
${TEST_TARGET} PROPERTIES
INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib;${ROCM_PATH}/lib"
)
endfunction()
add_rccl_test(all_gather)
add_rccl_test(all_reduce)
add_rccl_test(alltoall)
add_rccl_test(alltoallv)
add_rccl_test(broadcast)
add_rccl_test(gather)
add_rccl_test(hypercube)
add_rccl_test(reduce_scatter)
add_rccl_test(reduce)
add_rccl_test(scatter)
add_rccl_test(sendrecv)
#
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE.txt for license information
#
ROCM_PATH ?= /opt/rocm
MPI_HOME ?= /usr/lib/openmpi
PREFIX ?= /usr/local
VERBOSE ?= 0
DEBUG ?= 0
NCCL_HOME ?= ""
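# Example invocations (paths are illustrative):
#   make NCCL_HOME=/opt/rocm/rccl CUSTOM_RCCL_LIB=/opt/rocm/rccl/lib/librccl.so
#   make MPI=1 MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi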
HIPCC = $(ROCM_PATH)/bin/hipcc
CXX = $(HIPCC)
HIPCUFLAGS := -std=c++14
LDFLAGS :=
HIPLDFLAGS :=
ifneq ($(NCCL_HOME), "")
HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) -L$(NCCL_HOME)/lib
endif
HIPCUFLAGS += -I$(ROCM_PATH)/include
HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
LDFLAGS += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt -pthread
ifeq ($(DEBUG), 0)
HIPCUFLAGS += -O3
else
HIPCUFLAGS += -O0 -g -ggdb3
endif
ifeq ($(VERBOSE), 0)
.SILENT:
endif
.PHONY: build clean
BUILDDIR ?= ../build
ifeq ($(MPI), 1)
HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi
else ifeq ($(MPICH), 1)
HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich -I/usr/include/x86_64-linux-gnu/mpich
HIPLDFLAGS += -L/usr/lib -lmpich
endif
LIBRARIES += rccl
HIPLDFLAGS += $(LIBRARIES:%=-l%)
DST_DIR := $(BUILDDIR)
SRC_FILES := $(wildcard *.cu)
OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv alltoallv
BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
build: ${BIN_FILES}
clean:
rm -rf ${DST_DIR}
TEST_VERIFIABLE_SRCDIR := ../verifiable
TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable
include ../verifiable/verifiable.mk
${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS)
@printf "Compiling %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"
$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<
${DST_DIR}/timer.o: timer.cc timer.h
@printf "Compiling %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
$(CXX) $(CXXFLAGS) -o $@ -c timer.cc
${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS)
@printf "Linking %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
echo "$(HIPCC) -o $@ $(HIPCUFLAGS) $^ ${HIPLDFLAGS}"
$(HIPCC) -o $@ $(HIPCUFLAGS) $^ ${HIPLDFLAGS}
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
#define ALIGN 4
void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
size_t base = (count/(ALIGN*nranks))*ALIGN;
*sendcount = base;
*recvcount = base*nranks;
*sendInplaceOffset = base;
*recvInplaceOffset = 0;
*paramcount = base;
}
testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
int k=0;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
for (int j=0; j<nranks; j++) {
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
}
k++;
}
HIPCHECK(hipDeviceSynchronize());
}
return testSuccess;
}
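// Bus bandwidth for AllGather: algbw counts the total gathered data (count*typesize*nranks bytes);
// busbw applies the (n-1)/n correction factor described in doc/PERFORMANCE.md.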
void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec;
*algBw = baseBw;
double factor = ((double)(nranks - 1))/((double)nranks);
*busBw = baseBw * factor;
}
testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream));
return testSuccess;
}
struct testColl allGatherTest = {
"AllGather",
AllGatherGetCollByteCount,
AllGatherInitData,
AllGatherGetBw,
AllGatherRunColl
};
void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
AllGatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
args->collTest = &allGatherTest;
ncclDataType_t *run_types;
const char **run_typenames;
int type_count;
if ((int)type != -1) {
type_count = 1;
run_types = &type;
run_typenames = &typeName;
} else {
type_count = test_typenum;
run_types = test_types;
run_typenames = test_typenames;
}
for (int i=0; i<type_count; i++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", -1));
}
return testSuccess;
}
struct testEngine ncclTestEngine = {
AllGatherGetBuffSize,
AllGatherRunTest
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count;
*recvcount = count;
*sendInplaceOffset = 0;
*recvInplaceOffset = 0;
*paramcount = *sendcount;
}
testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
int k = 0;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
TESTCHECK(InitDataReduce(args->expected[k], recvcount, 0, type, op, rep, nranks));
k++;
}
HIPCHECK(hipDeviceSynchronize());
}
return testSuccess;
}
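// Bus bandwidth for AllReduce: algbw is S/t; busbw applies the 2*(n-1)/n correction factor
// described in doc/PERFORMANCE.md.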
void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * typesize) / 1.0E9 / sec;
*algBw = baseBw;
double factor = ((double)(2*(nranks - 1)))/((double)nranks);
*busBw = baseBw * factor;
}
testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream));
return testSuccess;
}
struct testColl allReduceTest = {
"AllReduce",
AllReduceGetCollByteCount,
AllReduceInitData,
AllReduceGetBw,
AllReduceRunColl
};
void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
args->collTest = &allReduceTest;
ncclDataType_t *run_types;
ncclRedOp_t *run_ops;
const char **run_typenames, **run_opnames;
int type_count, op_count;
if ((int)type != -1) {
type_count = 1;
run_types = &type;
run_typenames = &typeName;
} else {
type_count = test_typenum;
run_types = test_types;
run_typenames = test_typenames;
}
if ((int)op != -1) {
op_count = 1;
run_ops = &op;
run_opnames = &opName;
} else {
op_count = test_opnum;
run_ops = test_ops;
run_opnames = test_opnames;
}
for (int i=0; i<type_count; i++) {
for (int j=0; j<op_count; j++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
}
}
return testSuccess;
}
struct testEngine ncclTestEngine = {
AllReduceGetBuffSize,
AllReduceRunTest
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = (count/nranks)*nranks;
*recvcount = (count/nranks)*nranks;
*sendInplaceOffset = 0;
*recvInplaceOffset = 0;
*paramcount = count/nranks;
}
testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
int k=0;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
for (int j=0; j<nranks; j++) {
size_t partcount = sendcount/nranks;
TESTCHECK(InitData(((char*)args->expected[k])+ j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0));
}
k++;
}
HIPCHECK(hipDeviceSynchronize());
}
// We don't support in-place alltoall
args->reportErrors = in_place ? 0 : 1;
return testSuccess;
}
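// Bus bandwidth for AlltoAll: algbw counts the full exchanged matrix (count*nranks*typesize bytes);
// the (n-1)/n factor accounts for each rank keeping 1/n of its data locally.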
void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec;
*algBw = baseBw;
double factor = ((double)(nranks-1))/((double)(nranks));
*busBw = baseBw * factor;
}
testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
NCCLCHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream));
return testSuccess;
}
struct testColl alltoAllTest = {
"AlltoAll",
AlltoAllGetCollByteCount,
AlltoAllInitData,
AlltoAllGetBw,
AlltoAllRunColl
};
void AlltoAllGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
AlltoAllGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
args->collTest = &alltoAllTest;
ncclDataType_t *run_types;
const char **run_typenames;
int type_count;
if ((int)type != -1) {
type_count = 1;
run_types = &type;
run_typenames = &typeName;
} else {
type_count = test_typenum;
run_types = test_types;
run_typenames = test_typenames;
}
for (int i=0; i<type_count; i++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", -1));
}
return testSuccess;
}
struct testEngine ncclTestEngine = {
AlltoAllGetBuffSize,
AlltoAllRunTest
};
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
#define USE_RCCL_GATHER_SCATTER
void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
if (count < nranks*nranks/2) {
*sendcount = 0;
*recvcount = 0;
*sendInplaceOffset = 0;
*recvInplaceOffset = 0;
*paramcount = 0;
} else {
*sendcount = (count/nranks)*nranks;
*recvcount = (count/nranks)*nranks;
*sendInplaceOffset = 0;
*recvInplaceOffset = 0;
*paramcount = count/nranks;
}
}
testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
int k=0;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep+rank, 1, 0));
#if 0
int *dataHost = (int *)malloc(args->sendBytes);
hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost);
printf(" Rank [%d] Original: ", rank);
for(int j=0; j<sendcount; j++) {
printf("%d:%d ", j, dataHost[j]);
}
printf("\n");
free(dataHost);
#endif
size_t rdisp = 0;
size_t data_count = sendcount*2/nranks;
size_t chunksize = data_count/nranks;
for (int j=0; j<nranks; j++) {
size_t scount = 0, rcount = ((j+rank)%nranks)*chunksize;
if ((j+rank)%nranks == 0)
rcount += (sendcount-chunksize*(nranks-1)*nranks/2);
size_t sdisp = 0;
for (int kk=0; kk<nranks; kk++) {
scount = ((kk+j)%nranks)*chunksize;
if ((kk+j)%nranks == 0)
scount += (sendcount-chunksize*(nranks-1)*nranks/2);
if (kk == rank)
break;
sdisp += scount;
}
TESTCHECK(InitData(((char*)args->expected[k])+rdisp*wordSize(type), rcount, sdisp, type, ncclSum, 33*rep+j, 1, 0));
rdisp += rcount;
}
k++;
}
HIPCHECK(hipDeviceSynchronize());
}
// We don't support in-place alltoall
args->reportErrors = in_place ? 0 : 1;
return testSuccess;
}
void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec;
*algBw = baseBw;
double factor = ((double)(nranks-1))/((double)(nranks));
*busBw = baseBw * factor;
}
testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
int nranks;
NCCLCHECK(ncclCommCount(comm, &nranks));
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
if (count == 0) return testSuccess;
size_t *sendcounts, *recvcounts, *sdispls, *rdispls;
sendcounts = (size_t *)malloc(nranks*nranks*sizeof(size_t));
recvcounts = (size_t *)malloc(nranks*nranks*sizeof(size_t));
sdispls = (size_t *)malloc(nranks*nranks*sizeof(size_t));
rdispls = (size_t *)malloc(nranks*nranks*sizeof(size_t));
if (sendcounts == nullptr || recvcounts == nullptr || sdispls == nullptr || rdispls == nullptr) {
printf("failed to allocate buffers for alltoallv\n");
return testNcclError;
}
size_t disp = 0;
size_t chunksize = count*2/nranks;
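// Build a deterministic variable-size pattern: this rank exchanges ((i+rank)%nranks)*chunksize
// elements with rank i, with the remainder folded into the chunk where (i+rank)%nranks == 0
// so that the counts sum to count*nranks.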
for (int i = 0; i < nranks; i++) {
size_t scount = ((i+rank)%nranks)*chunksize;
if ((i+rank)%nranks == 0)
scount += (count*nranks-chunksize*(nranks-1)*nranks/2);
sendcounts[i+rank*nranks] = recvcounts[i+rank*nranks] = scount;
sdispls[i+rank*nranks] = rdispls[i+rank*nranks] = disp;
disp += scount;
//printf("%d->%d: sendcounts/recvcounts %lx sdispls/rdispls %lx\n", rank, i, sendcounts[i+rank*nranks]*wordSize(type), sdispls[i+rank*nranks]*wordSize(type));
}
#if NCCL_VERSION_CODE < NCCL_VERSION(2,7,0)
printf("NCCL 2.7 or later is needed for alltoallv. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR);
return testNcclError;
#else
#if defined(RCCL_ALLTOALLV) && defined(USE_RCCL_GATHER_SCATTER)
NCCLCHECK(ncclAllToAllv(sendbuff, sendcounts+rank*nranks, sdispls+rank*nranks, recvbuff, recvcounts+rank*nranks, rdispls+rank*nranks, type, comm, stream));
#else
NCCLCHECK(ncclGroupStart());
for (int r=0; r<nranks; r++) {
if (sendcounts[r+rank*nranks] != 0) {
NCCLCHECK(ncclSend(
((char*)sendbuff) + sdispls[r+rank*nranks] * wordSize(type),
sendcounts[r+rank*nranks],
type,
r,
comm,
stream));
}
if (recvcounts[r+rank*nranks] != 0) {
NCCLCHECK(ncclRecv(
((char*)recvbuff) + rdispls[r+rank*nranks] * wordSize(type),
recvcounts[r+rank*nranks],
type,
r,
comm,
stream));
}
}
NCCLCHECK(ncclGroupEnd());
#endif
#endif
free(sendcounts);
free(recvcounts);
free(sdispls);
free(rdispls);
return testSuccess;
}
struct testColl alltoAllTest = {
"AlltoAllv",
AlltoAllvGetCollByteCount,
AlltoAllvInitData,
AlltoAllvGetBw,
AlltoAllvRunColl
};
void AlltoAllvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
AlltoAllvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
testResult_t AlltoAllvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
args->collTest = &alltoAllTest;
ncclDataType_t *run_types;
const char **run_typenames;
int type_count;
if ((int)type != -1) {
type_count = 1;
run_types = &type;
run_typenames = &typeName;
} else {
type_count = ncclNumTypes;
run_types = test_types;
run_typenames = test_typenames;
}
for (int i=0; i<type_count; i++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
}
return testSuccess;
}
struct testEngine ncclTestEngine = {
AlltoAllvGetBuffSize,
AlltoAllvRunTest
};
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count;
*recvcount = count;
*sendInplaceOffset = 0;
*recvInplaceOffset = 0;
*paramcount = *sendcount;
}
testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
int k=0;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
TESTCHECK(InitData(args->expected[k], recvcount, 0, type, ncclSum, rep, 1, 0));
k++;
}
HIPCHECK(hipDeviceSynchronize());
}
return testSuccess;
}
void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * typesize) / 1.0E9 / sec;
*algBw = baseBw;
double factor = 1;
*busBw = baseBw * factor;
}
testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
NCCLCHECK(ncclBroadcast(sendbuff, recvbuff, count, type, root, comm, stream));
#else
if (rank == root) {
NCCLCHECK(ncclBcast(sendbuff, count, type, root, comm, stream));
} else {
NCCLCHECK(ncclBcast(recvbuff, count, type, root, comm, stream));
}
#endif
return testSuccess;
}
struct testColl broadcastTest = {
"Broadcast",
BroadcastGetCollByteCount,
BroadcastInitData,
BroadcastGetBw,
BroadcastRunColl
};
void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
BroadcastGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
args->collTest = &broadcastTest;
ncclDataType_t *run_types;
const char **run_typenames;
int type_count;
int begin_root, end_root;
if ((int)type != -1) {
type_count = 1;
run_types = &type;
run_typenames = &typeName;
} else {
type_count = test_typenum;
run_types = test_types;
run_typenames = test_typenames;
}
if (root != -1) {
begin_root = end_root = root;
} else {
begin_root = 0;
end_root = args->nProcs*args->nThreads*args->nGpus-1;
}
for (int i=0; i<type_count; i++) {
for (int j=begin_root; j<=end_root; j++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
}
}
return testSuccess;
}
struct testEngine ncclTestEngine = {
BroadcastGetBuffSize,
BroadcastRunTest
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef __COMMON_H__
#define __COMMON_H__
#include "rccl/rccl.h"
#include <stdio.h>
#include <cstdint>
#include <algorithm>
#ifdef MPI_SUPPORT
#include "mpi.h"
#endif
#include <pthread.h>
#include "nccl1_compat.h"
#include "timer.h"
// For nccl.h < 2.13 since we define a weak fallback
extern "C" char const* ncclGetLastError(ncclComm_t comm);
#define HIPCHECK(cmd) do { \
hipError_t e = cmd; \
if( e != hipSuccess ) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
printf("%s: Test HIP failure %s:%d '%s'\n", \
hostname, \
__FILE__,__LINE__,hipGetErrorString(e)); \
return testCudaError; \
} \
} while(0)
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,13,0)
#define NCCLCHECK(cmd) do { \
ncclResult_t res = cmd; \
if (res != ncclSuccess) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
printf("%s: Test NCCL failure %s:%d " \
"'%s / %s'\n", \
hostname,__FILE__,__LINE__, \
ncclGetErrorString(res), \
ncclGetLastError(NULL)); \
return testNcclError; \
} \
} while(0)
#else
#define NCCLCHECK(cmd) do { \
ncclResult_t res = cmd; \
if (res != ncclSuccess) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
printf("%s: Test NCCL failure %s:%d '%s'\n", \
hostname, \
__FILE__,__LINE__,ncclGetErrorString(res)); \
return testNcclError; \
} \
} while(0)
#endif
typedef enum {
testSuccess = 0,
testInternalError = 1,
testCudaError = 2,
testNcclError = 3,
testTimeout = 4,
testNumResults = 5
} testResult_t;
// Relay errors up and trace
#define TESTCHECK(cmd) do { \
testResult_t r = cmd; \
if (r!= testSuccess) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
printf(" .. %s pid %d: Test failure %s:%d\n", \
hostname, getpid(), \
__FILE__,__LINE__); \
return r; \
} \
} while(0)
struct testColl {
const char name[20];
void (*getCollByteCount)(
size_t *sendcount, size_t *recvcount, size_t *paramcount,
size_t *sendInplaceOffset, size_t *recvInplaceOffset,
size_t count, int nranks);
testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type,
ncclRedOp_t op, int root, int rep, int in_place);
void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
};
extern struct testColl allReduceTest;
extern struct testColl allGatherTest;
extern struct testColl reduceScatterTest;
extern struct testColl broadcastTest;
extern struct testColl reduceTest;
extern struct testColl alltoAllTest;
struct testEngine {
void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks);
testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type,
const char* typeName, ncclRedOp_t op, const char* opName);
};
extern struct testEngine ncclTestEngine;
struct threadArgs {
size_t nbytes;
size_t minbytes;
size_t maxbytes;
size_t stepbytes;
size_t stepfactor;
int totalProcs;
int nProcs;
int proc;
int nThreads;
int thread;
int nGpus;
int* gpus;
int localRank;
int localNumDevices;
int enable_multiranks;
int enable_out_of_place;
int nRanks;
void** sendbuffs;
size_t sendBytes;
size_t sendInplaceOffset;
void** recvbuffs;
size_t recvInplaceOffset;
ncclUniqueId ncclId;
ncclComm_t* comms;
hipStream_t* streams;
void** expected;
size_t expectedBytes;
int* errors;
double* bw;
int* bw_count;
int reportErrors;
struct testColl* collTest;
};
typedef testResult_t (*threadFunc_t)(struct threadArgs* args);
struct testThread {
pthread_t thread;
threadFunc_t func;
struct threadArgs args;
testResult_t ret;
};
// Provided by common.cu
extern void Barrier(struct threadArgs* args);
extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root);
extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks);
extern testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank);
extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks);
#include <unistd.h>
static void getHostName(char* hostname, int maxlen) {
gethostname(hostname, maxlen);
for (int i=0; i< maxlen; i++) {
if (hostname[i] == '.') {
hostname[i] = '\0';
return;
}
}
}
#include <stdint.h>
static uint64_t getHash(const char* string, size_t n) {
// Based on DJB2a, result = result * 33 ^ char
uint64_t result = 5381;
for (size_t c = 0; c < n; c++) {
result = ((result << 5) + result) ^ string[c];
}
return result;
}
/* Generate a hash of the unique identifying string for this host
* that will be unique for both bare-metal and container instances
* Equivalent of a hash of;
*
* $(hostname)$(cat /proc/sys/kernel/random/boot_id)
*
*/
#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
static uint64_t getHostHash(const char* hostname) {
char hostHash[1024];
// The fallback is the hostname if reading the boot id fails
(void) strncpy(hostHash, hostname, sizeof(hostHash));
int offset = strlen(hostHash);
FILE *file = fopen(HOSTID_FILE, "r");
if (file != NULL) {
char *p;
if (fscanf(file, "%ms", &p) == 1) {
strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
free(p);
}
// Close only if the file was successfully opened (avoids fclose(NULL))
fclose(file);
}
// Make sure the string is terminated
hostHash[sizeof(hostHash)-1]='\0';
return getHash(hostHash, strlen(hostHash));
}
static size_t wordSize(ncclDataType_t type) {
switch(type) {
case ncclChar:
#if NCCL_MAJOR >= 2
//case ncclInt8:
case ncclUint8:
#endif
return 1;
case ncclHalf:
#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
case ncclBfloat16:
#endif
//case ncclFloat16:
return 2;
case ncclInt:
case ncclFloat:
#if NCCL_MAJOR >= 2
//case ncclInt32:
case ncclUint32:
//case ncclFloat32:
#endif
return 4;
case ncclInt64:
case ncclUint64:
case ncclDouble:
//case ncclFloat64:
return 8;
default: return 0;
}
}
extern int test_ncclVersion; // init'd with ncclGetVersion()
typedef enum { ncclCoarse = 0,
ncclFine = 1,
ncclHost = 2,
ncclManaged = 3,
nccl_NUM_MTYPES = 4 } ncclMemoryType_t;
extern const char *test_memorytypes[nccl_NUM_MTYPES];
constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 1 : 0);
extern int test_opnum;
extern int test_typenum;
extern ncclDataType_t test_types[ncclNumTypes];
extern const char *test_typenames[ncclNumTypes];
extern ncclRedOp_t test_ops[];
extern const char *test_opnames[];
static int ncclstringtotype(char *str) {
for (int t=0; t<ncclNumTypes; t++) {
if (strcmp(str, test_typenames[t]) == 0) {
return t;
}
}
if (strcmp(str, "all") == 0) {
return -1;
}
printf("invalid type %s, defaulting to %s .. \n", str, test_typenames[ncclFloat]);
return ncclFloat;
}
static int ncclstringtoop (char *str) {
for (int o=0; o<test_opnum; o++) {
if (strcmp(str, test_opnames[o]) == 0) {
return o;
}
}
if (strcmp(str, "all") == 0) {
return -1;
}
printf("invalid op %s, defaulting to %s .. \n", str, test_opnames[ncclSum]);
return ncclSum;
}
static int ncclstringtomtype (char *str) {
for (int o=0; o<nccl_NUM_MTYPES; o++) {
if (strcmp(str, test_memorytypes[o]) == 0) {
return o;
}
}
printf("invalid memorytype %s, defaulting to %s .. \n", str, test_memorytypes[ncclCoarse]);
return ncclCoarse;
}
extern int is_main_proc;
extern thread_local int is_main_thread;
#define PRINT if (is_main_thread) printf
#endif