Commit c0dad530 authored by wangkaixiong

init

# ########################################################################
# Copyright 2022 Advanced Micro Devices, Inc.
# ########################################################################
# Add the pthread flag for linking
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
macro(check_mpi mpi_compiler mpi_lib_a mpi_lib_so mpi_bin_dir mpi_base_lib_dir mpi_inc_dir)
find_program(MPI_MPICXX ${mpi_compiler} PATHS ${mpi_bin_dir} NO_DEFAULT_PATH)
if (MPI_MPICXX)
message ("-- ${mpi_compiler} found @ ${MPI_MPICXX}")
find_file(MPI_H mpi.h PATHS ${mpi_inc_dir} NO_DEFAULT_PATH)
message ("-- mpi.h is in ${MPI_H}")
find_file(MPI_LIB NAMES ${mpi_lib_so} ${mpi_lib_a} PATHS ${mpi_base_lib_dir} PATH_SUFFIXES lib lib64 lib/x86_64-linux-gnu NO_DEFAULT_PATH)
message ("-- libmpi is ${MPI_LIB}")
if (NOT MPI_H OR NOT MPI_LIB)
set (MPI_MPICXX "MPI_MPICXX-NOTFOUND")
set (MPI_H "MPI_H-NOTFOUND")
set (MPI_LIB "MPI_LIB-NOTFOUND")
else()
add_definitions(-DMPI_SUPPORT)
include_directories(${mpi_inc_dir})
link_libraries(${MPI_LIB})
endif()
else()
message ("-- ${mpi_compiler} not found")
endif()
endmacro()
cmake_minimum_required(VERSION 3.16.3 FATAL_ERROR)
project(RCCL-tests VERSION 2.12.10 LANGUAGES CXX)
# Get ROCm path from environment if available
if (DEFINED ENV{ROCM_PATH})
set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to ROCm installation")
else()
set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to ROCm installation")
endif()
# Set CMake/CPack variables
list( APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/llvm)
set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "Prefix install path")
set(CPACK_PACKAGING_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "Path to install to when packaged.")
set(CMAKE_CXX_STANDARD 14)
# Get additional packages required
find_package(ROCM 0.7.3 CONFIG REQUIRED PATHS "${ROCM_PATH}")
include(ROCMSetupVersion)
include(ROCMCreatePackage)
include(ROCMInstallTargets)
include(ROCMCheckTargetIds)
include(ROCMClients)
# Build variables
option(NO_MPI "Build RCCL-tests without MPI support.")
option(MPI_PATH "Use MPI in the specified directory.")
# Default GPU architectures to build
#==================================================================================================
set(DEFAULT_GPUS
gfx803
gfx900:xnack-
gfx906:xnack-
gfx908:xnack-
gfx90a:xnack-
gfx90a:xnack+
gfx940
gfx941
gfx942
gfx1030
gfx1100
gfx1101
gfx1102)
set(AMDGPU_TARGETS ${DEFAULT_GPUS} CACHE STRING "Target default GPUs if AMDGPU_TARGETS is not defined.")
## Determine which GPU architectures to build for
if (COMMAND rocm_check_target_ids)
message(STATUS "Checking for ROCm support for GPU targets:")
rocm_check_target_ids(SUPPORTED_GPUS TARGETS "${AMDGPU_TARGETS}")
else()
message(WARNING "Unable to check for supported GPU targets. Falling back to default GPUs")
set(SUPPORTED_GPUS ${DEFAULT_GPUS})
endif()
set(GPU_TARGETS "${SUPPORTED_GPUS}" CACHE STRING "List of specific GPU architectures to build for.")
message(STATUS "Compiling for ${GPU_TARGETS}")
find_package(RCCL CONFIG REQUIRED PATHS "${ROCM_PATH}")
if (NOT NO_MPI)
# Check the user-specified MPI_PATH first, since the user explicitly requested this directory
if (MPI_PATH)
set(mpi_spec_bin_dir "${MPI_PATH}/bin")
set(mpi_spec_inc_dir "${MPI_PATH}/include")
check_mpi(mpicxx libmpi.a libmpi.so ${mpi_spec_bin_dir} ${MPI_PATH} ${mpi_spec_inc_dir})
if (NOT MPI_MPICXX)
# Since the user explicitly requested this directory, abort if something went wrong.
message(FATAL_ERROR "Could not find MPI in ${MPI_PATH}")
endif()
endif()
# Check for MPICH Ubuntu installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx.mpich libmpich.a libmpich.so /usr/bin /usr /usr/include/x86_64-linux-gnu/mpich)
endif()
# Check for Open MPI Ubuntu installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx.openmpi libmpi.a libmpi.so /usr/bin /usr/lib/x86_64-linux-gnu/openmpi /usr/lib/x86_64-linux-gnu/openmpi/include)
endif()
# Check for MPICH RHEL installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpich.a libmpich.so /usr/lib64/mpich/bin /usr/lib64/mpich /usr/include/mpich-x86_64)
endif()
# Check for Open MPI RHEL installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/openmpi/bin /usr/lib64/openmpi /usr/include/openmpi-x86_64)
endif()
# Check for MPICH SLES installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpich.a libmpich.so /usr/lib64/mpi/gcc/mpich/bin /usr/lib64/mpi/gcc/mpich /usr/lib64/mpi/gcc/mpich/include)
endif()
# Check for Open MPI v4 SLES installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi4/bin /usr/lib64/mpi/gcc/openmpi4 /usr/lib64/mpi/gcc/openmpi4/include)
endif()
# Check for Open MPI v3 SLES installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi3/bin /usr/lib64/mpi/gcc/openmpi3 /usr/lib64/mpi/gcc/openmpi3/include)
endif()
# Check for Open MPI v2 SLES installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi2/bin /usr/lib64/mpi/gcc/openmpi2 /usr/lib64/mpi/gcc/openmpi2/include)
endif()
if (NOT MPI_MPICXX)
message ("-- no MPI library found")
endif()
else()
message ("-- MPI support explicitely disabled")
endif()
set(ROCM_USE_DEV_COMPONENT OFF) # This repo doesn't have a dev component
# Add all of the tests
add_subdirectory(src)
# Create ROCm standard packages
rocm_create_package(
NAME rccl-tests
DESCRIPTION "Tests for the ROCm Communication Collectives Library"
MAINTAINER "RCCL Maintainer <rccl-maintainer@amd.com>"
)
Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION, nor the names of their
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
#
# See LICENCE.txt for license information
#
BUILDDIR ?= build
override BUILDDIR := $(abspath $(BUILDDIR))
.PHONY: all clean
default: src.build
TARGETS=$(filter-out src/hypercube.cu, $(wildcard src/*))
all: ${TARGETS:%=%.build}
clean: ${TARGETS:%=%.clean}
%.build:
${MAKE} -C $* build BUILDDIR=${BUILDDIR}
%.clean:
${MAKE} -C $* clean BUILDDIR=${BUILDDIR}
Notices and Licenses file
_______________________________________________________________
Dependencies on nvidia-nccl-tests v2.0.0 (BSD3)
Copyright (c) 2016-2017, NVIDIA CORPORATION.
Modifications Copyright (c) 2019 Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
Laboratory, the U.S. Department of Energy, nor the names of their
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The U.S. Department of Energy funded the development of this software
under subcontract 7078610 with Lawrence Berkeley National Laboratory.
nvidia-nccl-tests v2.0.0 (BSD2)
Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
Laboratory, the U.S. Department of Energy, nor the names of their
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The U.S. Department of Energy funded the development of this software
under subcontract 7078610 with Lawrence Berkeley National Laboratory.
# RCCL Tests
These tests check both the performance and the correctness of RCCL operations. They can be compiled against [RCCL](https://github.com/ROCmSoftwarePlatform/rccl).
## Build
To build the tests, just type `make`.
If HIP is not installed in /opt/rocm, you may specify HIP\_HOME. Similarly, if RCCL is not installed in /usr, you may specify NCCL\_HOME and CUSTOM\_RCCL\_LIB.
```shell
$ make HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl CUSTOM_RCCL_LIB=/path/to/rccl/lib/librccl.so
```
RCCL tests rely on MPI to run on multiple processes and, hence, on multiple nodes. If you want to compile the tests with MPI support, set MPI=1 and point MPI\_HOME to the path where MPI is installed.
```shell
$ make MPI=1 MPI_HOME=/path/to/mpi HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl
```
RCCL tests can also be built using cmake; a typical sequence is:
```shell
$ mkdir build
$ cd build
$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/path/to/rccl ..
$ make
```
When using the cmake build procedure, please make sure that RCCL has also been built using cmake (i.e., not using the install.sh script), since cmake checks for the cmake target and config files that are created during the RCCL build.
The cmake method also has the advantage that the build automatically checks for MPI installations, so it is not necessary to request MPI support explicitly. A particular MPI library can be selected with the MPI_PATH variable, and MPI support can be disabled explicitly by adding the -DNO_MPI=1 flag to the cmake command line.
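For example, a cmake build that selects a specific MPI installation (the paths below are illustrative) could look like:
```shell
$ mkdir build
$ cd build
$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/path/to/rccl -DMPI_PATH=/path/to/mpi ..
$ make
```
Passing `-DNO_MPI=1` instead disables MPI support entirely.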
## Usage
RCCL tests can run on multiple processes, multiple threads, and multiple HIP devices per thread. The number of processes is managed by MPI and is therefore not passed to the tests as an argument. The total number of ranks (= HIP devices) equals (number of processes) \* (number of threads) \* (number of GPUs per thread).
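For example, running under `mpirun -np 2` with `-t 2 -g 4` uses 2 \* 2 \* 4 = 16 ranks.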
### Quick examples
Run on 8 GPUs (`-g 8`), scanning from 8 bytes to 128 MB:
```shell
$ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
```
Run with MPI on 10 processes (potentially on multiple nodes) with 4 GPUs each, for a total of 40 GPUs:
```shell
$ mpirun -np 10 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
```
### Performance
See the [Performance](doc/PERFORMANCE.md) page for an explanation of the reported numbers, in particular the "busbw" column.
### Arguments
All tests support the same set of arguments (a combined example follows this list):
* Number of GPUs
  * `-t,--nthreads <num threads>` number of threads per process. Default: 1.
  * `-g,--ngpus <GPUs per thread>` number of GPUs per thread. Default: 1.
* Sizes to scan
  * `-b,--minbytes <min size in bytes>` minimum size to start with. Default: 32M.
  * `-e,--maxbytes <max size in bytes>` maximum size to end at. Default: 32M.
  * Increments can be either a fixed step or a multiplication factor; only one of the two should be used.
    * `-i,--stepbytes <increment size>` fixed increment between sizes. Default: 1M.
    * `-f,--stepfactor <increment factor>` multiplication factor between sizes. Default: disabled.
* RCCL operation arguments
  * `-o,--op <sum/prod/min/max/avg/all>` reduction operation to perform. Only relevant for reduction operations like AllReduce, Reduce, or ReduceScatter. Default: sum.
  * `-d,--datatype <nccltype/all>` datatype to use. Default: float.
  * `-r,--root <root/all>` root rank to use. Only for operations with a root, like Broadcast or Reduce. Default: 0.
  * `-y,--memory_type <coarse/fine/host/managed>` memory type to use. Default: coarse.
  * `-s,--stress_cycles <number of cycles>` number of stress cycles to run. Default: 1.
  * `-u,--cumask <d0,d1,d2,d3>` CU mask to apply. Default: none.
* Performance
  * `-n,--iters <iteration count>` number of iterations. Default: 20.
  * `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default: 5.
  * `-m,--agg_iters <aggregation count>` number of operations to aggregate together in each iteration. Default: 1.
  * `-a,--average <0/1/2/3>` report performance as an average across all ranks (MPI=1 only). <0=Rank0, 1=Avg, 2=Min, 3=Max>. Default: 1.
* Test operation
  * `-p,--parallel_init <0/1>` use threads to initialize RCCL in parallel. Default: 0.
  * `-c,--check <0/1>` check correctness of results. This can be quite slow with large numbers of GPUs. Default: 1.
  * `-z,--blocking <0/1>` make RCCL collectives blocking, i.e. have CPUs wait and sync after each collective. Default: 0.
  * `-G,--cudagraph <num graph launches>` capture iterations as a CUDA graph and then replay the specified number of times. Default: 0.
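As an illustration, the options above can be combined freely. The following (with arbitrary, illustrative values) runs a half-precision sum allreduce on 8 GPUs, with 10 warmup and 50 timed iterations and correctness checking enabled:
```shell
$ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8 -o sum -d half -n 50 -w 10 -c 1
```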
## Unit tests
Unit tests for rccl-tests are implemented with pytest (python3 is also required). A few notes on the unit tests:
1. The LD_LIBRARY_PATH environment variable will need to be set to include /path/to/rccl-install/lib/ in order to run the unit tests.
2. The HSA_FORCE_FINE_GRAIN_PCIE environment variable will need to be set to 1 in order to run the unit tests which use fine-grained memory type.
The unit tests can be invoked from the rccl-tests root or from the test subfolder. An example invocation:
```shell
$ LD_LIBRARY_PATH=/path/to/rccl-install/lib/ HSA_FORCE_FINE_GRAIN_PCIE=1 python3 -m pytest
```
## Copyright
RCCL tests are provided under the BSD license.
All source code and accompanying documentation is copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
All modifications are copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
#!/bin/bash
rm -rf ./build
make MPI=1 MPI_HOME=/opt/mpi HIP_HOME=/opt/dtk NCCL_HOME=/opt/dtk/rccl/ CUSTOM_RCCL_LIB=/opt/dtk/rccl/lib/librccl.so -j 32 2>&1 | tee build.log
# Performance reported by RCCL tests
RCCL tests report the average operation time in ms and two bandwidths in GB/s: the algorithm bandwidth and the bus bandwidth. This page explains what those numbers mean and what you should expect depending on the hardware used.
# Time
Time is useful at small sizes, where it measures the constant overhead (or latency) associated with each operation.
At large sizes, the time grows linearly with the size (it is roughly equal to overhead + size / bw), so it no longer reflects just the latency but mostly the size divided by the bandwidth.
Therefore, at large sizes it makes more sense to look at the bandwidth.
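For example (illustrative numbers), with 20 us of constant overhead and 100 GB/s of bandwidth, a 4 KB operation takes roughly 20 us and is latency-bound, while a 1 GB operation takes roughly 10 ms and is bandwidth-bound.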
# Bandwidth
## Algorithm bandwidth
Algorithm bandwidth uses the most common formula for bandwidth: size (_S_) divided by time (_t_). It is useful for estimating how long a large operation would take: simply divide its size by the algorithm bandwidth.
`algbw = S/t`
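For example (illustrative numbers), an operation on a 128 MB buffer that completes in 3.2 ms has an algorithm bandwidth of `0.128 GB / 0.0032 s = 40 GB/s`.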
## Bus bandwidth
While the algorithm bandwidth makes sense for point-to-point operations like Send/Receive, it is not always helpful for measuring the speed of collective operations, since the theoretical peak algorithm bandwidth is not equal to the hardware peak bandwidth and usually depends on the number of ranks.
Most benchmarks only provide time measurements, which are hard to interpret at large sizes. Some also provide algorithm bandwidth, but that bandwidth varies with the number of ranks (and decreases as the number of ranks increases).
To provide a number which reflects how optimally the hardware is used, RCCL tests introduce the notion of "Bus Bandwidth" (the "busbw" column in the test output).
This number is obtained by applying a formula to the algorithm bandwidth to reflect the speed of the inter-GPU communication.
Because the bus bandwidth can be compared with the hardware peak bandwidth independently of the number of ranks used, it shows how close to the hardware limit the collective runs.
The formula depends on the collective operation.
### AllReduce
An allreduce operation, for each element of the _n_ arrays (input i_X and output o_X, each residing on rank X), performs the following operation:
`o_0 = o_1 = o_2 = ... = o_{n-1} = i_0 + i_1 + i_2 + ... + i_{n-1}`
**Note: this is independent of the algorithm used (ring, tree, or other) as long as it uses point-to-point operations (send/receive).**
A ring performs that operation in an order which follows the ring:
`i_0 + i_1 + ... + i_{n-1} -> o_{n-1} -> o_0 -> o_1 -> .. -> o_{n-2}`
A tree would do it hierarchically :
`(((i_{n-1} + i_{n-2}) + (i_{n-3} + i_{n-4})) + ... + (i_1 + i_0)) -> o_0 -> (o_{n/2} -> (o_{3n/4} ...))`
In all cases, we need n-1 additions and n assignments for each element. Since every step happens on a different rank, except potentially one (the last input and the first output),
we need 2(n-1) data transfers (times the number of elements) to perform an allReduce operation.
Considering that each rank has a bandwidth _B_ to the outside world, the time to perform an allReduce operation of _S_ elements is at best:
`t = (S*2*(n-1)) / (n*B)`
Indeed, we have _S_ elements, 2*(n-1) operations per element, and _n_ links of bandwidth _B_ to perform them.
Reordering the equation, we find that
`t = (S/B) * (2*(n-1)/n)`
Therefore, to get an AllReduce bandwidth measurement which we can compare to the hardware peak bandwidth, we compute:
`B = S/t * (2*(n-1)/n) = algbw * (2*(n-1)/n)`
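For example, with n = 8 ranks the correction factor is `2*(8-1)/8 = 1.75`, so an algorithm bandwidth of 40 GB/s corresponds to a bus bandwidth of 70 GB/s.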
### ReduceScatter
The ReduceScatter operation requires only the addition part of the allReduce operation:
`o_K = i_0 + i_1 + i_2 + ... + i_{n-1}`
with K being the rank which gets the final result (K = offset/recvsize).
The perfect reduceScatter time with a per-rank bandwidth of B would therefore be:
`t = S*(n-1) / (B*n)`
and the bus bandwidth is therefore computed as:
`B = S/t * (n-1)/n = algbw * (n-1)/n`
Note that here, S is the size in bytes of the total array, which for RCCL equals `recvcount*sizeof(datatype)*n`, since the `recvcount` argument is the count per rank.
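For example, with n = 8 ranks and `recvcount` = 4M float elements, `S = 4M * 4 * 8 = 128 MB`, and busbw = algbw * 7/8.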
### AllGather
The AllGather operation requires only the assignment part of the allReduce operation:
`o_0 = o_1 = o_2 = ... = o_{n-1} = i_K`
with K being the rank where the data originates (K = offset/sendsize).
The perfect allGather time with a per-rank bandwidth of B would therefore be:
`t = S*(n-1) / (B*n)`
and the bus bandwidth is therefore computed as:
`B = S/t * (n-1)/n = algbw * (n-1)/n`
Note that here, S is the size in bytes of the total array, which for RCCL equals `sendcount*sizeof(datatype)*n`, since the `sendcount` argument is the count per rank.
### Broadcast
The broadcast operation is represented similarly to allGather:
`o_0 = o_1 = o_2 = ... = o_{n-1} = i_R`
R being the root of the operation.
However, in this case, since the i_R input is not evenly distributed across the ranks, we cannot use all _n_ links to perform the transfers.
Indeed, *all* of the data has to leave the root rank, so the bottleneck is the root rank, which only has a capacity of B to send data out:
`t = S/B`
And:
`B = S/t`
### Reduce
The reduce operation performs:
`o_R = i_0 + i_1 + i_2 + ... + i_{n-1}`
R being the root of the operation.
Similarly to broadcast, all data needs to be sent to the root, hence:
`t = S/B`
And:
`B = S/t`
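For Broadcast and Reduce, the bus bandwidth therefore equals the algorithm bandwidth.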
### Summary
To obtain a bus bandwidth which should be independent of the number of ranks _n_, we apply a correction factor to the algorithm bandwidth:
* AllReduce: 2*(_n_-1)/_n_
* ReduceScatter: (_n_-1)/_n_
* AllGather: (_n_-1)/_n_
* Broadcast: 1
* Reduce: 1
The bus bandwidth should reflect the speed of the hardware bottleneck: NVLink, PCI, QPI, or network.
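As a rule of thumb, when the reported busbw of a large-size collective approaches the per-GPU link bandwidth of the machine, the collective is running close to the hardware limit.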
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
# This Dockerfile provides a starting point for a ROCm installation of rccl
# Parameters related to building rccl
ARG base_image
FROM ${base_image}
LABEL maintainer="rccl-maintainer@amd.com"
USER root
ARG user_uid
# Install dependent packages
RUN yum install -y --nogpgcheck \
sudo \
chrpath \
rock-dkms \
rocm-cmake \
centos-release-scl \
devtoolset-7 \
ca-certificates \
git \
cmake3 \
make \
libgomp \
clang \
clang-devel \
gcc-c++ \
pkgconfig \
numactl-libs
RUN echo '#!/bin/bash' | tee /etc/profile.d/devtoolset7.sh && echo \
'source scl_source enable devtoolset-7' >>/etc/profile.d/devtoolset7.sh
# The docker pipeline runs containers with a particular uid;
# create a jenkins user with that specific uid so it can use sudo privileges.
# Grant any member of the video group password-less sudo privileges.
RUN useradd --create-home -u ${user_uid} -o -G video --shell /bin/bash jenkins && \
echo '%video ALL=(ALL) NOPASSWD:ALL' | tee /etc/sudoers.d/sudo-nopasswd && \
chmod 400 /etc/sudoers.d/sudo-nopasswd
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
# Parameters related to building rccl
ARG base_image
FROM ${base_image}
LABEL maintainer="rccl-maintainer@amd.com"
ARG user_uid
# Install dependent packages
# Dependencies:
# * hcc-config.cmake: pkg-config
# * tensile: python2.7, python-yaml
# * rocblas-test: gfortran, googletest
# * rocblas-bench: libboost-program-options-dev
# * libhsakmt.so: libnuma1
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
rock-dkms \
sudo \
ca-certificates \
chrpath \
git \
make \
cmake \
pkg-config \
python2.7 \
python-yaml \
python3-pytest \
rocm-cmake \
libboost-program-options-dev \
libnuma1 \
libomp-dev \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# The docker pipeline runs containers with a particular uid;
# create a jenkins user with that specific uid so it can use sudo privileges.
# Grant any member of the video group password-less sudo privileges.
RUN useradd --create-home -u ${user_uid} -o -G video --shell /bin/bash jenkins && \
mkdir -p /etc/sudoers.d/ && \
echo '%video ALL=(ALL) NOPASSWD:ALL' | tee /etc/sudoers.d/sudo-nopasswd
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
# Parameters related to building rccl
ARG base_image
FROM ${base_image}
LABEL maintainer="rccl-maintainer@amd.com"
#empty for now
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
# Parameters related to building rccl
ARG base_image
FROM ${base_image}
LABEL maintainer="rccl-maintainer@amd.com"
#empty for now
<graphs version="1">
<graph id="0" pattern="4" crossnic="0" nchannels="8" speedintra="24" speedinter="24" latencyinter="0" typeintra="NVL" typeinter="PXB" samechannels="0">
<channel>
<net dev="1"/>
<gpu dev="1"/>
<gpu dev="3"/>
<gpu dev="5"/>
<gpu dev="7"/>
<gpu dev="2"/>
<gpu dev="4"/>
<gpu dev="6"/>
<gpu dev="0"/>
<net dev="1"/>
</channel>
<channel>
<net dev="0"/>
<gpu dev="1"/>
<gpu dev="3"/>
<gpu dev="5"/>
<gpu dev="7"/>
<gpu dev="2"/>
<gpu dev="4"/>
<gpu dev="6"/>
<gpu dev="0"/>
<net dev="0"/>
</channel>
<channel>
<net dev="2"/>
<gpu dev="2"/>
<gpu dev="0"/>
<gpu dev="1"/>
<gpu dev="5"/>
<gpu dev="7"/>
<gpu dev="4"/>
<gpu dev="6"/>
<gpu dev="3"/>
<net dev="2"/>
</channel>
<channel>
<net dev="3"/>
<gpu dev="2"/>
<gpu dev="0"/>
<gpu dev="1"/>
<gpu dev="5"/>
<gpu dev="7"/>
<gpu dev="4"/>
<gpu dev="6"/>
<gpu dev="3"/>
<net dev="3"/>
</channel>
<channel>
<net dev="5"/>
<gpu dev="4"/>
<gpu dev="0"/>
<gpu dev="3"/>
<gpu dev="1"/>
<gpu dev="2"/>
<gpu dev="7"/>
<gpu dev="6"/>
<gpu dev="5"/>
<net dev="5"/>
</channel>
<channel>
<net dev="6"/>
<gpu dev="4"/>
<gpu dev="0"/>
<gpu dev="3"/>
<gpu dev="1"/>
<gpu dev="2"/>
<gpu dev="7"/>
<gpu dev="6"/>
<gpu dev="5"/>
<net dev="6"/>
</channel>
<channel>
<net dev="7"/>
<gpu dev="6"/>
<gpu dev="0"/>
<gpu dev="3"/>
<gpu dev="5"/>
<gpu dev="1"/>
<gpu dev="2"/>
<gpu dev="4"/>
<gpu dev="7"/>
<net dev="7"/>
</channel>
<channel>
<net dev="8"/>
<gpu dev="6"/>
<gpu dev="0"/>
<gpu dev="3"/>
<gpu dev="5"/>
<gpu dev="1"/>
<gpu dev="2"/>
<gpu dev="4"/>
<gpu dev="7"/>
<net dev="8"/>
</channel>
</graph>
<graph id="1" pattern="1" crossnic="0" nchannels="8" speedintra="48" speedinter="24" latencyinter="0" typeintra="NVL" typeinter="PXB" samechannels="0">
<channel>
<net dev="1"/>
<gpu dev="1"/>
<gpu dev="0"/>
<gpu dev="7"/>
<gpu dev="6"/>
<gpu dev="5"/>
<gpu dev="4"/>
<gpu dev="3"/>
<gpu dev="2"/>
<net dev="1"/>
</channel>
<channel>
<net dev="0"/>
<gpu dev="1"/>
<gpu dev="0"/>
<gpu dev="7"/>
<gpu dev="6"/>
<gpu dev="5"/>
<gpu dev="4"/>
<gpu dev="3"/>
<gpu dev="2"/>
<net dev="0"/>
</channel>
<channel>
<net dev="2"/>
<gpu dev="2"/>
<gpu dev="3"/>
<gpu dev="0"/>
<gpu dev="1"/>
<gpu dev="7"/>
<gpu dev="6"/>
<gpu dev="5"/>
<gpu dev="4"/>
<net dev="2"/>
</channel>
<channel>
<net dev="3"/>
<gpu dev="2"/>
<gpu dev="3"/>
<gpu dev="0"/>
<gpu dev="1"/>
<gpu dev="7"/>
<gpu dev="5"/>
<gpu dev="6"/>
<gpu dev="4"/>
<net dev="3"/>
</channel>
<channel>
<net dev="5"/>
<gpu dev="4"/>
<gpu dev="5"/>
<gpu dev="3"/>
<gpu dev="2"/>
<gpu dev="0"/>
<gpu dev="1"/>
<gpu dev="6"/>
<gpu dev="7"/>
<net dev="5"/>
</channel>
<channel>
<net dev="6"/>
<gpu dev="4"/>
<gpu dev="5"/>
<gpu dev="3"/>
<gpu dev="0"/>
<gpu dev="7"/>
<gpu dev="2"/>
<gpu dev="1"/>
<gpu dev="6"/>
<net dev="6"/>
</channel>
<channel>
<net dev="7"/>
<gpu dev="6"/>
<gpu dev="7"/>
<gpu dev="5"/>
<gpu dev="3"/>
<gpu dev="1"/>
<gpu dev="4"/>
<gpu dev="2"/>
<gpu dev="0"/>
<net dev="7"/>
</channel>
<channel>
<net dev="8"/>
<gpu dev="6"/>
<gpu dev="7"/>
<gpu dev="5"/>
<gpu dev="2"/>
<gpu dev="0"/>
<gpu dev="4"/>
<gpu dev="3"/>
<gpu dev="1"/>
<net dev="8"/>
</channel>
</graph>
<graph id="2" pattern="3" crossnic="0" nchannels="0" speedintra="0" speedinter="0" latencyinter="0" typeintra="LOC" typeinter="LOC" samechannels="0"/>
<graph id="3" pattern="5" crossnic="0" nchannels="0" speedintra="0" speedinter="0" latencyinter="0" typeintra="LOC" typeinter="LOC" samechannels="0"/>
</graphs>
<graphs version="1">
<graph id="0" pattern="4" crossnic="0" nchannels="8" speedintra="24" speedinter="24" latencyinter="0" typeintra="NVL" typeinter="PXB" samechannels="0">
<channel>
<net dev="0"/>
<gpu dev="0"/>
<gpu dev="2"/>
<gpu dev="3"/>
<gpu dev="4"/>
<gpu dev="5"/>
<gpu dev="6"/>
<gpu dev="7"/>
<gpu dev="1"/>
<net dev="0"/>
</channel>
<channel>
<net dev="1"/>
<gpu dev="0"/>
<gpu dev="3"/>
<gpu dev="5"/>
<gpu dev="7"/>
<gpu dev="2"/>
<gpu dev="4"/>
<gpu dev="6"/>
<gpu dev="1"/>
<net dev="1"/>
</channel>
<channel>
<net dev="2"/>
<gpu dev="2"/>
<gpu dev="0"/>
<gpu dev="1"/>
<gpu dev="4"/>
<gpu dev="7"/>
<gpu dev="6"/>
<gpu dev="5"/>
<gpu dev="3"/>
<net dev="2"/>
</channel>
<channel>
<net dev="3"/>
<gpu dev="2"/>
<gpu dev="1"/>
<gpu dev="6"/>
<gpu dev="0"/>
<gpu dev="7"/>
<gpu dev="5"/>
<gpu dev="4"/>
<gpu dev="3"/>
<net dev="3"/>
</channel>
<channel>
<net dev="4"/>
<gpu dev="4"/>
<gpu dev="1"/>
<gpu dev="7"/>
<gpu dev="3"/>
<gpu dev="0"/>
<gpu dev="6"/>
<gpu dev="2"/>
<gpu dev="5"/>
<net dev="4"/>
</channel>
<channel>
<net dev="5"/>
<gpu dev="4"/>
<gpu dev="3"/>
<gpu dev="1"/>
<gpu dev="6"/>
<gpu dev="0"/>
<gpu dev="7"/>
<gpu dev="2"/>
<gpu dev="5"/>
<net dev="5"/>
</channel>
<channel>
<net dev="6"/>
<gpu dev="6"/>
<gpu dev="3"/>
<gpu dev="1"/>
<gpu dev="5"/>
<gpu dev="0"/>
<gpu dev="4"/>
<gpu dev="2"/>
<gpu dev="7"/>
<net dev="6"/>
</channel>
<channel>
<net dev="7"/>
<gpu dev="7"/>
<gpu dev="4"/>
<gpu dev="0"/>
<gpu dev="5"/>
<gpu dev="1"/>
<gpu dev="3"/>
<gpu dev="2"/>
<gpu dev="6"/>
<net dev="7"/>
</channel>
</graph>
<graph id="1" pattern="1" crossnic="0" nchannels="5" speedintra="24" speedinter="24" latencyinter="0" typeintra="NVL" typeinter="PXB" samechannels="0">
<channel>
<net dev="0"/>
<gpu dev="0"/>
<gpu dev="1"/>
<gpu dev="7"/>
<gpu dev="6"/>
<gpu dev="5"/>
<gpu dev="4"/>
<gpu dev="3"/>
<gpu dev="2"/>
<net dev="0"/>
</channel>
<channel>
<net dev="1"/>
<gpu dev="1"/>
<gpu dev="0"/>
<gpu dev="7"/>
<gpu dev="5"/>
<gpu dev="3"/>
<gpu dev="6"/>
<gpu dev="4"/>
<gpu dev="2"/>
<net dev="1"/>
</channel>
<channel>
<net dev="2"/>
<gpu dev="2"/>
<gpu dev="3"/>
<gpu dev="1"/>
<gpu dev="6"/>
<gpu dev="0"/>
<gpu dev="5"/>
<gpu dev="7"/>
<gpu dev="4"/>
<net dev="2"/>
</channel>
<channel>
<net dev="4"/>
<gpu dev="4"/>
<gpu dev="5"/>
<gpu dev="2"/>
<gpu dev="1"/>
<gpu dev="3"/>
<gpu dev="7"/>
<gpu dev="0"/>
<gpu dev="6"/>
<net dev="4"/>
</channel>
<channel>
<net dev="6"/>
<gpu dev="6"/>
<gpu dev="7"/>
<gpu dev="3"/>
<gpu dev="0"/>
<gpu dev="4"/>
<gpu dev="1"/>
<gpu dev="2"/>
<gpu dev="5"/>
<net dev="6"/>
</channel>
</graph>
<graph id="2" pattern="3" crossnic="0" nchannels="0" speedintra="0" speedinter="0" latencyinter="0" typeintra="LOC" typeinter="LOC" samechannels="0"/>
<graph id="3" pattern="5" crossnic="0" nchannels="0" speedintra="0" speedinter="0" latencyinter="0" typeintra="LOC" typeinter="LOC" samechannels="0"/>
</graphs>
10.17.26.196 slots=8
10.17.26.195 slots=8
#!/usr/bin/env python3
"""Monitor InfiniBand port bandwidth by sampling /sys/class/infiniband counters.

Takes an optional sampling interval in seconds as its only argument (default: 1.0).
"""
import sys
import os
import time
infiniband_path = "/sys/class/infiniband"
port_counter_list = ['port_rcv_data',
'port_rcv_packets',
'port_xmit_data',
'port_xmit_packets']
HCA_list = []
All_counter = []
path_list = os.listdir(infiniband_path)
path_list.sort()
def init_ib_port_counter():
    """Build the HCA/port/counter tree and record each counter's initial value."""
    for HCA in path_list:
        HCA_dict = dict(name=HCA, port_list=[])
        HCA_path = os.path.join(infiniband_path, HCA, 'ports')
        for port in os.listdir(HCA_path):
            port_path = os.path.join(HCA_path, port)
            port_dict = dict(name=port, counter_list=[])
            for counter in port_counter_list:
                counter_path = os.path.join(port_path, 'counters', counter)
                # Keep the file handle open; it is re-read (via seek(0)) on every sample.
                fh = open(counter_path)
                counter_value = int(fh.read())
                counter_dict = dict(name=counter, path=counter_path, fh=fh, bandwidth=0.0, value=counter_value, pre_value=counter_value)
                port_dict['counter_list'].append(counter_dict)
            HCA_dict['port_list'].append(port_dict)
        HCA_list.append(HCA_dict)
    # Aggregate counters summed across all HCAs and ports.
    for counter in port_counter_list:
        counter_dict = dict(name=counter, bandwidth=0.0)
        All_counter.append(counter_dict)
def get_ib_port_bandwidth(sleep_sec):
    """Re-read every counter and compute per-port and aggregate bandwidths over sleep_sec."""
    # Reset the aggregate counters.
    for counter in All_counter:
        counter['bandwidth'] = 0.0
    for HCA in HCA_list:
        for port in HCA['port_list']:
            counter_id = 0
            for counter in port['counter_list']:
                counter['fh'].seek(0)
                counter['value'] = int(counter['fh'].read())
                counter['bandwidth'] = (counter['value'] - counter['pre_value'])/sleep_sec
                counter['pre_value'] = counter['value']
                # The IB port_*_data counters count 4-byte words; convert the data rate to Mbit/s.
                if counter['name'] == 'port_rcv_data' or counter['name'] == 'port_xmit_data':
                    counter['bandwidth'] = counter['bandwidth']/1024.0/1024.0*4*8
                All_counter[counter_id]['bandwidth'] += counter['bandwidth']
                counter_id += 1
def print_ib_port_bandwidth():
    """Print a per-port bandwidth table followed by an aggregate 'All' row."""
    title_str = '{:^10}{:^10}{:^15}{:^15}{:^15}{:^15}'.format('HCA', 'port', 'recv_data/Mbps', 'recv_pkts', 'xmit_data/Mbps', 'xmit_pkts')
    title_str += '{:^22}{:^22}'.format('rcv_pkt_avg_size/Byte', 'xmit_pkt_avg_size/Byte')
    print(title_str)
    sep_line = '{:-^80}'.format('-')
    sep_line += '{:-^44}'.format('-')
    print(sep_line)
    for HCA in HCA_list:
        for port in HCA['port_list']:
            data_str = '{:^10}{:^10}'.format(HCA['name'], port['name'])
            for counter in port['counter_list']:
                data_str += '{:^15.2f}'.format(counter['bandwidth'])
            # Average packet size = data rate (Mbit/s, converted back to bytes/s) / packet rate.
            if port['counter_list'][1]['bandwidth'] != 0:
                rcv_pkt_avg_size = port['counter_list'][0]['bandwidth']*1024*1024/8/port['counter_list'][1]['bandwidth']
            else:
                rcv_pkt_avg_size = 0.0
            if port['counter_list'][3]['bandwidth'] != 0:
                xmit_pkt_avg_size = port['counter_list'][2]['bandwidth']*1024*1024/8/port['counter_list'][3]['bandwidth']
            else:
                xmit_pkt_avg_size = 0.0
            data_str += '{:^22.2f}'.format(rcv_pkt_avg_size)
            data_str += '{:^22.2f}'.format(xmit_pkt_avg_size)
            print(data_str)
    print(sep_line)
    data_str = '{:^10}{:^10}'.format('All', ' ')
    for counter in All_counter:
        data_str += '{:^15.2f}'.format(counter['bandwidth'])
    if All_counter[1]['bandwidth'] != 0.0:
        rcv_pkt_avg_size = All_counter[0]['bandwidth']*1024*1024/8/All_counter[1]['bandwidth']
    else:
        rcv_pkt_avg_size = 0.0
    if All_counter[3]['bandwidth'] != 0.0:
        xmit_pkt_avg_size = All_counter[2]['bandwidth']*1024*1024/8/All_counter[3]['bandwidth']
    else:
        xmit_pkt_avg_size = 0.0
    data_str += '{:^22.2f}'.format(rcv_pkt_avg_size)
    data_str += '{:^22.2f}'.format(xmit_pkt_avg_size)
    print(data_str)
    print('\n')
if __name__ == '__main__':
    if len(sys.argv) > 1:
        sleep_sec = float(sys.argv[1])
    else:
        sleep_sec = 1.0
    init_ib_port_counter()
    while True:
        time.sleep(sleep_sec)
        get_ib_port_bandwidth(sleep_sec)
        print_ib_port_bandwidth()
#!/bin/bash
# Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
# #################################################
# helper functions
# #################################################
function display_help()
{
echo "RCCL-tests build & installation helper script"
echo "./install [-h|--help] "
echo " [-h|--help] Prints this help message."
echo " [-m|--mpi] Build RCCL-tests with MPI support. (see --mpi_home below.)"
echo " [--rccl_home] Specify custom path for RCCL installation (default: /opt/rocm/rccl)"
echo " [--mpi_home] Specify path to your MPI installation."
}
# #################################################
# global variables
# #################################################
run_tests=false
build_release=true
mpi_enabled=false
rccl_dir=/opt/rocm/rccl
mpi_dir=""
# #################################################
# Parameter parsing
# #################################################
# check if we have a modern version of getopt that can handle whitespace and long parameters
getopt -T
if [[ $? -eq 4 ]]; then
GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,mpi,test,rccl_home:,mpi_home: --options hmt -- "$@")
else
echo "Need a new version of getopt"
exit 1
fi
if [[ $? -ne 0 ]]; then
echo "getopt invocation failed; could not parse the command line";
exit 1
fi
eval set -- "${GETOPT_PARSE}"
while true; do
case "${1}" in
-h|--help)
display_help
exit 0
;;
-m|--mpi)
mpi_enabled=true
shift ;;
-t|--test)
run_tests=true
shift ;;
--rccl_home)
rccl_dir=${2}
shift 2 ;;
--mpi_home)
mpi_dir=${2}
shift 2 ;;
--) shift ; break ;;
*) echo "Unexpected command line parameter received; aborting";
exit 1
;;
esac
done
# Propagate a non-zero exit code from a command and abort the install script
check_exit_code( )
{
if (( $1 != 0 )); then
exit $1
fi
}
# Install the pre-commit hook
#bash ./githooks/install
build_dir=./build
# #################################################
# prep
# #################################################
# ensure a clean build environment
rm -rf ${build_dir}
if ($mpi_enabled); then
if [[ ${mpi_dir} == "" ]]; then
echo "MPI flag enabled but path to MPI installation not specified. See --mpi_home command line argument."
exit 1
else
make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so MPI=1 MPI_HOME=${mpi_dir} -j$(nproc)
fi
else
make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so -j$(nproc)
fi
check_exit_code "$?"
# Optionally, run tests if they're enabled.
if ($run_tests); then
if ($mpi_enabled); then
cd test; LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${rccl_dir}/lib:${mpi_dir}/lib PATH=$PATH:${mpi_dir}/bin python3 -m pytest
else
cd test; LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${rccl_dir}/lib python3 -m pytest
fi
fi