Commit c0dad530 authored by wangkaixiong

init

# ########################################################################
# Copyright 2022 Advanced Micro Devices, Inc.
# ########################################################################
# Add the pthread flag for linking
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
macro(check_mpi mpi_compiler mpi_lib_a mpi_lib_so mpi_bin_dir mpi_base_lib_dir mpi_inc_dir)
find_program(MPI_MPICXX ${mpi_compiler} PATHS ${mpi_bin_dir} NO_DEFAULT_PATH)
if (MPI_MPICXX)
message ("-- ${mpi_compiler} found @ ${MPI_MPICXX}")
find_file(MPI_H mpi.h PATHS ${mpi_inc_dir} NO_DEFAULT_PATH)
message ("-- mpi.h is in ${MPI_H}")
find_file(MPI_LIB NAMES ${mpi_lib_so} ${mpi_lib_a} PATHS ${mpi_base_lib_dir} PATH_SUFFIXES lib lib64 lib/x86_64-linux-gnu NO_DEFAULT_PATH)
message ("-- libmpi is ${MPI_LIB}")
if (NOT MPI_H OR NOT MPI_LIB)
set (MPI_MPICXX "MPI_MPICXX-NOTFOUND")
set (MPI_H "MPI_H-NOTFOUND")
set (MPI_LIB "MPI_LIB-NOTFOUND")
else()
add_definitions(-DMPI_SUPPORT)
include_directories(${mpi_inc_dir})
link_libraries(${MPI_LIB})
endif()
else()
message ("-- ${mpi_compiler} not found")
endif()
endmacro()
cmake_minimum_required(VERSION 3.16.3 FATAL_ERROR)
project(RCCL-tests VERSION 2.12.10 LANGUAGES CXX)
# Get ROCm path from environment if available
if (DEFINED ENV{ROCM_PATH})
set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to ROCm installation")
else()
set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to ROCm installation")
endif()
# Set CMake/CPack variables
list( APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/llvm)
set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "Prefix install path")
set(CPACK_PACKAGING_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "Path to install to when packaged.")
set(CMAKE_CXX_STANDARD 14)
# Get additional packages required
find_package(ROCM 0.7.3 CONFIG REQUIRED PATHS "${ROCM_PATH}")
include(ROCMSetupVersion)
include(ROCMCreatePackage)
include(ROCMInstallTargets)
include(ROCMCheckTargetIds)
include(ROCMClients)
# Build variables
option(NO_MPI "Build RCCL-tests without MPI support.")
option(MPI_PATH "Use MPI in the specified directory.")
# Default GPU architectures to build
#==================================================================================================
set(DEFAULT_GPUS
gfx803
gfx900:xnack-
gfx906:xnack-
gfx908:xnack-
gfx90a:xnack-
gfx90a:xnack+
gfx940
gfx941
gfx942
gfx1030
gfx1100
gfx1101
gfx1102)
set(AMDGPU_TARGETS ${DEFAULT_GPUS} CACHE STRING "Target default GPUs if AMDGPU_TARGETS is not defined.")
## Determine which GPU architectures to build for
if (COMMAND rocm_check_target_ids)
message(STATUS "Checking for ROCm support for GPU targets:")
rocm_check_target_ids(SUPPORTED_GPUS TARGETS "${AMDGPU_TARGETS}")
else()
message(WARNING "Unable to check for supported GPU targets. Falling back to default GPUs")
set(SUPPORTED_GPUS ${DEFAULT_GPUS})
endif()
set(GPU_TARGETS "${SUPPORTED_GPUS}" CACHE STRING "List of specific GPU architectures to build for.")
message(STATUS "Compiling for ${GPU_TARGETS}")
find_package(RCCL CONFIG REQUIRED PATHS "${ROCM_PATH}")
if (NOT NO_MPI)
# Check the user-specified MPI_PATH first, since the user explicitly requested this directory
if (MPI_PATH)
set(mpi_spec_bin_dir "${MPI_PATH}/bin")
set(mpi_spec_inc_dir "${MPI_PATH}/include")
check_mpi(mpicxx libmpi.a libmpi.so ${mpi_spec_bin_dir} ${MPI_PATH} ${mpi_spec_inc_dir})
if (NOT MPI_MPICXX)
# Since the user explicitly requested this directory, abort if something went wrong.
message(FATAL_ERROR "Could not find MPI in ${MPI_PATH}")
endif()
endif()
# Check for MPICH Ubuntu installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx.mpich libmpich.a libmpich.so /usr/bin /usr /usr/include/x86_64-linux-gnu/mpich)
endif()
# Check for Open MPI Ubuntu installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx.openmpi libmpi.a libmpi.so /usr/bin /usr/lib/x86_64-linux-gnu/openmpi /usr/lib/x86_64-linux-gnu/openmpi/include)
endif()
# Check for MPICH RHEL installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpich.a libmpich.so /usr/lib64/mpich/bin /usr/lib64/mpich /usr/include/mpich-x86_64)
endif()
# Check for Open MPI RHEL installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/openmpi/bin /usr/lib64/openmpi /usr/include/openmpi-x86_64)
endif()
# Check for MPICH SLES installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpich.a libmpich.so /usr/lib64/mpi/gcc/mpich/bin /usr/lib64/mpi/gcc/mpich /usr/lib64/mpi/gcc/mpich/include)
endif()
# Check for Open MPI v4 SLES installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi4/bin /usr/lib64/mpi/gcc/openmpi4 /usr/lib64/mpi/gcc/openmpi4/include)
endif()
# Check for Open MPI v3 SLES installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi3/bin /usr/lib64/mpi/gcc/openmpi3 /usr/lib64/mpi/gcc/openmpi3/include)
endif()
# Check for Open MPI v2 SLES installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi2/bin /usr/lib64/mpi/gcc/openmpi2 /usr/lib64/mpi/gcc/openmpi2/include)
endif()
if (NOT MPI_MPICXX)
message ("-- no MPI library found")
endif()
else()
message ("-- MPI support explicitely disabled")
endif()
set(ROCM_USE_DEV_COMPONENT OFF) # This repo doesn't have a dev component
# Add all of the tests
add_subdirectory(src)
# Create ROCm standard packages
rocm_create_package(
NAME rccl-tests
DESCRIPTION "Tests for the ROCm Communication Collectives Library"
MAINTAINER "RCCL Maintainer <rccl-maintainer@amd.com>"
)
Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION, nor the names of their
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
#
# See LICENCE.txt for license information
#
BUILDDIR ?= build
override BUILDDIR := $(abspath $(BUILDDIR))
.PHONY: all clean
default: src.build
TARGETS=$(filter-out src/hypercube.cu, $(wildcard src/*))
all: ${TARGETS:%=%.build}
clean: ${TARGETS:%=%.clean}
%.build:
${MAKE} -C $* build BUILDDIR=${BUILDDIR}
%.clean:
${MAKE} -C $* clean BUILDDIR=${BUILDDIR}
Notices and Licenses file
_______________________________________________________________
Dependencies on nvidia-nccl-tests v2.0.0 (BSD3)
Copyright (c) 2016-2017, NVIDIA CORPORATION.
Modifications Copyright (c) 2019 Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
Laboratory, the U.S. Department of Energy, nor the names of their
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The U.S. Department of Energy funded the development of this software
under subcontract 7078610 with Lawrence Berkeley National Laboratory.
nvidia-nccl-tests v2.0.0 (BSD2)
Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
Laboratory, the U.S. Department of Energy, nor the names of their
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The U.S. Department of Energy funded the development of this software
under subcontract 7078610 with Lawrence Berkeley National Laboratory.
# RCCL Tests
These tests check both the performance and the correctness of RCCL operations. They can be compiled against [RCCL](https://github.com/ROCmSoftwarePlatform/rccl).
## Build
To build the tests, just type `make`.
If HIP is not installed in /opt/rocm, you may specify HIP\_HOME. Similarly, if RCCL is not installed in /usr, you may specify NCCL\_HOME and CUSTOM\_RCCL\_LIB.
```shell
$ make HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl CUSTOM_RCCL_LIB=/path/to/rccl/lib/librccl.so
```
RCCL tests rely on MPI to run on multiple processes and, hence, on multiple nodes. If you want to compile the tests with MPI support, set MPI=1 and point MPI\_HOME to the path where MPI is installed.
```shell
$ make MPI=1 MPI_HOME=/path/to/mpi HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl
```
RCCL tests can also be built using cmake; a typical sequence is:
```shell
$ mkdir build
$ cd build
$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/path/to/rccl ..
$ make
```
When using the cmake build procedure, please make sure that RCCL has also been built using cmake (i.e., not using the install.sh script), since cmake checks for the cmake target and config files that are created during the RCCL build.
The cmake method also has the advantage that the build automatically checks for MPI installations, so it is not necessary to request MPI support explicitly. A particular MPI library can be selected with the MPI_PATH variable, and MPI support can be disabled explicitly by adding the -DNO_MPI=1 flag to the cmake command line.
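For example, a cmake build that selects a specific MPI installation (the paths below are illustrative) could look like:
```shell
$ mkdir build
$ cd build
$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/path/to/rccl -DMPI_PATH=/path/to/mpi ..
$ make
```
Passing `-DNO_MPI=1` instead disables MPI support entirely.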
## Usage
RCCL tests can run on multiple processes, multiple threads, and multiple HIP devices per thread. The number of processes is managed by MPI and is therefore not passed to the tests as an argument. The total number of ranks (= HIP devices) equals (number of processes) \* (number of threads) \* (number of GPUs per thread).
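For example, running under `mpirun -np 2` with `-t 2 -g 4` uses 2 \* 2 \* 4 = 16 ranks.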
### Quick examples
Run on 8 GPUs (`-g 8`), scanning from 8 bytes to 128 MB:
```shell
$ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
```
Run with MPI on 10 processes (potentially on multiple nodes) with 4 GPUs each, for a total of 40 GPUs:
```shell
$ mpirun -np 10 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
```
### Performance
See the [Performance](doc/PERFORMANCE.md) page for an explanation of the reported numbers, in particular the "busbw" column.
### Arguments
All tests support the same set of arguments (a combined example follows this list):
* Number of GPUs
  * `-t,--nthreads <num threads>` number of threads per process. Default: 1.
  * `-g,--ngpus <GPUs per thread>` number of GPUs per thread. Default: 1.
* Sizes to scan
  * `-b,--minbytes <min size in bytes>` minimum size to start with. Default: 32M.
  * `-e,--maxbytes <max size in bytes>` maximum size to end at. Default: 32M.
  * Increments can be either a fixed step or a multiplication factor; only one of the two should be used.
    * `-i,--stepbytes <increment size>` fixed increment between sizes. Default: 1M.
    * `-f,--stepfactor <increment factor>` multiplication factor between sizes. Default: disabled.
* RCCL operation arguments
  * `-o,--op <sum/prod/min/max/avg/all>` reduction operation to perform. Only relevant for reduction operations like AllReduce, Reduce, or ReduceScatter. Default: sum.
  * `-d,--datatype <nccltype/all>` datatype to use. Default: float.
  * `-r,--root <root/all>` root rank to use. Only for operations with a root, like Broadcast or Reduce. Default: 0.
  * `-y,--memory_type <coarse/fine/host/managed>` memory type to use. Default: coarse.
  * `-s,--stress_cycles <number of cycles>` number of stress cycles to run. Default: 1.
  * `-u,--cumask <d0,d1,d2,d3>` CU mask to apply. Default: none.
* Performance
  * `-n,--iters <iteration count>` number of iterations. Default: 20.
  * `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default: 5.
  * `-m,--agg_iters <aggregation count>` number of operations to aggregate together in each iteration. Default: 1.
  * `-a,--average <0/1/2/3>` report performance as an average across all ranks (MPI=1 only). <0=Rank0, 1=Avg, 2=Min, 3=Max>. Default: 1.
* Test operation
  * `-p,--parallel_init <0/1>` use threads to initialize RCCL in parallel. Default: 0.
  * `-c,--check <0/1>` check correctness of results. This can be quite slow with large numbers of GPUs. Default: 1.
  * `-z,--blocking <0/1>` make RCCL collectives blocking, i.e. have CPUs wait and sync after each collective. Default: 0.
  * `-G,--cudagraph <num graph launches>` capture iterations as a CUDA graph and then replay the specified number of times. Default: 0.
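As an illustration, the options above can be combined freely. The following (with arbitrary, illustrative values) runs a half-precision sum allreduce on 8 GPUs, with 10 warmup and 50 timed iterations and correctness checking enabled:
```shell
$ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8 -o sum -d half -n 50 -w 10 -c 1
```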
## Unit tests
Unit tests for rccl-tests are implemented with pytest (python3 is also required). A few notes on the unit tests:
1. The LD_LIBRARY_PATH environment variable will need to be set to include /path/to/rccl-install/lib/ in order to run the unit tests.
2. The HSA_FORCE_FINE_GRAIN_PCIE environment variable will need to be set to 1 in order to run the unit tests which use fine-grained memory type.
The unit tests can be invoked from the rccl-tests root or from the test subfolder. An example invocation:
```shell
$ LD_LIBRARY_PATH=/path/to/rccl-install/lib/ HSA_FORCE_FINE_GRAIN_PCIE=1 python3 -m pytest
```
## Copyright
RCCL tests are provided under the BSD license.
All source code and accompanying documentation is copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
All modifications are copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
#!/bin/bash
rm -rf ./build
make MPI=1 MPI_HOME=/opt/mpi HIP_HOME=/opt/dtk NCCL_HOME=/opt/dtk/rccl/ CUSTOM_RCCL_LIB=/opt/dtk/rccl/lib/librccl.so -j 32 2>&1 | tee build.log
# Performance reported by RCCL tests
RCCL tests report the average operation time in ms and two bandwidths in GB/s: the algorithm bandwidth and the bus bandwidth. This page explains what those numbers mean and what you should expect depending on the hardware used.
# Time
Time is useful at small sizes, where it measures the constant overhead (or latency) associated with each operation.
At large sizes, the time grows linearly with the size (it is roughly equal to overhead + size / bw), so it no longer reflects just the latency but mostly the size divided by the bandwidth.
Therefore, at large sizes it makes more sense to look at the bandwidth.
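For example (illustrative numbers), with 20 us of constant overhead and 100 GB/s of bandwidth, a 4 KB operation takes roughly 20 us and is latency-bound, while a 1 GB operation takes roughly 10 ms and is bandwidth-bound.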
# Bandwidth
## Algorithm bandwidth
Algorithm bandwidth uses the most common formula for bandwidth: size (_S_) divided by time (_t_). It is useful for estimating how long a large operation would take: simply divide its size by the algorithm bandwidth.
`algbw = S/t`
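For example (illustrative numbers), an operation on a 128 MB buffer that completes in 3.2 ms has an algorithm bandwidth of `0.128 GB / 0.0032 s = 40 GB/s`.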
## Bus bandwidth
While the algorithm bandwidth makes sense for point-to-point operations like Send/Receive, it is not always helpful for measuring the speed of collective operations, since the theoretical peak algorithm bandwidth is not equal to the hardware peak bandwidth and usually depends on the number of ranks.
Most benchmarks only provide time measurements, which are hard to interpret at large sizes. Some also provide algorithm bandwidth, but that bandwidth varies with the number of ranks (and decreases as the number of ranks increases).
To provide a number which reflects how optimally the hardware is used, RCCL tests introduce the notion of "Bus Bandwidth" (the "busbw" column in the test output).
This number is obtained by applying a formula to the algorithm bandwidth to reflect the speed of the inter-GPU communication.
Because the bus bandwidth can be compared with the hardware peak bandwidth independently of the number of ranks used, it shows how close to the hardware limit the collective runs.
The formula depends on the collective operation.
### AllReduce
An allreduce operation, for each element of the _n_ arrays (input i_X and output o_X, each residing on rank X), performs the following operation:
`o_0 = o_1 = o_2 = ... = o_{n-1} = i_0 + i_1 + i_2 + ... + i_{n-1}`
**Note: this is independent of the algorithm used (ring, tree, or other) as long as it uses point-to-point operations (send/receive).**
A ring performs that operation in an order which follows the ring:
`i_0 + i_1 + ... + i_{n-1} -> o_{n-1} -> o_0 -> o_1 -> .. -> o_{n-2}`
A tree would do it hierarchically :
`(((i_{n-1} + i_{n-2}) + (i_{n-3} + i_{n-4})) + ... + (i_1 + i_0)) -> o_0 -> (o_{n/2} -> (o_{3n/4} ...))`
In all cases, we need n-1 additions and n assignments for each element. Since every step happens on a different rank, except potentially one (the last input and the first output),
we need 2(n-1) data transfers (times the number of elements) to perform an allReduce operation.
Considering that each rank has a bandwidth _B_ to the outside world, the time to perform an allReduce operation of _S_ elements is at best:
`t = (S*2*(n-1)) / (n*B)`
Indeed, we have _S_ elements, 2*(n-1) operations per element, and _n_ links of bandwidth _B_ to perform them.
Reordering the equation, we find that
`t = (S/B) * (2*(n-1)/n)`
Therefore, to get an AllReduce bandwidth measurement which we can compare to the hardware peak bandwidth, we compute:
`B = S/t * (2*(n-1)/n) = algbw * (2*(n-1)/n)`
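For example, with n = 8 ranks the correction factor is `2*(8-1)/8 = 1.75`, so an algorithm bandwidth of 40 GB/s corresponds to a bus bandwidth of 70 GB/s.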
### ReduceScatter
The ReduceScatter operation requires only the addition part of the allReduce operation:
`o_K = i_0 + i_1 + i_2 + ... + i_{n-1}`
with K being the rank which gets the final result (K = offset/recvsize).
The perfect reduceScatter time with a per-rank bandwidth of B would therefore be:
`t = S*(n-1) / (B*n)`
and the bus bandwidth is therefore computed as:
`B = S/t * (n-1)/n = algbw * (n-1)/n`
Note that here, S is the size in bytes of the total array, which for RCCL equals `recvcount*sizeof(datatype)*n`, since the `recvcount` argument is the count per rank.
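For example, with n = 8 ranks and `recvcount` = 4M float elements, `S = 4M * 4 * 8 = 128 MB`, and busbw = algbw * 7/8.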
### AllGather
The AllGather operation requires only the assignment part of the allReduce operation:
`o_0 = o_1 = o_2 = ... = o_{n-1} = i_K`
with K being the rank where the data originates (K = offset/sendsize).
The perfect allGather time with a per-rank bandwidth of B would therefore be:
`t = S*(n-1) / (B*n)`
and the bus bandwidth is therefore computed as:
`B = S/t * (n-1)/n = algbw * (n-1)/n`
Note that here, S is the size in bytes of the total array, which for RCCL equals `sendcount*sizeof(datatype)*n`, since the `sendcount` argument is the count per rank.
### Broadcast
The broadcast operation is represented similarly to allGather:
`o_0 = o_1 = o_2 = ... = o_{n-1} = i_R`
R being the root of the operation.
However, in this case, since the i_R input is not evenly distributed across the ranks, we cannot use all _n_ links to perform the transfers.
Indeed, *all* of the data has to leave the root rank, so the bottleneck is the root rank, which only has a capacity of B to send data out:
`t = S/B`
And:
`B = S/t`
### Reduce
The reduce operation performs:
`o_R = i_0 + i_1 + i_2 + ... + i_{n-1}`
R being the root of the operation.
Similarly to broadcast, all data needs to be sent to the root, hence:
`t = S/B`
And:
`B = S/t`
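For Broadcast and Reduce, the bus bandwidth therefore equals the algorithm bandwidth.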
### Summary
To obtain a bus bandwidth which should be independent of the number of ranks _n_, we apply a correction factor to the algorithm bandwidth:
* AllReduce: 2*(_n_-1)/_n_
* ReduceScatter: (_n_-1)/_n_
* AllGather: (_n_-1)/_n_
* Broadcast: 1
* Reduce: 1
The bus bandwidth should reflect the speed of the hardware bottleneck: NVLink, PCI, QPI, or network.
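As a rule of thumb, when the reported busbw of a large-size collective approaches the per-GPU link bandwidth of the machine, the collective is running close to the hardware limit.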
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
# This Dockerfile provides a starting point for a ROCm installation of rccl
# Parameters related to building rccl
ARG base_image
FROM ${base_image}
LABEL maintainer="rccl-maintainer@amd.com"
USER root
ARG user_uid
# Install dependent packages
RUN yum install -y --nogpgcheck \
sudo \
chrpath \
rock-dkms \
rocm-cmake \
centos-release-scl \
devtoolset-7 \
ca-certificates \
git \
cmake3 \
make \
libgomp \
clang \
clang-devel \
gcc-c++ \
pkgconfig \
numactl-libs
RUN echo '#!/bin/bash' | tee /etc/profile.d/devtoolset7.sh && echo \
'source scl_source enable devtoolset-7' >>/etc/profile.d/devtoolset7.sh
# The docker pipeline runs containers with a particular uid;
# create a jenkins user with that specific uid so it can use sudo privileges.
# Grant any member of the video group password-less sudo privileges.
RUN useradd --create-home -u ${user_uid} -o -G video --shell /bin/bash jenkins && \
echo '%video ALL=(ALL) NOPASSWD:ALL' | tee /etc/sudoers.d/sudo-nopasswd && \
chmod 400 /etc/sudoers.d/sudo-nopasswd
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
# Parameters related to building rccl
ARG base_image
FROM ${base_image}
LABEL maintainer="rccl-maintainer@amd.com"
ARG user_uid
# Install dependent packages
# Dependencies:
# * hcc-config.cmake: pkg-config
# * tensile: python2.7, python-yaml
# * rocblas-test: gfortran, googletest
# * rocblas-bench: libboost-program-options-dev
# * libhsakmt.so: libnuma1
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
rock-dkms \
sudo \
ca-certificates \
chrpath \
git \
make \
cmake \
pkg-config \
python2.7 \
python-yaml \
python3-pytest \
rocm-cmake \
libboost-program-options-dev \
libnuma1 \
libomp-dev \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# The docker pipeline runs containers with a particular uid;
# create a jenkins user with that specific uid so it can use sudo privileges.
# Grant any member of the video group password-less sudo privileges.
RUN useradd --create-home -u ${user_uid} -o -G video --shell /bin/bash jenkins && \
mkdir -p /etc/sudoers.d/ && \
echo '%video ALL=(ALL) NOPASSWD:ALL' | tee /etc/sudoers.d/sudo-nopasswd
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
# Parameters related to building rccl
ARG base_image
FROM ${base_image}
LABEL maintainer="rccl-maintainer@amd.com"
#empty for now
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
# Parameters related to building rccl
ARG base_image
FROM ${base_image}
LABEL maintainer="rccl-maintainer@amd.com"
#empty for now
<graphs version="1">
<graph id="0" pattern="4" crossnic="0" nchannels="8" speedintra="24" speedinter="24" latencyinter="0" typeintra="NVL" typeinter="PXB" samechannels="0">
<channel>
<net dev="1"/>
<gpu dev="1"/>
<gpu dev="3"/>
<gpu dev="5"/>
<gpu dev="7"/>
<gpu dev="2"/>
<gpu dev="4"/>
<gpu dev="6"/>
<gpu dev="0"/>
<net dev="1"/>
</channel>
<channel>
<net dev="0"/>
<gpu dev="1"/>
<gpu dev="3"/>
<gpu dev="5"/>
<gpu dev="7"/>
<gpu dev="2"/>
<gpu dev="4"/>
<gpu dev="6"/>
<gpu dev="0"/>
<net dev="0"/>
</channel>
<channel>
<net dev="2"/>
<gpu dev="2"/>
<gpu dev="0"/>
<gpu dev="1"/>
<gpu dev="5"/>
<gpu dev="7"/>
<gpu dev="4"/>
<gpu dev="6"/>
<gpu dev="3"/>
<net dev="2"/>
</channel>
<channel>
<net dev="3"/>
<gpu dev="2"/>
<gpu dev="0"/>
<gpu dev="1"/>
<gpu dev="5"/>
<gpu dev="7"/>
<gpu dev="4"/>
<gpu dev="6"/>
<gpu dev="3"/>
<net dev="3"/>
</channel>
<channel>
<net dev="5"/>
<gpu dev="4"/>
<gpu dev="0"/>
<gpu dev="3"/>
<gpu dev="1"/>
<gpu dev="2"/>
<gpu dev="7"/>
<gpu dev="6"/>
<gpu dev="5"/>
<net dev="5"/>
</channel>
<channel>
<net dev="6"/>
<gpu dev="4"/>
<gpu dev="0"/>
<gpu dev="3"/>
<gpu dev="1"/>
<gpu dev="2"/>
<gpu dev="7"/>
<gpu dev="6"/>
<gpu dev="5"/>
<net dev="6"/>
</channel>
<channel>
<net dev="7"/>
<gpu dev="6"/>
<gpu dev="0"/>
<gpu dev="3"/>
<gpu dev="5"/>
<gpu dev="1"/>
<gpu dev="2"/>
<gpu dev="4"/>
<gpu dev="7"/>
<net dev="7"/>
</channel>
<channel>
<net dev="8"/>
<gpu dev="6"/>
<gpu dev="0"/>
<gpu dev="3"/>
<gpu dev="5"/>
<gpu dev="1"/>
<gpu dev="2"/>
<gpu dev="4"/>
<gpu dev="7"/>
<net dev="8"/>
</channel>
</graph>
<graph id="1" pattern="1" crossnic="0" nchannels="8" speedintra="48" speedinter="24" latencyinter="0" typeintra="NVL" typeinter="PXB" samechannels="0">
<channel>
<net dev="1"/>
<gpu dev="1"/>
<gpu dev="0"/>
<gpu dev="7"/>
<gpu dev="6"/>
<gpu dev="5"/>
<gpu dev="4"/>
<gpu dev="3"/>
<gpu dev="2"/>
<net dev="1"/>
</channel>
<channel>
<net dev="0"/>
<gpu dev="1"/>
<gpu dev="0"/>
<gpu dev="7"/>
<gpu dev="6"/>
<gpu dev="5"/>
<gpu dev="4"/>
<gpu dev="3"/>
<gpu dev="2"/>
<net dev="0"/>
</channel>
<channel>
<net dev="2"/>
<gpu dev="2"/>
<gpu dev="3"/>
<gpu dev="0"/>
<gpu dev="1"/>
<gpu dev="7"/>
<gpu dev="6"/>
<gpu dev="5"/>
<gpu dev="4"/>
<net dev="2"/>
</channel>
<channel>
<net dev="3"/>
<gpu dev="2"/>
<gpu dev="3"/>
<gpu dev="0"/>
<gpu dev="1"/>
<gpu dev="7"/>
<gpu dev="5"/>
<gpu dev="6"/>
<gpu dev="4"/>
<net dev="3"/>
</channel>
<channel>
<net dev="5"/>
<gpu dev="4"/>
<gpu dev="5"/>
<gpu dev="3"/>
<gpu dev="2"/>
<gpu dev="0"/>
<gpu dev="1"/>
<gpu dev="6"/>
<gpu dev="7"/>
<net dev="5"/>
</channel>
<channel>
<net dev="6"/>
<gpu dev="4"/>
<gpu dev="5"/>
<gpu dev="3"/>
<gpu dev="0"/>
<gpu dev="7"/>
<gpu dev="2"/>
<gpu dev="1"/>
<gpu dev="6"/>
<net dev="6"/>
</channel>
<channel>
<net dev="7"/>
<gpu dev="6"/>
<gpu dev="7"/>
<gpu dev="5"/>
<gpu dev="3"/>
<gpu dev="1"/>
<gpu dev="4"/>
<gpu dev="2"/>
<gpu dev="0"/>
<net dev="7"/>
</channel>
<channel>
<net dev="8"/>
<gpu dev="6"/>
<gpu dev="7"/>
<gpu dev="5"/>
<gpu dev="2"/>
<gpu dev="0"/>
<gpu dev="4"/>
<gpu dev="3"/>
<gpu dev="1"/>
<net dev="8"/>
</channel>
</graph>
<graph id="2" pattern="3" crossnic="0" nchannels="0" speedintra="0" speedinter="0" latencyinter="0" typeintra="LOC" typeinter="LOC" samechannels="0"/>
<graph id="3" pattern="5" crossnic="0" nchannels="0" speedintra="0" speedinter="0" latencyinter="0" typeintra="LOC" typeinter="LOC" samechannels="0"/>
</graphs>
<graphs version="1">
<graph id="0" pattern="4" crossnic="0" nchannels="8" speedintra="24" speedinter="24" latencyinter="0" typeintra="NVL" typeinter="PXB" samechannels="0">
<channel>
<net dev="0"/>
<gpu dev="0"/>
<gpu dev="2"/>
<gpu dev="3"/>
<gpu dev="4"/>
<gpu dev="5"/>
<gpu dev="6"/>
<gpu dev="7"/>
<gpu dev="1"/>
<net dev="0"/>
</channel>
<channel>
<net dev="1"/>
<gpu dev="0"/>
<gpu dev="3"/>
<gpu dev="5"/>
<gpu dev="7"/>
<gpu dev="2"/>
<gpu dev="4"/>
<gpu dev="6"/>
<gpu dev="1"/>
<net dev="1"/>
</channel>
<channel>
<net dev="2"/>
<gpu dev="2"/>
<gpu dev="0"/>
<gpu dev="1"/>
<gpu dev="4"/>
<gpu dev="7"/>
<gpu dev="6"/>
<gpu dev="5"/>
<gpu dev="3"/>
<net dev="2"/>
</channel>
<channel>
<net dev="3"/>
<gpu dev="2"/>
<gpu dev="1"/>
<gpu dev="6"/>
<gpu dev="0"/>
<gpu dev="7"/>
<gpu dev="5"/>
<gpu dev="4"/>
<gpu dev="3"/>
<net dev="3"/>
</channel>
<channel>
<net dev="4"/>
<gpu dev="4"/>
<gpu dev="1"/>
<gpu dev="7"/>
<gpu dev="3"/>
<gpu dev="0"/>
<gpu dev="6"/>
<gpu dev="2"/>
<gpu dev="5"/>
<net dev="4"/>
</channel>
<channel>
<net dev="5"/>
<gpu dev="4"/>
<gpu dev="3"/>
<gpu dev="1"/>
<gpu dev="6"/>
<gpu dev="0"/>
<gpu dev="7"/>
<gpu dev="2"/>
<gpu dev="5"/>
<net dev="5"/>
</channel>
<channel>
<net dev="6"/>
<gpu dev="6"/>
<gpu dev="3"/>
<gpu dev="1"/>
<gpu dev="5"/>
<gpu dev="0"/>
<gpu dev="4"/>
<gpu dev="2"/>
<gpu dev="7"/>
<net dev="6"/>
</channel>
<channel>
<net dev="7"/>
<gpu dev="7"/>
<gpu dev="4"/>
<gpu dev="0"/>
<gpu dev="5"/>
<gpu dev="1"/>
<gpu dev="3"/>
<gpu dev="2"/>
<gpu dev="6"/>
<net dev="7"/>
</channel>
</graph>
<graph id="1" pattern="1" crossnic="0" nchannels="5" speedintra="24" speedinter="24" latencyinter="0" typeintra="NVL" typeinter="PXB" samechannels="0">
<channel>
<net dev="0"/>
<gpu dev="0"/>
<gpu dev="1"/>
<gpu dev="7"/>
<gpu dev="6"/>
<gpu dev="5"/>
<gpu dev="4"/>
<gpu dev="3"/>
<gpu dev="2"/>
<net dev="0"/>
</channel>
<channel>
<net dev="1"/>
<gpu dev="1"/>
<gpu dev="0"/>
<gpu dev="7"/>
<gpu dev="5"/>
<gpu dev="3"/>
<gpu dev="6"/>
<gpu dev="4"/>
<gpu dev="2"/>
<net dev="1"/>
</channel>
<channel>
<net dev="2"/>
<gpu dev="2"/>
<gpu dev="3"/>
<gpu dev="1"/>
<gpu dev="6"/>
<gpu dev="0"/>
<gpu dev="5"/>
<gpu dev="7"/>
<gpu dev="4"/>
<net dev="2"/>
</channel>
<channel>
<net dev="4"/>
<gpu dev="4"/>
<gpu dev="5"/>
<gpu dev="2"/>
<gpu dev="1"/>
<gpu dev="3"/>
<gpu dev="7"/>
<gpu dev="0"/>
<gpu dev="6"/>
<net dev="4"/>
</channel>
<channel>
<net dev="6"/>
<gpu dev="6"/>
<gpu dev="7"/>
<gpu dev="3"/>
<gpu dev="0"/>
<gpu dev="4"/>
<gpu dev="1"/>
<gpu dev="2"/>
<gpu dev="5"/>
<net dev="6"/>
</channel>
</graph>
<graph id="2" pattern="3" crossnic="0" nchannels="0" speedintra="0" speedinter="0" latencyinter="0" typeintra="LOC" typeinter="LOC" samechannels="0"/>
<graph id="3" pattern="5" crossnic="0" nchannels="0" speedintra="0" speedinter="0" latencyinter="0" typeintra="LOC" typeinter="LOC" samechannels="0"/>
</graphs>
10.17.26.196 slots=8
10.17.26.195 slots=8
#!/usr/bin/env python3
"""Monitor InfiniBand port bandwidth by sampling /sys/class/infiniband counters.

Takes an optional sampling interval in seconds as its only argument (default: 1.0).
"""
import sys
import os
import time
infiniband_path = "/sys/class/infiniband"
port_counter_list = ['port_rcv_data',
'port_rcv_packets',
'port_xmit_data',
'port_xmit_packets']
HCA_list = []
All_counter = []
path_list = os.listdir(infiniband_path)
path_list.sort()
def init_ib_port_counter():
    """Build the HCA/port/counter tree and record each counter's initial value."""
    for HCA in path_list:
        HCA_dict = dict(name=HCA, port_list=[])
        HCA_path = os.path.join(infiniband_path, HCA, 'ports')
        for port in os.listdir(HCA_path):
            port_path = os.path.join(HCA_path, port)
            port_dict = dict(name=port, counter_list=[])
            for counter in port_counter_list:
                counter_path = os.path.join(port_path, 'counters', counter)
                # Keep the file handle open; it is re-read (via seek(0)) on every sample.
                fh = open(counter_path)
                counter_value = int(fh.read())
                counter_dict = dict(name=counter, path=counter_path, fh=fh, bandwidth=0.0, value=counter_value, pre_value=counter_value)
                port_dict['counter_list'].append(counter_dict)
            HCA_dict['port_list'].append(port_dict)
        HCA_list.append(HCA_dict)
    # Aggregate counters summed across all HCAs and ports.
    for counter in port_counter_list:
        counter_dict = dict(name=counter, bandwidth=0.0)
        All_counter.append(counter_dict)
def get_ib_port_bandwidth(sleep_sec):
    """Re-read every counter and compute per-port and aggregate bandwidths over sleep_sec."""
    # Reset the aggregate counters.
    for counter in All_counter:
        counter['bandwidth'] = 0.0
    for HCA in HCA_list:
        for port in HCA['port_list']:
            counter_id = 0
            for counter in port['counter_list']:
                counter['fh'].seek(0)
                counter['value'] = int(counter['fh'].read())
                counter['bandwidth'] = (counter['value'] - counter['pre_value'])/sleep_sec
                counter['pre_value'] = counter['value']
                # The IB port_*_data counters count 4-byte words; convert the data rate to Mbit/s.
                if counter['name'] == 'port_rcv_data' or counter['name'] == 'port_xmit_data':
                    counter['bandwidth'] = counter['bandwidth']/1024.0/1024.0*4*8
                All_counter[counter_id]['bandwidth'] += counter['bandwidth']
                counter_id += 1
def print_ib_port_bandwidth():
    """Print a per-port bandwidth table followed by an aggregate 'All' row."""
    title_str = '{:^10}{:^10}{:^15}{:^15}{:^15}{:^15}'.format('HCA', 'port', 'recv_data/Mbps', 'recv_pkts', 'xmit_data/Mbps', 'xmit_pkts')
    title_str += '{:^22}{:^22}'.format('rcv_pkt_avg_size/Byte', 'xmit_pkt_avg_size/Byte')
    print(title_str)
    sep_line = '{:-^80}'.format('-')
    sep_line += '{:-^44}'.format('-')
    print(sep_line)
    for HCA in HCA_list:
        for port in HCA['port_list']:
            data_str = '{:^10}{:^10}'.format(HCA['name'], port['name'])
            for counter in port['counter_list']:
                data_str += '{:^15.2f}'.format(counter['bandwidth'])
            # Average packet size = data rate (Mbit/s, converted back to bytes/s) / packet rate.
            if port['counter_list'][1]['bandwidth'] != 0:
                rcv_pkt_avg_size = port['counter_list'][0]['bandwidth']*1024*1024/8/port['counter_list'][1]['bandwidth']
            else:
                rcv_pkt_avg_size = 0.0
            if port['counter_list'][3]['bandwidth'] != 0:
                xmit_pkt_avg_size = port['counter_list'][2]['bandwidth']*1024*1024/8/port['counter_list'][3]['bandwidth']
            else:
                xmit_pkt_avg_size = 0.0
            data_str += '{:^22.2f}'.format(rcv_pkt_avg_size)
            data_str += '{:^22.2f}'.format(xmit_pkt_avg_size)
            print(data_str)
    print(sep_line)
    data_str = '{:^10}{:^10}'.format('All', ' ')
    for counter in All_counter:
        data_str += '{:^15.2f}'.format(counter['bandwidth'])
    if All_counter[1]['bandwidth'] != 0.0:
        rcv_pkt_avg_size = All_counter[0]['bandwidth']*1024*1024/8/All_counter[1]['bandwidth']
    else:
        rcv_pkt_avg_size = 0.0
    if All_counter[3]['bandwidth'] != 0.0:
        xmit_pkt_avg_size = All_counter[2]['bandwidth']*1024*1024/8/All_counter[3]['bandwidth']
    else:
        xmit_pkt_avg_size = 0.0
    data_str += '{:^22.2f}'.format(rcv_pkt_avg_size)
    data_str += '{:^22.2f}'.format(xmit_pkt_avg_size)
    print(data_str)
    print('\n')
if __name__ == '__main__':
    if len(sys.argv) > 1:
        sleep_sec = float(sys.argv[1])
    else:
        sleep_sec = 1.0
    init_ib_port_counter()
    while True:
        time.sleep(sleep_sec)
        get_ib_port_bandwidth(sleep_sec)
        print_ib_port_bandwidth()
#!/bin/bash
# Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
# #################################################
# helper functions
# #################################################
function display_help()
{
echo "RCCL-tests build & installation helper script"
echo "./install [-h|--help] "
echo " [-h|--help] Prints this help message."
echo " [-m|--mpi] Build RCCL-tests with MPI support. (see --mpi_home below.)"
echo " [--rccl_home] Specify custom path for RCCL installation (default: /opt/rocm/rccl)"
echo " [--mpi_home] Specify path to your MPI installation."
}
# #################################################
# global variables
# #################################################
run_tests=false
build_release=true
mpi_enabled=false
rccl_dir=/opt/rocm/rccl
mpi_dir=""
# #################################################
# Parameter parsing
# #################################################
# check if we have a modern version of getopt that can handle whitespace and long parameters
getopt -T
if [[ $? -eq 4 ]]; then
GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,mpi,test,rccl_home:,mpi_home: --options hmt -- "$@")
else
echo "Need a new version of getopt"
exit 1
fi
if [[ $? -ne 0 ]]; then
echo "getopt invocation failed; could not parse the command line";
exit 1
fi
eval set -- "${GETOPT_PARSE}"
while true; do
case "${1}" in
-h|--help)
display_help
exit 0
;;
-m|--mpi)
mpi_enabled=true
shift ;;
-t|--test)
run_tests=true
shift ;;
--rccl_home)
rccl_dir=${2}
shift 2 ;;
--mpi_home)
mpi_dir=${2}
shift 2 ;;
--) shift ; break ;;
*) echo "Unexpected command line parameter received; aborting";
exit 1
;;
esac
done
# Propagate a non-zero exit code from a command and abort the install script
check_exit_code( )
{
if (( $1 != 0 )); then
exit $1
fi
}
# Install the pre-commit hook
#bash ./githooks/install
build_dir=./build
# #################################################
# prep
# #################################################
# ensure a clean build environment
rm -rf ${build_dir}
if ($mpi_enabled); then
if [[ ${mpi_dir} == "" ]]; then
echo "MPI flag enabled but path to MPI installation not specified. See --mpi_home command line argument."
exit 1
else
make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so MPI=1 MPI_HOME=${mpi_dir} -j$(nproc)
fi
else
make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so -j$(nproc)
fi
check_exit_code "$?"
# Optionally, run tests if they're enabled.
if ($run_tests); then
if ($mpi_enabled); then
cd test; LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${rccl_dir}/lib:${mpi_dir}/lib PATH=$PATH:${mpi_dir}/bin python3 -m pytest
else
cd test; LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${rccl_dir}/lib python3 -m pytest
fi
fi