"vscode:/vscode.git/clone" did not exist on "fdca30739b01a6dfb732f23289701d0e9860d39f"
Commit d8ca0a9e authored by jerrrrry

Initial commit

# ########################################################################
# Copyright 2022 Advanced Micro Devices, Inc.
# ########################################################################
#Adding pthread flag for linking
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
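# check_mpi() probes for an MPI installation and, if found, enables MPI support:
#   mpi_compiler         - MPI C++ compiler wrapper to look for (e.g. mpicxx)
#   mpi_lib_a/mpi_lib_so - static/shared MPI library names
#   mpi_bin_dir          - directory expected to contain the compiler wrapper
#   mpi_base_lib_dir     - base directory searched (with lib, lib64, ... suffixes) for the library
#   mpi_inc_dir          - directory expected to contain mpi.h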
macro(check_mpi mpi_compiler mpi_lib_a mpi_lib_so mpi_bin_dir mpi_base_lib_dir mpi_inc_dir)
find_program(MPI_MPICXX ${mpi_compiler} PATHS ${mpi_bin_dir} NO_DEFAULT_PATH)
if (MPI_MPICXX)
message ("-- ${mpi_compiler} found @ ${MPI_MPICXX}")
find_file(MPI_H mpi.h PATHS ${mpi_inc_dir} NO_DEFAULT_PATH)
message ("-- mpi.h is in ${MPI_H}")
find_file(MPI_LIB NAMES ${mpi_lib_so} ${mpi_lib_a} PATHS ${mpi_base_lib_dir} PATH_SUFFIXES lib lib64 lib/x86_64-linux-gnu NO_DEFAULT_PATH)
message ("-- libmpi is ${MPI_LIB}")
if (NOT MPI_H OR NOT MPI_LIB)
set (MPI_MPICXX "MPI_MPICXX-NOTFOUND")
set (MPI_H "MPI_H-NOTFOUND")
set (MPI_LIB "MPI_LIB-NOTFOUND")
else()
add_definitions(-DMPI_SUPPORT)
include_directories(${mpi_inc_dir})
link_libraries(${MPI_LIB})
endif()
else()
message ("-- ${mpi_compiler} not found")
endif()
endmacro()
cmake_minimum_required(VERSION 3.16.3 FATAL_ERROR)
project(RCCL-tests VERSION 2.12.10 LANGUAGES CXX)
# Get ROCm path from environment if available
if (DEFINED ENV{ROCM_PATH})
set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to ROCm installation")
else()
set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to ROCm installation")
endif()
# Set CMake/CPack variables
list( APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/llvm)
set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "Prefix install path")
set(CPACK_PACKAGING_INSTALL_PREFIX "${ROCM_PATH}" CACHE PATH "Path to install to when packaged.")
set(CMAKE_CXX_STANDARD 14)
# Get additional packages required
find_package(ROCM 0.7.3 CONFIG REQUIRED PATHS "${ROCM_PATH}")
include(ROCMSetupVersion)
include(ROCMCreatePackage)
include(ROCMInstallTargets)
include(ROCMCheckTargetIds)
include(ROCMClients)
# Build variables
option(NO_MPI "Build RCCL-tests without MPI support.")
option(MPI_PATH "Use MPI in the specified directory.")
# Default GPU architectures to build
#==================================================================================================
set(DEFAULT_GPUS
gfx803
gfx900:xnack-
gfx906:xnack-
gfx908:xnack-
gfx90a:xnack-
gfx90a:xnack+
gfx940
gfx941
gfx942
gfx1030
gfx1100
gfx1101
gfx1102)
set(AMDGPU_TARGETS ${DEFAULT_GPUS} CACHE STRING "Target default GPUs if AMDGPU_TARGETS is not defined.")
## Determine which GPU architectures to build for
if (COMMAND rocm_check_target_ids)
message(STATUS "Checking for ROCm support for GPU targets:")
rocm_check_target_ids(SUPPORTED_GPUS TARGETS "${AMDGPU_TARGETS}")
else()
message(WARNING "Unable to check for supported GPU targets. Falling back to default GPUs")
set(SUPPORTED_GPUS ${DEFAULT_GPUS})
endif()
set(GPU_TARGETS "${SUPPORTED_GPUS}" CACHE STRING "List of specific GPU architectures to build for.")
message(STATUS "Compiling for ${GPU_TARGETS}")
find_package(RCCL HINTS CONFIG REQUIRED PATHS "${ROCM_PATH}")
if (NOT NO_MPI)
# Check for MPI_PATH first, since the user explicitly requested this directory
if (MPI_PATH)
set(mpi_spec_bin_dir "${MPI_PATH}/bin")
set(mpi_spec_inc_dir "${MPI_PATH}/include")
check_mpi(mpicxx libmpi.a libmpi.so ${mpi_spec_bin_dir} ${MPI_PATH} ${mpi_spec_inc_dir})
if (NOT MPI_MPICXX)
# Since the user explicitly requested this directory, abort if something went wrong.
MESSAGE(FATAL_ERROR "Could not find MPI in ${MPI_PATH}")
endif()
endif()
# Check for MPICH Ubuntu installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx.mpich libmpich.a libmpich.so /usr/bin /usr /usr/include/x86_64-linux-gnu/mpich)
endif()
# Check for Open MPI Ubuntu installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx.openmpi libmpi.a libmpi.so /usr/bin /usr/lib/x86_64-linux-gnu/openmpi /usr/lib/x86_64-linux-gnu/openmpi/include)
endif()
# Check for MPICH RHEL installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpich.a libmpich.so /usr/lib64/mpich/bin /usr/lib64/mpich /usr/include/mpich-x86_64)
endif()
# Check for Open MPI RHEL installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/openmpi/bin /usr/lib64/openmpi /usr/include/openmpi-x86_64)
endif()
# Check for MPICH SLES installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpich.a libmpich.so /usr/lib64/mpi/gcc/mpich/bin /usr/lib64/mpi/gcc/mpich /usr/lib64/mpi/gcc/mpich/include)
endif()
# Check for Open MPI v4 SLES installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi4/bin /usr/lib64/mpi/gcc/openmpi4 /usr/lib64/mpi/gcc/openmpi4/include)
endif()
# Check for Open MPI v3 SLES installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi3/bin /usr/lib64/mpi/gcc/openmpi3 /usr/lib64/mpi/gcc/openmpi3/include)
endif()
# Check for Open MPI v2 SLES installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi2/bin /usr/lib64/mpi/gcc/openmpi2 /usr/lib64/mpi/gcc/openmpi2/include)
endif()
if (NOT MPI_MPICXX)
message ("-- no MPI library found")
endif()
else()
message ("-- MPI support explicitely disabled")
endif()
set(ROCM_USE_DEV_COMPONENT OFF) # This repo doesn't have a dev component
# Add all of the tests
add_subdirectory(src)
# Create ROCm standard packages
rocm_create_package(
NAME rccl-tests
DESCRIPTION "Tests for the ROCm Communication Collectives Library"
MAINTAINER "RCCL Maintainer <rccl-maintainer@amd.com>"
)
Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION, nor the names of their
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
#
# See LICENCE.txt for license information
#
BUILDDIR ?= build
override BUILDDIR := $(abspath $(BUILDDIR))
.PHONY: all clean
default: src.build
TARGETS=$(filter-out src/hypercube.cu, $(wildcard src/*))
all: ${TARGETS:%=%.build}
clean: ${TARGETS:%=%.clean}
%.build:
${MAKE} -C $* build BUILDDIR=${BUILDDIR}
%.clean:
${MAKE} -C $* clean BUILDDIR=${BUILDDIR}
Notices and Licenses file
_______________________________________________________________
Dependencies on nvidia-nccl-tests v2.0.0 (BSD3)
Copyright (c) 2016-2017, NVIDIA CORPORATION.
Modifications Copyright (c) 2019 Advanced Micro Devices, Inc.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
Laboratory, the U.S. Department of Energy, nor the names of their
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The U.S. Department of Energy funded the development of this software
under subcontract 7078610 with Lawrence Berkeley National Laboratory.
nvidia-nccl-tests v2.0.0 (BSD2)
Copyright (c) 2016-2017, NVIDIA CORPORATION. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National
Laboratory, the U.S. Department of Energy, nor the names of their
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The U.S. Department of Energy funded the development of this software
under subcontract 7078610 with Lawrence Berkeley National Laboratory.
# RCCL Tests
These tests check both the performance and the correctness of RCCL operations. They can be compiled against [RCCL](https://github.com/ROCmSoftwarePlatform/rccl).
## Build
To build the tests, just type `make`.
If HIP is not installed in /opt/rocm, you may specify HIP\_HOME. Similarly, if RCCL is not installed in /usr, you may specify NCCL\_HOME and CUSTOM\_RCCL\_LIB.
```shell
$ make HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl CUSTOM_RCCL_LIB=/path/to/rccl/lib/librccl.so
```
RCCL tests rely on MPI to run on multiple processes, and hence on multiple nodes. If you want to compile the tests with MPI support, you need to set MPI=1 and set MPI\_HOME to the path where MPI is installed.
```shell
$ make MPI=1 MPI_HOME=/path/to/mpi HIP_HOME=/path/to/hip NCCL_HOME=/path/to/rccl
```
RCCL tests can also be built using cmake. A typical sequence will be:
```shell
$ mkdir build
$ cd build
$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/path/to/rccl ..
$ make
```
When using the cmake build procedure, make sure that RCCL has also been built with cmake (i.e. not with the install.sh script), since cmake checks
for the RCCL target and config files that are created during a cmake-based RCCL build.
The cmake method also checks automatically for MPI installations, so it is not necessary to explicitly request an MPI build. A specific MPI library can be
selected with the MPI_PATH variable, and MPI support can be explicitly disabled by adding the -DNO_MPI=1 flag to the cmake command line.
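For example, a configuration that points at a specific MPI installation, or one that disables MPI entirely, could look like this (all paths are placeholders):
```shell
$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/path/to/rccl -DMPI_PATH=/path/to/mpi ..
$ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_PREFIX_PATH=/path/to/rccl -DNO_MPI=1 ..
```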
## Usage
RCCL tests can run on multiple processes, multiple threads, and multiple HIP devices per thread. The number of processes is managed by MPI and is therefore not passed to the tests as an argument. The total number of ranks (i.e. HIP devices) is equal to (number of processes)\*(number of threads)\*(number of GPUs per thread).
### Quick examples
Run on 8 GPUs (`-g 8`), scanning from 8 bytes to 128 MB:
```shell
$ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
```
Run with MPI on 10 processes (potentially on multiple nodes) with 4 GPUs each, for a total of 40 GPUs:
```shell
$ mpirun -np 10 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
```
### Performance
See the [Performance](doc/PERFORMANCE.md) page for an explanation of the reported numbers, in particular the "busbw" column.
### Arguments
All tests support the same set of arguments :
* Number of GPUs
* `-t,--nthreads <num threads>` number of threads per process. Default : 1.
* `-g,--ngpus <GPUs per thread>` number of gpus per thread. Default : 1.
* Sizes to scan
* `-b,--minbytes <min size in bytes>` minimum size to start with. Default : 32M.
* `-e,--maxbytes <max size in bytes>` maximum size to end at. Default : 32M.
* Increments can be either fixed or a multiplication factor. Only one of those should be used
* `-i,--stepbytes <increment size>` fixed increment between sizes. Default : 1M.
* `-f,--stepfactor <increment factor>` multiplication factor between sizes. Default : disabled.
* RCCL operations arguments
* `-o,--op <sum/prod/min/max/avg/all>` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum.
* `-d,--datatype <nccltype/all>` Specify which datatype to use. Default : Float.
* `-r,--root <root/all>` Specify which root to use. Only for operations with a root like broadcast or reduce. Default : 0.
* `-y,--memory_type <coarse/fine/host/managed>` Default: Coarse
* `-s,--stress_cycles <number of cycles>` Default: 1
* `-u,--cumask <d0,d1,d2,d3>` Default: None
* Performance
* `-n,--iters <iteration count>` number of iterations. Default : 20.
* `-w,--warmup_iters <warmup iteration count>` number of warmup iterations (not timed). Default : 5.
* `-m,--agg_iters <aggregation count>` number of operations to aggregate together in each iteration. Default : 1.
* `-a,--average <0/1/2/3>` Report performance as an average across all ranks (MPI=1 only). <0=Rank0,1=Avg,2=Min,3=Max>. Default : 1.
* Test operation
* `-p,--parallel_init <0/1>` use threads to initialize RCCL in parallel. Default : 0.
* `-c,--check <0/1>` check correctness of results. This can be quite slow on large numbers of GPUs. Default : 1.
* `-z,--blocking <0/1>` Make RCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
* `-G,--cudagraph <num graph launches>` Capture iterations as a CUDA graph and then replay specified number of times. Default : 0.
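As an illustration, a hypothetical invocation combining several of these arguments (2 threads with 2 GPUs each, summing floats from 8 bytes to 128 MB, 50 timed iterations) could look like:
```shell
$ ./build/all_reduce_perf -t 2 -g 2 -o sum -d float -b 8 -e 128M -f 2 -n 50
```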
## Unit tests
Unit tests for rccl-tests are implemented with pytest (python3 is also required). Several notes for the unit tests:
1. The LD_LIBRARY_PATH environment variable will need to be set to include /path/to/rccl-install/lib/ in order to run the unit tests.
2. The HSA_FORCE_FINE_GRAIN_PCIE environment variable will need to be set to 1 in order to run the unit tests which use fine-grained memory type.
The unit tests can be invoked within the rccl-tests root, or in the test subfolder. An example call to the unit tests:
```shell
$ LD_LIBRARY_PATH=/path/to/rccl-install/lib/ HSA_FORCE_FINE_GRAIN_PCIE=1 python3 -m pytest
```
## Copyright
RCCL tests are provided under the BSD license.
All source code and accompanying documentation is copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
All modifications are copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
# Performance reported by RCCL tests
RCCL tests report the average operation time in ms, and two bandwidths in GB/s : algorithm bandwidth and bus bandwidth. This page explains what those numbers mean and what you should expect depending on the hardware used.
# Time
Time is useful for small sizes, to measure the constant overhead (or latency) associated with each operation.
For large sizes, the time grows linearly with the size (it is roughly equal to overhead + size / bw), so it no longer measures just the latency but mostly the size divided by the bandwidth.
Therefore, for large sizes, it makes more sense to look at the bandwidth.
# Bandwidth
## Algorithm bandwidth
Algorithm bandwidth uses the most common formula for bandwidth: size (_S_) / time (_t_). It is useful for estimating how long any large operation would take: simply divide the size of the operation by the algorithm bandwidth.
`algbw = S/t`
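For example, if an operation on _S_ = 4 GB of data completes in _t_ = 0.5 s, then `algbw = 4 GB / 0.5 s = 8 GB/s` (numbers chosen purely for illustration).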
## Bus bandwidth
While algorithm bandwidth makes sense for point-to-point operations like Send/Receive, it is not always helpful for measuring the speed of collective operations, since the theoretical peak algorithm bandwidth is not equal to the hardware peak bandwidth and usually depends on the number of ranks.
Most benchmarks only provide time measurements, which are hard to interpret for large sizes. Some also provide algorithm bandwidth, but that bandwidth varies with the number of ranks (and decreases as the number of ranks increases).
To provide a number which reflects how optimally the hardware is used, RCCL tests introduce the notion of "Bus Bandwidth" ("busbw" column in the tests output).
This number is obtained by applying a formula to the algorithm bandwidth to reflect the speed of the inter-GPU communication.
The resulting bus bandwidth can be compared with the hardware peak bandwidth, independently of the number of ranks used.
The formula depends on the collective operation.
### AllReduce
An allreduce operation performs, for each element of the n arrays (input i_X and output o_X, each located on rank X), the following operation:
`o_0 = o_1 = o_2 = ... = o_{n-1} = i_0 + i_1 + i_2 + ... + i_{n-1}`
**Note : this is independent of the algorithm used (ring, tree, or other) as long as they use point-to-point operations (send/receive).**
A ring would do that operation in an order which follows the ring :
`i_0 + i_1 + ... + i_{n-1} -> o_{n-1} -> o_0 -> o_1 -> .. -> o_{n-2}`
A tree would do it hierarchically :
`(((i_{n-1} + i_{n-2}) + (i_{n-3} + i_{n-4})) + ... + (i_1 + i_0)) -> o_0 -> (o_{n/2} -> (o_{3n/4} ...))`
In all cases, we need n-1 additions and n assignments for each element. Since every step is on a different rank except potentially one (the last input and the first output),
we need 2(n-1) data transfers (times the number of elements) to perform an allReduce operation.
Considering that each rank has a bandwidth to the outside world of _B_, the time to perform an allReduce operation of _S_ elements is at best :
`t = (S*2*(n-1)) / (n*B)`
Indeed, we have _S_ elements, 2*(n-1) operations per element, and _n_ links of bandwidth _B_ to perform them.
Reordering the equation, we find that
`t = (S/B) * (2*(n-1)/n)`
Therefore, to get an AllReduce bandwidth measurement which we can compare to the hardware peak bandwidth, we compute :
`B = S/t * (2*(n-1)/n) = algbw * (2*(n-1)/n)`
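For example, with _n_ = 8 ranks the correction factor is `2*(8-1)/8 = 1.75`, so an allReduce measured at `algbw = 10 GB/s` corresponds to `busbw = 17.5 GB/s` (illustrative numbers).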
### ReduceScatter
The ReduceScatter operation requires only to perform the addition part of the allReduce operation :
`o_K = i_0 + i_1 + i_2 + ... + i_{n-1}`
With K being the rank that gets the final result (K = offset/recvsize).
The perfect reduceScatter time with a rank bandwidth of B would therefore be :
`t = S*(n-1) / (B*n)`
And the Bus Bandwidth is therefore computed as :
`B = S/t * (n-1)/n = algbw * (n-1)/n`
Note that here, S is the size in bytes of the total array, which for RCCL is equal to `recvcount*sizeof(datatype)*n` as the `recvcount` argument is the count per rank.
### AllGather
The AllGather operation requires only to perform the assignment part of the allReduce operation :
`o_0 = o_1 = o_2 = ... = o_{n-1} = i_K`
With K being the rank the data originates from (K = offset/sendsize).
The perfect allGather time with a rank bandwidth of B would therefore be :
`t = S*(n-1) / (B*n)`
And the Bus Bandwidth is therefore computed as :
`B = S/t * (n-1)/n = algbw * (n-1)/n`
Note that here, S is the size in bytes of the total array, which for RCCL is equal to `sendcount*sizeof(datatype)*n` as the `sendcount` argument is the count per rank.
### Broadcast
The broadcast operation representation is similar to allGather :
`o_0 = o_1 = o_2 = ... = o_{n-1} = i_R`
R being the root of the operation.
However, in this case, since the i_R input is not evenly distributed across the ranks, we cannot use all n links to perform the transfer operations.
Indeed, *all* data has to leave the root rank, so the bottleneck is the root rank, which only has a bandwidth of B to send data out:
`t = S/B`
And :
`B = S/t`
### Reduce
The reduce operation performs :
`o_R = i_0 + i_1 + i_2 + ... + i_{n-1}`
R being the root of the operation.
Similarly to broadcast, all data needs to be sent to the root, hence:
`t = S/B`
And :
`B = S/t`
### Summary
To obtain a bus bandwidth which should be independent of the number of ranks _n_, we apply a correction factor to the algorithm bandwidth :
* AllReduce : 2*(_n_-1)/_n_
* ReduceScatter : (_n_-1)/_n_
* AllGather : (_n_-1)/_n_
* Broadcast : 1
* Reduce : 1
The bus bandwidth should reflect the speed of the hardware bottleneck : NVLink, PCI, QPI, or network.
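As an illustration, with _n_ = 8 ranks the factors are 1.75 for AllReduce, 0.875 for ReduceScatter and AllGather, and 1 for Broadcast and Reduce; as _n_ grows, the first approaches 2 and the others approach or stay at 1.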
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
# This Dockerfile provides a starting point for a ROCm installation of rccl
# Parameters related to building rccl
ARG base_image
FROM ${base_image}
LABEL maintainer="rccl-maintainer@amd.com"
USER root
ARG user_uid
# Install dependent packages
RUN yum install -y --nogpgcheck \
sudo \
chrpath \
rock-dkms \
rocm-cmake \
centos-release-scl \
devtoolset-7 \
ca-certificates \
git \
cmake3 \
make \
libgomp \
clang \
clang-devel \
gcc-c++ \
pkgconfig \
numactl-libs
RUN echo '#!/bin/bash' | tee /etc/profile.d/devtoolset7.sh && echo \
'source scl_source enable devtoolset-7' >>/etc/profile.d/devtoolset7.sh
# The docker pipeline runs containers with a particular uid;
# create a jenkins user with this specific uid so it can use sudo privileges.
# Grant any member of the video group password-less sudo privileges
RUN useradd --create-home -u ${user_uid} -o -G video --shell /bin/bash jenkins && \
echo '%video ALL=(ALL) NOPASSWD:ALL' | tee /etc/sudoers.d/sudo-nopasswd && \
chmod 400 /etc/sudoers.d/sudo-nopasswd
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
# Parameters related to building rccl
ARG base_image
FROM ${base_image}
LABEL maintainer="rccl-maintainer@amd.com"
ARG user_uid
# Install dependent packages
# Dependencies:
# * hcc-config.cmake: pkg-config
# * tensile: python2.7, python-yaml
# * rocblas-test: gfortran, googletest
# * rocblas-bench: libboost-program-options-dev
# * libhsakmt.so: libnuma1
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
rock-dkms \
sudo \
ca-certificates \
chrpath \
git \
make \
cmake \
pkg-config \
python2.7 \
python-yaml \
python3-pytest \
rocm-cmake \
libboost-program-options-dev \
libnuma1 \
libomp-dev \
&& \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# The docker pipeline runs containers with a particular uid;
# create a jenkins user with this specific uid so it can use sudo privileges.
# Grant any member of the video group password-less sudo privileges
RUN useradd --create-home -u ${user_uid} -o -G video --shell /bin/bash jenkins && \
mkdir -p /etc/sudoers.d/ && \
echo '%video ALL=(ALL) NOPASSWD:ALL' | tee /etc/sudoers.d/sudo-nopasswd
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
# Parameters related to building rccl
ARG base_image
FROM ${base_image}
LABEL maintainer="rccl-maintainer@amd.com"
#empty for now
# Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
# Parameters related to building rccl
ARG base_image
FROM ${base_image}
LABEL maintainer="rccl-maintainer@amd.com"
#empty for now
#!/bin/bash
# Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
# #################################################
# helper functions
# #################################################
function display_help()
{
echo "RCCL-tests build & installation helper script"
echo "./install [-h|--help] "
echo " [-h|--help] Prints this help message."
echo " [-m|--mpi] Build RCCL-tests with MPI support. (see --mpi_home below.)"
echo " [--rccl_home] Specify custom path for RCCL installation (default: /opt/rocm/rccl)"
echo " [--mpi_home] Specify path to your MPI installation."
}
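# Example invocations (paths are illustrative):
#   ./install -h
#   ./install --mpi --mpi_home=/path/to/mpi --rccl_home=/opt/rocm/rccl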
# #################################################
# global variables
# #################################################
run_tests=false
build_release=true
mpi_enabled=false
rccl_dir=/opt/rocm/rccl
mpi_dir=""
# #################################################
# Parameter parsing
# #################################################
# check if we have a modern version of getopt that can handle whitespace and long parameters
getopt -T
if [[ $? -eq 4 ]]; then
GETOPT_PARSE=$(getopt --name "${0}" --longoptions help,mpi,test,rccl_home:,mpi_home: --options hmt -- "$@")
else
echo "Need a new version of getopt"
exit 1
fi
if [[ $? -ne 0 ]]; then
echo "getopt invocation failed; could not parse the command line";
exit 1
fi
eval set -- "${GETOPT_PARSE}"
while true; do
case "${1}" in
-h|--help)
display_help
exit 0
;;
-m|--mpi)
mpi_enabled=true
shift ;;
-t|--test)
run_tests=true
shift ;;
--rccl_home)
rccl_dir=${2}
shift 2 ;;
--mpi_home)
mpi_dir=${2}
shift 2 ;;
--) shift ; break ;;
*) echo "Unexpected command line parameter received; aborting";
exit 1
;;
esac
done
# exit with the returned error code if a command in the install script failed
check_exit_code( )
{
if (( $1 != 0 )); then
exit $1
fi
}
# Install the pre-commit hook
#bash ./githooks/install
build_dir=./build
# #################################################
# prep
# #################################################
# ensure a clean build environment
rm -rf ${build_dir}
if ($mpi_enabled); then
if [[ ${mpi_dir} == "" ]]; then
echo "MPI flag enabled but path to MPI installation not specified. See --mpi_home command line argument."
exit 1
else
make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so MPI=1 MPI_HOME=${mpi_dir} -j$(nproc)
fi
else
make NCCL_HOME=${rccl_dir} CUSTOM_RCCL_LIB=${rccl_dir}/lib/librccl.so -j$(nproc)
fi
check_exit_code "$?"
# Optionally, run tests if they're enabled.
if ($run_tests); then
if ($mpi_enabled); then
cd test; LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${rccl_dir}/lib:${mpi_dir}/lib PATH=$PATH:${mpi_dir}/bin python3 -m pytest
else
cd test; LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${rccl_dir}/lib python3 -m pytest
fi
fi
# ########################################################################
# Copyright 2022 Advanced Micro Devices, Inc.
# ########################################################################
# Compile common object library
set_property(SOURCE common.cu timer.cc ../verifiable/verifiable.cu PROPERTY LANGUAGE CXX)
add_library(rccl_common OBJECT common.cu timer.cc ../verifiable/verifiable.cu)
target_link_libraries(rccl_common roc::rccl hip::device)
if(USE_MPI)
target_link_libraries(rccl_common MPI::MPI_CXX)
endif()
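# add_relative_test registers a CTest entry for the given target using a path relative
# to the current binary directory, resolving the target's runtime output directory and
# output name if they were customized.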
function(add_relative_test test_name test_target)
get_target_property(EXE_PATH ${test_target} RUNTIME_OUTPUT_DIRECTORY)
if(EXE_PATH STREQUAL "EXE_PATH-NOTFOUND")
set(EXE_PATH ".")
endif()
get_filename_component(EXE_PATH "${EXE_PATH}" ABSOLUTE BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
get_target_property(EXE_NAME ${test_target} RUNTIME_OUTPUT_NAME)
if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
get_target_property(EXE_NAME ${test_target} OUTPUT_NAME)
if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
set(EXE_NAME "${test_target}")
endif()
endif()
file(RELATIVE_PATH rel_path "${CMAKE_CURRENT_BINARY_DIR}" "${EXE_PATH}/${EXE_NAME}")
add_test(NAME "${test_name}" COMMAND "./${rel_path}")
endfunction()
function(add_rccl_test TEST)
set(TEST_SOURCE "${TEST}.cu")
set_property(SOURCE ${TEST_SOURCE} PROPERTY LANGUAGE CXX)
set(TEST_TARGET "${TEST}_perf")
add_executable(${TEST_TARGET} ${TEST_SOURCE})
target_link_libraries(
${TEST_TARGET}
PRIVATE
rccl_common
)
set_target_properties(
${TEST_TARGET}
PROPERTIES
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
# LINKER_LANGUAGE CXX
)
add_relative_test(${TEST} ${TEST_TARGET})
rocm_install(TARGETS ${TEST_TARGET})
# TODO: copy/install DLLs on Windows
set_target_properties(
${TEST_TARGET} PROPERTIES
INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib;${ROCM_PATH}/lib"
)
endfunction()
add_rccl_test(all_gather)
add_rccl_test(all_reduce)
add_rccl_test(alltoall)
add_rccl_test(alltoallv)
add_rccl_test(broadcast)
add_rccl_test(gather)
add_rccl_test(hypercube)
add_rccl_test(reduce_scatter)
add_rccl_test(reduce)
add_rccl_test(scatter)
add_rccl_test(sendrecv)
#
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE.txt for license information
#
ROCM_PATH ?= /opt/rocm
MPI_HOME ?= /usr/lib/openmpi
PREFIX ?= /usr/local
VERBOSE ?= 0
DEBUG ?= 0
NCCL_HOME ?= ""
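# Example invocations (paths are illustrative):
#   make NCCL_HOME=/opt/rocm/rccl CUSTOM_RCCL_LIB=/opt/rocm/rccl/lib/librccl.so
#   make MPI=1 MPI_HOME=/usr/lib/x86_64-linux-gnu/openmpi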
HIPCC = $(ROCM_PATH)/bin/hipcc
CXX = $(HIPCC)
HIPCUFLAGS := -std=c++14
LDFLAGS :=
HIPLDFLAGS :=
ifneq ($(NCCL_HOME), "")
HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) -L$(NCCL_HOME)/lib
endif
HIPCUFLAGS += -I$(ROCM_PATH)/include
HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
LDFLAGS += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt -pthread
ifeq ($(DEBUG), 0)
HIPCUFLAGS += -O3
else
HIPCUFLAGS += -O0 -g -ggdb3
endif
ifeq ($(VERBOSE), 0)
.SILENT:
endif
.PHONY: build clean
BUILDDIR ?= ../build
ifeq ($(MPI), 1)
HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi
else ifeq ($(MPICH), 1)
HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich -I/usr/include/x86_64-linux-gnu/mpich
HIPLDFLAGS += -L/usr/lib -lmpich
endif
LIBRARIES += rccl
HIPLDFLAGS += $(LIBRARIES:%=-l%)
DST_DIR := $(BUILDDIR)
SRC_FILES := $(wildcard *.cu)
OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv alltoallv
BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
build: ${BIN_FILES}
clean:
rm -rf ${DST_DIR}
TEST_VERIFIABLE_SRCDIR := ../verifiable
TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable
include ../verifiable/verifiable.mk
${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS)
@printf "Compiling %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"
$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<
${DST_DIR}/timer.o: timer.cc timer.h
@printf "Compiling %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
$(CXX) $(CXXFLAGS) -o $@ -c timer.cc
${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS)
@printf "Linking %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
echo "$(HIPCC) -o $@ $(HIPCUFLAGS) $^ ${HIPLDFLAGS}"
$(HIPCC) -o $@ $(HIPCUFLAGS) $^ ${HIPLDFLAGS}
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
#define ALIGN 4
void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
size_t base = (count/(ALIGN*nranks))*ALIGN;
*sendcount = base;
*recvcount = base*nranks;
*sendInplaceOffset = base;
*recvInplaceOffset = 0;
*paramcount = base;
}
testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
int k=0;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
for (int j=0; j<nranks; j++) {
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
}
k++;
}
HIPCHECK(hipDeviceSynchronize());
}
return testSuccess;
}
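// Bus bandwidth for AllGather: algbw counts the total gathered data (count*typesize*nranks bytes);
// busbw applies the (n-1)/n correction factor described in doc/PERFORMANCE.md.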
void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * typesize * nranks) / 1.0E9 / sec;
*algBw = baseBw;
double factor = ((double)(nranks - 1))/((double)nranks);
*busBw = baseBw * factor;
}
testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream));
return testSuccess;
}
struct testColl allGatherTest = {
"AllGather",
AllGatherGetCollByteCount,
AllGatherInitData,
AllGatherGetBw,
AllGatherRunColl
};
void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
AllGatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
args->collTest = &allGatherTest;
ncclDataType_t *run_types;
const char **run_typenames;
int type_count;
if ((int)type != -1) {
type_count = 1;
run_types = &type;
run_typenames = &typeName;
} else {
type_count = test_typenum;
run_types = test_types;
run_typenames = test_typenames;
}
for (int i=0; i<type_count; i++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", -1));
}
return testSuccess;
}
struct testEngine ncclTestEngine = {
AllGatherGetBuffSize,
AllGatherRunTest
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count;
*recvcount = count;
*sendInplaceOffset = 0;
*recvInplaceOffset = 0;
*paramcount = *sendcount;
}
testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
int k = 0;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
TESTCHECK(InitDataReduce(args->expected[k], recvcount, 0, type, op, rep, nranks));
k++;
}
HIPCHECK(hipDeviceSynchronize());
}
return testSuccess;
}
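// Bus bandwidth for AllReduce: algbw is S/t; busbw applies the 2*(n-1)/n correction factor
// described in doc/PERFORMANCE.md.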
void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * typesize) / 1.0E9 / sec;
*algBw = baseBw;
double factor = ((double)(2*(nranks - 1)))/((double)nranks);
*busBw = baseBw * factor;
}
testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream));
return testSuccess;
}
struct testColl allReduceTest = {
"AllReduce",
AllReduceGetCollByteCount,
AllReduceInitData,
AllReduceGetBw,
AllReduceRunColl
};
void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
AllReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
args->collTest = &allReduceTest;
ncclDataType_t *run_types;
ncclRedOp_t *run_ops;
const char **run_typenames, **run_opnames;
int type_count, op_count;
if ((int)type != -1) {
type_count = 1;
run_types = &type;
run_typenames = &typeName;
} else {
type_count = test_typenum;
run_types = test_types;
run_typenames = test_typenames;
}
if ((int)op != -1) {
op_count = 1;
run_ops = &op;
run_opnames = &opName;
} else {
op_count = test_opnum;
run_ops = test_ops;
run_opnames = test_opnames;
}
for (int i=0; i<type_count; i++) {
for (int j=0; j<op_count; j++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], run_ops[j], run_opnames[j], -1));
}
}
return testSuccess;
}
struct testEngine ncclTestEngine = {
AllReduceGetBuffSize,
AllReduceRunTest
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = (count/nranks)*nranks;
*recvcount = (count/nranks)*nranks;
*sendInplaceOffset = 0;
*recvInplaceOffset = 0;
*paramcount = count/nranks;
}
testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
int k=0;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
for (int j=0; j<nranks; j++) {
size_t partcount = sendcount/nranks;
TESTCHECK(InitData(((char*)args->expected[k])+ j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0));
}
k++;
}
HIPCHECK(hipDeviceSynchronize());
}
// We don't support in-place alltoall
args->reportErrors = in_place ? 0 : 1;
return testSuccess;
}
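// Bus bandwidth for AlltoAll: algbw counts the full exchanged matrix (count*nranks*typesize bytes);
// the (n-1)/n factor accounts for each rank keeping 1/n of its data locally.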
void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec;
*algBw = baseBw;
double factor = ((double)(nranks-1))/((double)(nranks));
*busBw = baseBw * factor;
}
testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
NCCLCHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream));
return testSuccess;
}
struct testColl alltoAllTest = {
"AlltoAll",
AlltoAllGetCollByteCount,
AlltoAllInitData,
AlltoAllGetBw,
AlltoAllRunColl
};
void AlltoAllGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
AlltoAllGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
args->collTest = &alltoAllTest;
ncclDataType_t *run_types;
const char **run_typenames;
int type_count;
if ((int)type != -1) {
type_count = 1;
run_types = &type;
run_typenames = &typeName;
} else {
type_count = test_typenum;
run_types = test_types;
run_typenames = test_typenames;
}
for (int i=0; i<type_count; i++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", -1));
}
return testSuccess;
}
struct testEngine ncclTestEngine = {
AlltoAllGetBuffSize,
AlltoAllRunTest
};
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
#define USE_RCCL_GATHER_SCATTER
void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
if (count < nranks*nranks/2) {
*sendcount = 0;
*recvcount = 0;
*sendInplaceOffset = 0;
*recvInplaceOffset = 0;
*paramcount = 0;
} else {
*sendcount = (count/nranks)*nranks;
*recvcount = (count/nranks)*nranks;
*sendInplaceOffset = 0;
*recvInplaceOffset = 0;
*paramcount = count/nranks;
}
}
testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
int k=0;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep+rank, 1, 0));
#if 0
int *dataHost = (int *)malloc(args->sendBytes);
hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost);
printf(" Rank [%d] Original: ", rank);
for(int j=0; j<sendcount; j++) {
printf("%d:%d ", j, dataHost[j]);
}
printf("\n");
free(dataHost);
#endif
size_t rdisp = 0;
size_t data_count = sendcount*2/nranks;
size_t chunksize = data_count/nranks;
for (int j=0; j<nranks; j++) {
size_t scount = 0, rcount = ((j+rank)%nranks)*chunksize;
if ((j+rank)%nranks == 0)
rcount += (sendcount-chunksize*(nranks-1)*nranks/2);
size_t sdisp = 0;
for (int kk=0; kk<nranks; kk++) {
scount = ((kk+j)%nranks)*chunksize;
if ((kk+j)%nranks == 0)
scount += (sendcount-chunksize*(nranks-1)*nranks/2);
if (kk == rank)
break;
sdisp += scount;
}
TESTCHECK(InitData(((char*)args->expected[k])+rdisp*wordSize(type), rcount, sdisp, type, ncclSum, 33*rep+j, 1, 0));
rdisp += rcount;
}
k++;
}
HIPCHECK(hipDeviceSynchronize());
}
// We don't support in-place alltoall
args->reportErrors = in_place ? 0 : 1;
return testSuccess;
}
void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * nranks * typesize) / 1.0E9 / sec;
*algBw = baseBw;
double factor = ((double)(nranks-1))/((double)(nranks));
*busBw = baseBw * factor;
}
testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
int nranks;
NCCLCHECK(ncclCommCount(comm, &nranks));
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
if (count == 0) return testSuccess;
size_t *sendcounts, *recvcounts, *sdispls, *rdispls;
sendcounts = (size_t *)malloc(nranks*nranks*sizeof(size_t));
recvcounts = (size_t *)malloc(nranks*nranks*sizeof(size_t));
sdispls = (size_t *)malloc(nranks*nranks*sizeof(size_t));
rdispls = (size_t *)malloc(nranks*nranks*sizeof(size_t));
if (sendcounts == nullptr || recvcounts == nullptr || sdispls == nullptr || rdispls == nullptr) {
printf("failed to allocate buffers for alltoallv\n");
return testNcclError;
}
size_t disp = 0;
size_t chunksize = count*2/nranks;
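// Build a deterministic variable-size pattern: this rank exchanges ((i+rank)%nranks)*chunksize
// elements with rank i, with the remainder folded into the chunk where (i+rank)%nranks == 0
// so that the counts sum to count*nranks.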
for (int i = 0; i < nranks; i++) {
size_t scount = ((i+rank)%nranks)*chunksize;
if ((i+rank)%nranks == 0)
scount += (count*nranks-chunksize*(nranks-1)*nranks/2);
sendcounts[i+rank*nranks] = recvcounts[i+rank*nranks] = scount;
sdispls[i+rank*nranks] = rdispls[i+rank*nranks] = disp;
disp += scount;
//printf("%d->%d: sendcounts/recvcounts %lx sdispls/rdispls %lx\n", rank, i, sendcounts[i+rank*nranks]*wordSize(type), sdispls[i+rank*nranks]*wordSize(type));
}
#if NCCL_VERSION_CODE < NCCL_VERSION(2,7,0)
printf("NCCL 2.7 or later is needed for alltoallv. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR);
return testNcclError;
#else
#if defined(RCCL_ALLTOALLV) && defined(USE_RCCL_GATHER_SCATTER)
NCCLCHECK(ncclAllToAllv(sendbuff, sendcounts+rank*nranks, sdispls+rank*nranks, recvbuff, recvcounts+rank*nranks, rdispls+rank*nranks, type, comm, stream));
#else
NCCLCHECK(ncclGroupStart());
for (int r=0; r<nranks; r++) {
if (sendcounts[r+rank*nranks] != 0) {
NCCLCHECK(ncclSend(
((char*)sendbuff) + sdispls[r+rank*nranks] * wordSize(type),
sendcounts[r+rank*nranks],
type,
r,
comm,
stream));
}
if (recvcounts[r+rank*nranks] != 0) {
NCCLCHECK(ncclRecv(
((char*)recvbuff) + rdispls[r+rank*nranks] * wordSize(type),
recvcounts[r+rank*nranks],
type,
r,
comm,
stream));
}
}
NCCLCHECK(ncclGroupEnd());
#endif
#endif
free(sendcounts);
free(recvcounts);
free(sdispls);
free(rdispls);
return testSuccess;
}
struct testColl alltoAllTest = {
"AlltoAllv",
AlltoAllvGetCollByteCount,
AlltoAllvInitData,
AlltoAllvGetBw,
AlltoAllvRunColl
};
void AlltoAllvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
AlltoAllvGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
testResult_t AlltoAllvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
args->collTest = &alltoAllTest;
ncclDataType_t *run_types;
const char **run_typenames;
int type_count;
if ((int)type != -1) {
type_count = 1;
run_types = &type;
run_typenames = &typeName;
} else {
type_count = ncclNumTypes;
run_types = test_types;
run_typenames = test_typenames;
}
for (int i=0; i<type_count; i++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
}
return testSuccess;
}
struct testEngine ncclTestEngine = {
AlltoAllvGetBuffSize,
AlltoAllvRunTest
};
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count;
*recvcount = count;
*sendInplaceOffset = 0;
*recvInplaceOffset = 0;
*paramcount = *sendcount;
}
testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
int k=0;
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
TESTCHECK(InitData(args->expected[k], recvcount, 0, type, ncclSum, rep, 1, 0));
k++;
}
HIPCHECK(hipDeviceSynchronize());
}
return testSuccess;
}
void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
double baseBw = (double)(count * typesize) / 1.0E9 / sec;
*algBw = baseBw;
double factor = 1;
*busBw = baseBw * factor;
}
testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
NCCLCHECK(ncclBroadcast(sendbuff, recvbuff, count, type, root, comm, stream));
#else
if (rank == root) {
NCCLCHECK(ncclBcast(sendbuff, count, type, root, comm, stream));
} else {
NCCLCHECK(ncclBcast(recvbuff, count, type, root, comm, stream));
}
#endif
return testSuccess;
}
struct testColl broadcastTest = {
"Broadcast",
BroadcastGetCollByteCount,
BroadcastInitData,
BroadcastGetBw,
BroadcastRunColl
};
void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
size_t paramcount, sendInplaceOffset, recvInplaceOffset;
BroadcastGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
args->collTest = &broadcastTest;
ncclDataType_t *run_types;
const char **run_typenames;
int type_count;
int begin_root, end_root;
if ((int)type != -1) {
type_count = 1;
run_types = &type;
run_typenames = &typeName;
} else {
type_count = test_typenum;
run_types = test_types;
run_typenames = test_typenames;
}
if (root != -1) {
begin_root = end_root = root;
} else {
begin_root = 0;
end_root = args->nProcs*args->nThreads*args->nGpus-1;
}
for (int i=0; i<type_count; i++) {
for (int j=begin_root; j<=end_root; j++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
}
}
return testSuccess;
}
struct testEngine ncclTestEngine = {
BroadcastGetBuffSize,
BroadcastRunTest
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef __COMMON_H__
#define __COMMON_H__
#include "rccl/rccl.h"
#include <stdio.h>
#include <cstdint>
#include <algorithm>
#ifdef MPI_SUPPORT
#include "mpi.h"
#endif
#include <pthread.h>
#include "nccl1_compat.h"
#include "timer.h"
// For nccl.h < 2.13 since we define a weak fallback
extern "C" char const* ncclGetLastError(ncclComm_t comm);
#define HIPCHECK(cmd) do { \
hipError_t e = cmd; \
if( e != hipSuccess ) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
printf("%s: Test HIP failure %s:%d '%s'\n", \
hostname, \
__FILE__,__LINE__,hipGetErrorString(e)); \
return testCudaError; \
} \
} while(0)
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,13,0)
#define NCCLCHECK(cmd) do { \
ncclResult_t res = cmd; \
if (res != ncclSuccess) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
printf("%s: Test NCCL failure %s:%d " \
"'%s / %s'\n", \
hostname,__FILE__,__LINE__, \
ncclGetErrorString(res), \
ncclGetLastError(NULL)); \
return testNcclError; \
} \
} while(0)
#else
#define NCCLCHECK(cmd) do { \
ncclResult_t res = cmd; \
if (res != ncclSuccess) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
printf("%s: Test NCCL failure %s:%d '%s'\n", \
hostname, \
__FILE__,__LINE__,ncclGetErrorString(res)); \
return testNcclError; \
} \
} while(0)
#endif
typedef enum {
testSuccess = 0,
testInternalError = 1,
testCudaError = 2,
testNcclError = 3,
testTimeout = 4,
testNumResults = 5
} testResult_t;
// Relay errors up and trace
#define TESTCHECK(cmd) do { \
testResult_t r = cmd; \
if (r!= testSuccess) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
printf(" .. %s pid %d: Test failure %s:%d\n", \
hostname, getpid(), \
__FILE__,__LINE__); \
return r; \
} \
} while(0)
struct testColl {
const char name[20];
void (*getCollByteCount)(
size_t *sendcount, size_t *recvcount, size_t *paramcount,
size_t *sendInplaceOffset, size_t *recvInplaceOffset,
size_t count, int nranks);
testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type,
ncclRedOp_t op, int root, int rep, int in_place);
void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
};
extern struct testColl allReduceTest;
extern struct testColl allGatherTest;
extern struct testColl reduceScatterTest;
extern struct testColl broadcastTest;
extern struct testColl reduceTest;
extern struct testColl alltoAllTest;
struct testEngine {
void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks);
testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type,
const char* typeName, ncclRedOp_t op, const char* opName);
};
extern struct testEngine ncclTestEngine;
struct threadArgs {
size_t nbytes;
size_t minbytes;
size_t maxbytes;
size_t stepbytes;
size_t stepfactor;
int totalProcs;
int nProcs;
int proc;
int nThreads;
int thread;
int nGpus;
int* gpus;
int localRank;
int localNumDevices;
int enable_multiranks;
int enable_out_of_place;
int nRanks;
void** sendbuffs;
size_t sendBytes;
size_t sendInplaceOffset;
void** recvbuffs;
size_t recvInplaceOffset;
ncclUniqueId ncclId;
ncclComm_t* comms;
hipStream_t* streams;
void** expected;
size_t expectedBytes;
int* errors;
double* bw;
int* bw_count;
int reportErrors;
struct testColl* collTest;
};
typedef testResult_t (*threadFunc_t)(struct threadArgs* args);
struct testThread {
pthread_t thread;
threadFunc_t func;
struct threadArgs args;
testResult_t ret;
};
// Provided by common.cu
extern void Barrier(struct threadArgs* args);
extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root);
extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks);
extern testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank);
extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks);
#include <unistd.h>
static void getHostName(char* hostname, int maxlen) {
gethostname(hostname, maxlen);
for (int i=0; i< maxlen; i++) {
if (hostname[i] == '.') {
hostname[i] = '\0';
return;
}
}
}
#include <stdint.h>
static uint64_t getHash(const char* string, size_t n) {
// Based on DJB2a, result = result * 33 ^ char
uint64_t result = 5381;
for (size_t c = 0; c < n; c++) {
result = ((result << 5) + result) ^ string[c];
}
return result;
}
/* Generate a hash of the unique identifying string for this host
* that will be unique for both bare-metal and container instances
* Equivalent of a hash of;
*
* $(hostname)$(cat /proc/sys/kernel/random/boot_id)
*
*/
#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
static uint64_t getHostHash(const char* hostname) {
char hostHash[1024];
// The fallback is the hostname if reading the boot id fails
(void) strncpy(hostHash, hostname, sizeof(hostHash));
int offset = strlen(hostHash);
FILE *file = fopen(HOSTID_FILE, "r");
if (file != NULL) {
char *p;
if (fscanf(file, "%ms", &p) == 1) {
strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
free(p);
}
// Close only if the file was successfully opened (avoids fclose(NULL))
fclose(file);
}
// Make sure the string is terminated
hostHash[sizeof(hostHash)-1]='\0';
return getHash(hostHash, strlen(hostHash));
}
static size_t wordSize(ncclDataType_t type) {
switch(type) {
case ncclChar:
#if NCCL_MAJOR >= 2
//case ncclInt8:
case ncclUint8:
#endif
return 1;
case ncclHalf:
#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
case ncclBfloat16:
#endif
//case ncclFloat16:
return 2;
case ncclInt:
case ncclFloat:
#if NCCL_MAJOR >= 2
//case ncclInt32:
case ncclUint32:
//case ncclFloat32:
#endif
return 4;
case ncclInt64:
case ncclUint64:
case ncclDouble:
//case ncclFloat64:
return 8;
default: return 0;
}
}
extern int test_ncclVersion; // init'd with ncclGetVersion()
typedef enum { ncclCoarse = 0,
ncclFine = 1,
ncclHost = 2,
ncclManaged = 3,
nccl_NUM_MTYPES = 4 } ncclMemoryType_t;
extern const char *test_memorytypes[nccl_NUM_MTYPES];
constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 1 : 0);
extern int test_opnum;
extern int test_typenum;
extern ncclDataType_t test_types[ncclNumTypes];
extern const char *test_typenames[ncclNumTypes];
extern ncclRedOp_t test_ops[];
extern const char *test_opnames[];
static int ncclstringtotype(char *str) {
for (int t=0; t<ncclNumTypes; t++) {
if (strcmp(str, test_typenames[t]) == 0) {
return t;
}
}
if (strcmp(str, "all") == 0) {
return -1;
}
printf("invalid type %s, defaulting to %s .. \n", str, test_typenames[ncclFloat]);
return ncclFloat;
}
static int ncclstringtoop (char *str) {
for (int o=0; o<test_opnum; o++) {
if (strcmp(str, test_opnames[o]) == 0) {
return o;
}
}
if (strcmp(str, "all") == 0) {
return -1;
}
printf("invalid op %s, defaulting to %s .. \n", str, test_opnames[ncclSum]);
return ncclSum;
}
static int ncclstringtomtype (char *str) {
for (int o=0; o<nccl_NUM_MTYPES; o++) {
if (strcmp(str, test_memorytypes[o]) == 0) {
return o;
}
}
printf("invalid memorytype %s, defaulting to %s .. \n", str, test_memorytypes[ncclCoarse]);
return ncclCoarse;
}
extern int is_main_proc;
extern thread_local int is_main_thread;
#define PRINT if (is_main_thread) printf
#endif