Commit 7dc4e964 authored by wanghan's avatar wanghan
Browse files

Initial commit: RCCL auto-tuning project

parents
#!/bin/bash
# Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
# #################################################
# global variables
# #################################################
# Root of the ROCm installation; a ROCM_PATH value that is already set wins.
: "${ROCM_PATH:=/opt/rocm}"

# Defaults for every setting the option parser below may override.

# -- what is built --
build_release=true
build_static=false
build_tests=false
build_package=false
build_allreduce_only=false
build_local_gpu_only=false
build_address_sanitizer=false
build_bfd=false
build_freorg_bkwdcomp=false
collective_trace=true
msccl_kernel_enabled=true
npkit_enabled=false

# -- how it is built --
clean_build=true
num_parallel_jobs=16
build_verbose=0
time_trace=false
enable_ninja=""

# -- what happens after the build --
install_dependencies=false
install_library=false
run_tests=false
run_tests_all=false
# #################################################
# helper functions
# #################################################
# Print the list of supported command-line options to stdout.
# Globals: num_parallel_jobs (read) - shown as the current default of -j|--jobs.
function display_help()
{
  printf '%s\n' \
    "RCCL build & installation helper script" \
    " Options:" \
    " --address-sanitizer Build with address sanitizer enabled" \
    " --build_allreduce_only Build only AllReduce + sum + float kernel" \
    " -d|--dependencies Install RCCL dependencies" \
    " --debug Build debug library" \
    " --enable_backtrace Build with custom backtrace support" \
    " --disable-colltrace Build without collective trace" \
    " --disable-msccl-kernel Build without MSCCL kernels" \
    " -f|--fast Quick-build RCCL (local gpu arch only, no backtrace, and collective trace support)" \
    " -h|--help Prints this help message" \
    " -i|--install Install RCCL library (see --prefix argument below)" \
    " -j|--jobs Specify how many parallel compilation jobs to run ($num_parallel_jobs by default)" \
    " -l|--local_gpu_only Only compile for local GPU architecture" \
    " --no_clean Don't delete files if they already exist" \
    " --npkit-enable Compile with npkit enabled" \
    " -p|--package_build Build RCCL package" \
    " --prefix Specify custom directory to install RCCL to (default: /opt/rocm)" \
    " --rm-legacy-include-dir Remove legacy include dir Packaging added for file/folder reorg backward compatibility" \
    " --run_tests_all Run all rccl unit tests (must be built already)" \
    " -r|--run_tests_quick Run small subset of rccl unit tests (must be built already)" \
    " --static Build RCCL as a static library instead of shared library" \
    " -t|--tests_build Build rccl unit tests, but do not run" \
    " --time-trace Plot the build time of RCCL" \
    " --verbose Show compile commands"
}
# #################################################
# Parameter parsing
# #################################################
# The enhanced getopt implementation identifies itself by exiting with status 4
# when called with -T; it is needed for long options and for arguments that
# contain whitespace.
getopt -T
if [[ $? -ne 4 ]]; then
  echo "Need a new version of getopt"
  exit 1
fi

short_opts="dfhij:lprt"
long_opts="address-sanitizer,build_allreduce_only,dependencies,debug,enable_backtrace,disable-colltrace,disable-msccl-kernel,fast,help,install,jobs:,local_gpu_only,no_clean,npkit-enable,package_build,prefix:,rm-legacy-include-dir,run_tests_all,run_tests_quick,static,tests_build,time-trace,verbose"
if ! GETOPT_PARSE=$(getopt --name "${0}" --options "${short_opts}" --longoptions "${long_opts}" -- "$@"); then
  echo "getopt invocation failed; could not parse the command line";
  exit 1
fi

# Replace the positional parameters with getopt's normalised argument list and
# walk through it one option at a time.
eval set -- "${GETOPT_PARSE}"
while true; do
  case "${1}" in
    # Options carrying a value drop both words themselves.
    -j | --jobs)              num_parallel_jobs=${2}; shift 2; continue ;;
    --prefix)                 install_prefix=${2}; shift 2; continue ;;
    # End-of-options marker.
    --)                       shift; break ;;
    -h | --help)              display_help; exit 0 ;;
    # Plain switches: record the setting; the shared shift after esac drops the word.
    --address-sanitizer)      build_address_sanitizer=true ;;
    --build_allreduce_only)   build_allreduce_only=true ;;
    -d | --dependencies)      install_dependencies=true ;;
    --debug)                  build_release=false ;;
    --enable_backtrace)       build_bfd=true ;;
    --disable-colltrace)      collective_trace=false ;;
    --disable-msccl-kernel)   msccl_kernel_enabled=false ;;
    -f | --fast)              build_local_gpu_only=true; collective_trace=false; msccl_kernel_enabled=false ;;
    -i | --install)           install_library=true ;;
    -l | --local_gpu_only)    build_local_gpu_only=true ;;
    --no_clean)               clean_build=false ;;
    --npkit-enable)           npkit_enabled=true ;;
    -p | --package_build)     build_package=true ;;
    --rm-legacy-include-dir)  build_freorg_bkwdcomp=false ;;
    -r | --run_tests_quick)   run_tests=true ;;
    --run_tests_all)          run_tests=true; run_tests_all=true ;;
    --static)                 build_static=true ;;
    -t | --tests_build)       build_tests=true ;;
    --time-trace)             time_trace=true ;;
    --verbose)                build_verbose=1 ;;
    *)
      echo "Unexpected command line parameter received; aborting";
      exit 1
      ;;
  esac
  shift
done
# Directory holding the ROCm executables.
ROCM_BIN_PATH="${ROCM_PATH}/bin"

# The distribution is identified through the /etc/*-release files; without
# either of them the script cannot continue.
if [[ ! -e "/etc/os-release" && ! -e "/etc/centos-release" ]]; then
  echo "This script depends on the /etc/*-release files"
  exit 2
fi

if [[ -e "/etc/os-release" ]]; then
  source /etc/os-release
else
  # Fallback: derive the lower-cased distribution name (first word) and the
  # major version (text after "release ", up to the first dot).
  OS_ID=$(awk '{print tolower($1)}' /etc/centos-release)
  VERSION_ID=$(grep -oP '(?<=release )[^ ]*' /etc/centos-release | cut -d "." -f1)
fi
# Abort the whole script with the given status when that status is non-zero;
# do nothing otherwise.
# Arguments: $1 - exit status to inspect (callers pass "$?")
check_exit_code( )
{
  local status=$1
  (( status == 0 )) || exit "$status"
}
# Location where the selected build flavour places the unit-test binary.
case "${build_release}" in
  true) unit_test_path="./build/release/test/rccl-UnitTests" ;;
  *)    unit_test_path="./build/debug/test/rccl-UnitTests" ;;
esac

# A test run was requested, a test binary already exists and no test build was
# asked for: keep the existing build tree instead of wiping it.
if $run_tests && [[ -f "${unit_test_path}" && "${build_tests}" == false ]]; then
  clean_build=false
fi
# #################################################
# prep
# #################################################
# Sub-directory of ./build that belongs to the selected configuration.
if [[ "${build_release}" == true ]]; then
  build_subdir="release"
else
  build_subdir="debug"
fi

# Ensure a clean build environment: only the tree of the selected
# configuration is removed.
if [[ "${clean_build}" == true ]]; then
  rm -rf "build/${build_subdir}"
fi

# Create and go to the build directory. Stop if that fails: the following
# steps (cmake ../../., the build tool, ./test/rccl-UnitTests) all use paths
# relative to this directory and would otherwise run in the wrong place.
if ! mkdir -p "build/${build_subdir}" || ! cd "build/${build_subdir}"; then
  echo "Unable to create or enter build/${build_subdir}; aborting" >&2
  exit 1
fi
# Append one argument to the space-separated cmake option string.
add_cmake_option() {
  cmake_common_options="${cmake_common_options} ${1}"
}

# Release or Debug configuration
case "${build_release}" in
  true) add_cmake_option "-DCMAKE_BUILD_TYPE=Release" ;;
  *)    add_cmake_option "-DCMAKE_BUILD_TYPE=Debug" ;;
esac

# Address sanitizer
if [[ "${build_address_sanitizer}" == true ]]; then
  add_cmake_option "-DBUILD_ADDRESS_SANITIZER=ON"
fi

# AllReduce-only kernel set
if [[ "${build_allreduce_only}" == true ]]; then
  add_cmake_option "-DBUILD_ALLREDUCE_ONLY=ON"
fi

# Backtrace support
if [[ "${build_bfd}" == true ]]; then
  add_cmake_option "-DBUILD_BFD=ON"
fi

# Backward compatibility wrappers: always passed explicitly, as ON or OFF
case "${build_freorg_bkwdcomp}" in
  true) add_cmake_option "-DBUILD_FILE_REORG_BACKWARD_COMPATIBILITY=ON" ;;
  *)    add_cmake_option "-DBUILD_FILE_REORG_BACKWARD_COMPATIBILITY=OFF" ;;
esac

# Restrict the build to the local GPU architecture
if [[ "${build_local_gpu_only}" == true ]]; then
  add_cmake_option "-DBUILD_LOCAL_GPU_TARGET_ONLY=ON"
fi

# Static instead of shared library
if [[ "${build_static}" == true ]]; then
  add_cmake_option "-DBUILD_SHARED_LIBS=OFF"
fi

# Collective trace and MSCCL kernels are only mentioned when switched off
if [[ "${collective_trace}" == false ]]; then
  add_cmake_option "-DCOLLTRACE=OFF"
fi
if [[ "${msccl_kernel_enabled}" == false ]]; then
  add_cmake_option "-DENABLE_MSCCL_KERNEL=OFF"
fi

# Dependency installation
if $install_dependencies; then
  add_cmake_option "-DINSTALL_DEPENDENCIES=ON"
fi
# Print the name of the cmake binary to invoke.
# Globals:
#   OS_ID (read) - distribution id set by the /etc/centos-release fallback
#   ID    (read) - distribution id defined by a sourced /etc/os-release
# Outputs: "cmake3" on centos/rhel when such a command is available on PATH,
#          otherwise "cmake".
select_cmake_executable() {
  # /etc/os-release defines ID rather than OS_ID, so both names are consulted.
  local distro="${OS_ID:-${ID:-}}"
  case "${distro}" in
    centos | rhel)
      # Only switch to cmake3 when it really exists, so hosts that ship
      # CMake 3 under the plain name keep working.
      if command -v cmake3 >/dev/null 2>&1; then
        printf '%s\n' "cmake3"
        return 0
      fi
      ;;
  esac
  printf '%s\n' "cmake"
}
cmake_executable=$(select_cmake_executable)
# NPKit instrumentation: empty unless enabled, otherwise the complete list of
# NPKit defines joined into one space-separated string.
npkit_options=""
if $npkit_enabled; then
  npkit_flags=(
    -DENABLE_NPKIT
    -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU
    -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_ENTRY
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_EXIT
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_ENTRY
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_EXIT
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_ENTRY
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_EXIT
    -DENABLE_NPKIT_EVENT_COPY_SEND_ENTRY
    -DENABLE_NPKIT_EVENT_COPY_SEND_EXIT
    -DENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_ENTRY
    -DENABLE_NPKIT_EVENT_DIRECT_COPY_SEND_EXIT
    -DENABLE_NPKIT_EVENT_DIRECT_RECV_ENTRY
    -DENABLE_NPKIT_EVENT_DIRECT_RECV_EXIT
    -DENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_ENTRY
    -DENABLE_NPKIT_EVENT_DIRECT_RECV_COPY_SEND_EXIT
    -DENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY
    -DENABLE_NPKIT_EVENT_DIRECT_RECV_REDUCE_COPY_SEND_EXIT
    -DENABLE_NPKIT_EVENT_DIRECT_SEND_ENTRY
    -DENABLE_NPKIT_EVENT_DIRECT_SEND_EXIT
    -DENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_ENTRY
    -DENABLE_NPKIT_EVENT_DIRECT_SEND_FROM_OUTPUT_EXIT
    -DENABLE_NPKIT_EVENT_RECV_ENTRY
    -DENABLE_NPKIT_EVENT_RECV_EXIT
    -DENABLE_NPKIT_EVENT_RECV_COPY_SEND_ENTRY
    -DENABLE_NPKIT_EVENT_RECV_COPY_SEND_EXIT
    -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_ENTRY
    -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_EXIT
    -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_ENTRY
    -DENABLE_NPKIT_EVENT_RECV_REDUCE_COPY_SEND_EXIT
    -DENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_ENTRY
    -DENABLE_NPKIT_EVENT_RECV_REDUCE_SEND_EXIT
    -DENABLE_NPKIT_EVENT_SEND_ENTRY
    -DENABLE_NPKIT_EVENT_SEND_EXIT
    -DENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_ENTRY
    -DENABLE_NPKIT_EVENT_SEND_FROM_OUTPUT_EXIT
    -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_ENTRY
    -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_WAIT_PEER_EXIT
    -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_ENTRY
    -DENABLE_NPKIT_EVENT_PRIM_SIMPLE_REDUCE_OR_COPY_MULTI_EXIT
    -DENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_ENTRY
    -DENABLE_NPKIT_EVENT_PRIM_LL_WAIT_SEND_EXIT
    -DENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_ENTRY
    -DENABLE_NPKIT_EVENT_PRIM_LL_DATA_PROCESS_EXIT
    -DENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_ENTRY
    -DENABLE_NPKIT_EVENT_PRIM_LL128_WAIT_SEND_EXIT
    -DENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_ENTRY
    -DENABLE_NPKIT_EVENT_PRIM_LL128_DATA_PROCESS_EXIT
    -DENABLE_NPKIT_EVENT_NET_SEND_ENTRY
    -DENABLE_NPKIT_EVENT_NET_SEND_EXIT
    -DENABLE_NPKIT_EVENT_NET_TEST_ENTRY
    -DENABLE_NPKIT_EVENT_NET_TEST_EXIT
    -DENABLE_NPKIT_EVENT_NET_RECV_ENTRY
    -DENABLE_NPKIT_EVENT_NET_RECV_EXIT
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_ENTRY
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_SEND_EXIT
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_ENTRY
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_RECV_REDUCE_SEND_EXIT
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_ENTRY
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_REDUCE_COPY_SEND_EXIT
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_ENTRY
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_COPY_SEND_EXIT
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_ENTRY
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_RING_DIRECT_RECV_EXIT
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_ENTRY
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_REDUCE_EXIT
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_ENTRY
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_UPDOWN_BROADCAST_EXIT
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_ENTRY
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_BROADCAST_EXIT
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_ENTRY
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_REDUCE_EXIT
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_ENTRY
    -DENABLE_NPKIT_EVENT_ALL_REDUCE_TREE_SPLIT_BROADCAST_EXIT
    -DENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_ENTRY
    -DENABLE_NPKIT_EVENT_SEND_RECV_LOCAL_COPY_EXIT
    -DENABLE_NPKIT_EVENT_SEND_RECV_SEND_ENTRY
    -DENABLE_NPKIT_EVENT_SEND_RECV_SEND_EXIT
    -DENABLE_NPKIT_EVENT_SEND_RECV_RECV_ENTRY
    -DENABLE_NPKIT_EVENT_SEND_RECV_RECV_EXIT
    -DENABLE_NPKIT_EVENT_ALL_GATHER_RING_ENTRY
    -DENABLE_NPKIT_EVENT_ALL_GATHER_RING_EXIT
    -DENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_ENTRY
    -DENABLE_NPKIT_EVENT_ALL_GATHER_RING_SEND_EXIT
    -DENABLE_NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_ENTRY
    -DENABLE_NPKIT_EVENT_ALL_GATHER_RING_RECV_COPY_SEND_EXIT
    -DENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_ENTRY
    -DENABLE_NPKIT_EVENT_ALL_GATHER_RING_DIRECT_RECV_EXIT
    -DENABLE_NPKIT_EVENT_MSCCL_GENERIC_OP_ENTRY
    -DENABLE_NPKIT_EVENT_MSCCL_GENERIC_OP_EXIT
    -DENABLE_NPKIT_EVENT_MSCCL_REDUCE_ENTRY
    -DENABLE_NPKIT_EVENT_MSCCL_REDUCE_EXIT
    -DENABLE_NPKIT_EVENT_MSCCL_SEND_ENTRY
    -DENABLE_NPKIT_EVENT_MSCCL_SEND_EXIT
    -DENABLE_NPKIT_EVENT_MSCCL_RECV_ENTRY
    -DENABLE_NPKIT_EVENT_MSCCL_RECV_EXIT
    -DENABLE_NPKIT_EVENT_MSCCL_RUN_ENTRY
    -DENABLE_NPKIT_EVENT_MSCCL_RUN_EXIT
    -DENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_ENTRY
    -DENABLE_NPKIT_EVENT_MSCCL_RECV_REDUCE_COPY_EXIT
    -DENABLE_NPKIT_EVENT_MSCCL_INIT_ENTRY
    -DENABLE_NPKIT_EVENT_MSCCL_INIT_EXIT
    -DENABLE_NPKIT_PRIM_COLLECT_DATA_PROCESS_TIME
  )
  # "${array[*]}" joins the elements with a single space (first IFS character).
  npkit_options="${npkit_flags[*]}"
fi
check_exit_code "$?"
# The build is driven by make unless time tracing was requested, in which case
# ninja is used and cmake is told to generate for it.
build_system="make"
if $time_trace; then
  build_system="ninja"
  enable_ninja="-GNinja"
fi
# Print the directory passed to CMAKE_INSTALL_PREFIX: the --prefix argument
# when one was given, otherwise ROCM_PATH.
# Globals: install_prefix (read), ROCM_PATH (read)
resolve_install_prefix() {
  printf '%s\n' "${install_prefix:-${ROCM_PATH}}"
}

# Unit tests are compiled when explicitly requested, or when a test run was
# requested but this build directory holds no test binary yet.
build_tests_flag="-DBUILD_TESTS=OFF"
if $build_tests || { $run_tests && [[ ! -f ./test/rccl-UnitTests ]]; }; then
  build_tests_flag="-DBUILD_TESTS=ON"
fi

# cmake_common_options and enable_ninja stay unquoted on purpose: each holds
# zero or more space-separated arguments that must be split into words.
CXX="${ROCM_BIN_PATH}/hipcc" ${cmake_executable} ${cmake_common_options} \
  "${build_tests_flag}" \
  -DNPKIT_FLAGS="${npkit_options}" \
  -DCMAKE_INSTALL_PREFIX="$(resolve_install_prefix)" \
  -DROCM_PATH="${ROCM_PATH}" \
  ${enable_ninja} ../../.
check_exit_code "$?"
# Run the build tool; when installation was requested the "install" target is
# appended, otherwise the default target is built.
build_target=""
if $install_library; then
  build_target="install"
fi
VERBOSE=${build_verbose} $build_system -j $num_parallel_jobs $build_target
check_exit_code "$?"
# Build the RCCL package with the same tool that drove the build. With
# --time-trace the tree is generated for Ninja (-GNinja), so a hard-coded
# "make package" would not match the generated build files.
if $build_package; then
  ${build_system:-make} package
  check_exit_code "$?"
fi
# Run the unit tests when requested: the full suite for --run_tests_all,
# otherwise only the AllReduce tests. A missing test binary is an error.
if $run_tests; then
  if ! test -f "./test/rccl-UnitTests"; then
    echo "rccl unit tests have not been built yet; please re-run script with -t to build rccl unit tests."
    exit 1
  fi
  if $run_tests_all; then
    ./test/rccl-UnitTests
  else
    ./test/rccl-UnitTests --gtest_filter="AllReduce.*"
  fi
fi
# After a time-trace build, locate the first directory named "time-trace"
# below the source root and run its rccl-TimeTrace.sh from inside it.
if $time_trace; then
  search_dir="../../"
  time_trace_dir=$(find "$search_dir" -type d -name "time-trace" -print -quit)
  if [[ -z "$time_trace_dir" ]]; then
    echo "Error: time-trace folder not found in $search_dir."
  else
    time_trace_script="$time_trace_dir/rccl-TimeTrace.sh"
    if [[ ! -x "$time_trace_script" ]]; then
      echo "Error: Unable to execute $time_trace_script. Make sure the file has the correct permissions."
    else
      echo "Generating RCCL-compile-timeline.html..."
      (cd "$time_trace_dir" && ./rccl-TimeTrace.sh)
    fi
  fi
fi
#
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
# Build settings. Each "?=" default applies only when the variable is not
# already defined (for example in the environment or on the make command line).
CUDA_HOME ?= /usr/local/cuda
PREFIX ?= /usr/local
VERBOSE ?= 0
KEEP ?= 0
DEBUG ?= 0
TRACE ?= 0
PROFAPI ?= 1
NVTX ?= 1
RDMA_CORE ?= 0
# CUDA compiler, library and include locations derived from CUDA_HOME.
NVCC = $(CUDA_HOME)/bin/nvcc
CUDA_LIB ?= $(CUDA_HOME)/lib64
CUDA_INC ?= $(CUDA_HOME)/include
# CUDA version taken from the "release" line of "nvcc --version": the text
# after "release " up to the first comma. Empty when "which" cannot find nvcc.
CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
#CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev)
# First and second dot-separated fields of CUDA_VERSION.
CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
#$(info CUDA_VERSION ${CUDA_MAJOR}.${CUDA_MINOR})
# You should define NVCC_GENCODE in your environment to the minimal set
# of archs to reduce compile time.
# CUDA<N>_GENCODE variables list code=sm_XX targets; CUDA<N>_PTX variables list
# the code=compute_XX target of the same generation.
CUDA8_GENCODE = -gencode=arch=compute_50,code=sm_50 \
-gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61
# The version tests below prefix the number with "0"; presumably this keeps
# the shell "test" well-formed when CUDA_MAJOR/CUDA_MINOR are empty.
ifeq ($(shell test "0$(CUDA_MAJOR)" -lt 12; echo $$?),0)
# SM35 is deprecated from CUDA12.0 onwards
CUDA8_GENCODE += -gencode=arch=compute_35,code=sm_35
endif
CUDA9_GENCODE = -gencode=arch=compute_70,code=sm_70
CUDA11_GENCODE = -gencode=arch=compute_80,code=sm_80
CUDA12_GENCODE = -gencode=arch=compute_90,code=sm_90
CUDA8_PTX = -gencode=arch=compute_61,code=compute_61
CUDA9_PTX = -gencode=arch=compute_70,code=compute_70
CUDA11_PTX = -gencode=arch=compute_80,code=compute_80
CUDA12_PTX = -gencode=arch=compute_90,code=compute_90
# Default NVCC_GENCODE (only used when not already defined): all sm targets
# supported by the detected CUDA version plus the PTX of the newest one.
ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 11 -a "0$(CUDA_MINOR)" -ge 8 -o "0$(CUDA_MAJOR)" -gt 11; echo $$?),0)
# Include Hopper support if we're using CUDA11.8 or above
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA12_GENCODE) $(CUDA12_PTX)
else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA11_GENCODE) $(CUDA11_PTX)
# Include Volta support if we're using CUDA9 or above
else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 9; echo $$?),0)
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX)
else
NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX)
endif
# Print the selected value while the makefile is being read.
$(info NVCC_GENCODE is ${NVCC_GENCODE})
# Host compiler flags: CUDA version defines, -fPIC, hidden symbol visibility,
# warning selection, C++11 and the CUDA include directory, followed by whatever
# CXXFLAGS already contained.
CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden \
-Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla \
-I $(CUDA_INC) \
$(CXXFLAGS)
# Maxrregcount needs to be set accordingly to NCCL_MAX_NTHREADS (otherwise it will cause kernel launch errors)
# 512 : 120, 640 : 96, 768 : 80, 1024 : 60
# We would not have to set this if we used __launch_bounds__, but this only works on kernels, not on functions.
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 --expt-extended-lambda -Xptxas -maxrregcount=96 -Xfatbin -compress-all
# Use addprefix so that we can specify more than one path
NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt
########## GCOV ##########
GCOV ?= 0 # disable by default.
# --coverage is added only when neither GCOV nor DEBUG is the word 0; the
# nvcc variables receive it wrapped as "-Xcompiler --coverage".
GCOV_FLAGS := $(if $(filter 0,${GCOV} ${DEBUG}),,--coverage) # only gcov=1 and debug =1
CXXFLAGS += ${GCOV_FLAGS}
NVCUFLAGS += ${GCOV_FLAGS:%=-Xcompiler %}
LDFLAGS += ${GCOV_FLAGS}
NVLDFLAGS += ${GCOV_FLAGS:%=-Xcompiler %}
# $(warning GCOV_FLAGS=${GCOV_FLAGS})
########## GCOV ##########
# Optimisation level: -O3 when DEBUG is 0, otherwise -O0 with debug information.
ifeq ($(DEBUG), 0)
NVCUFLAGS += -O3
CXXFLAGS += -O3 -g
else
NVCUFLAGS += -O0 -G -g
CXXFLAGS += -O0 -g -ggdb3
endif
# VERBOSE != 0 adds extra warnings and ptxas output; VERBOSE == 0 declares
# .SILENT instead.
ifneq ($(VERBOSE), 0)
NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
CXXFLAGS += -Wall -Wextra
else
.SILENT:
endif
# Feature switches translated into preprocessor defines / nvcc options.
ifneq ($(TRACE), 0)
CXXFLAGS += -DENABLE_TRACE
endif
ifeq ($(NVTX), 0)
CXXFLAGS += -DNVTX_DISABLE
endif
ifneq ($(KEEP), 0)
NVCUFLAGS += -keep
endif
ifneq ($(PROFAPI), 0)
CXXFLAGS += -DPROFAPI
endif
ifneq ($(RDMA_CORE), 0)
CXXFLAGS += -DNCCL_BUILD_RDMA_CORE=1
endif
#
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
# Prerequisite: $(FILESTOFORMAT) contains the list of files of interest for formatting
# As this file defines a new target (format), it should be included at least after the definition of the
# default target.
# Options passed to astyle by the "format" target.
ASTYLE_FORMAT_OPTS=-Qv --style=java --indent-after-parens --indent-modifiers --indent-switches --indent-continuation=2 --keep-one-line-blocks --keep-one-line-statements --indent=spaces=2 --lineend=linux --suffix=none
# Download, unpack and build locations of astyle below $(BUILDDIR)/contrib.
ASTYLEDIR := $(BUILDDIR)/contrib
ASTYLETAR := $(ASTYLEDIR)/astyle.tar.gz
ASTYLEBIN := $(ASTYLEDIR)/astyle/build/gcc/bin/astyle
ASTYLEBLD := $(ASTYLEDIR)/astyle/build/gcc/
ASTYLEVER := 3.1
ASTYLEURL := "https://versaweb.dl.sourceforge.net/project/astyle/astyle/astyle%20$(ASTYLEVER)/astyle_$(ASTYLEVER)_linux.tar.gz"
# Chain of rules: create the directory, download the tarball with wget,
# extract it, then build the astyle binary with a recursive make.
$(ASTYLEDIR) :
@mkdir -p $(ASTYLEDIR)
$(ASTYLETAR) : $(ASTYLEDIR)
@wget -q -O $(ASTYLETAR) $(ASTYLEURL)
$(ASTYLEBLD) : $(ASTYLETAR)
@cd $(ASTYLEDIR) && tar xzf $(ASTYLETAR)
$(ASTYLEBIN) : $(ASTYLEBLD)
${MAKE} -C $(ASTYLEBLD)
# format: run astyle with ASTYLE_FORMAT_OPTS on the files in $(FILESTOFORMAT).
.PHONY : format
format : $(ASTYLEBIN)
@$(ASTYLEBIN) $(ASTYLE_FORMAT_OPTS) $(FILESTOFORMAT)
##### version
# NCCL version components and package revision; the packaging makefiles
# substitute these into their *.in templates.
NCCL_MAJOR := 2
NCCL_MINOR := 18
NCCL_PATCH := 3
NCCL_SUFFIX :=
PKG_REVISION := 1
#
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
# Top-level packaging makefile: forwards prep/build/clean to the per-format
# sub-directories listed in TARGETS.
.PHONY : all clean
default : build
build : debian.build txz.build
# BUILDDIR is made absolute because it is handed to sub-makes that run in
# other directories (-C).
BUILDDIR ?= $(abspath ../build)
ABSBUILDDIR := $(abspath $(BUILDDIR))
TARGETS := debian txz
# Each aggregate target depends on <format>.<action> for every format.
all: ${TARGETS:%=%.build}
prep: ${TARGETS:%=%.prep}
build: ${TARGETS:%=%.build}
clean: ${TARGETS:%=%.clean}
# Pattern rules: the stem ($*) is the sub-directory the recursive make runs in.
%.prep:
${MAKE} -C $* prep BUILDDIR=${ABSBUILDDIR}
%.build:
${MAKE} -C $* build BUILDDIR=${ABSBUILDDIR}
%.clean:
${MAKE} -C $* clean
/*.debhelper.log
/*.debhelper
/*.substvars
/tmp/
/files
/libnccl1/
/libnccl-dev/
#
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
include ../../makefiles/common.mk
include ../../makefiles/version.mk
# Staging directory for the debian/ control files and output directory for
# the resulting .deb packages.
BUILDDIR ?= $(abspath ../../build)
DEBPREPDIR := $(BUILDDIR)/debian
PKGDIR := $(BUILDDIR)/pkg/deb/
# DEBGEN: names of the *.in templates with the .in suffix removed.
# DEBFILES: files copied verbatim plus the generated ones.
DEBGEN_IN := $(wildcard *.in)
DEBGEN := $(DEBGEN_IN:.in=)
DEBFILES := compat copyright libnccl-dev.install rules $(DEBGEN)
DEBTARGETS := $(patsubst %, $(DEBPREPDIR)/%, $(DEBFILES))
# Values substituted into the templates.
PKG_TIMESTAMP := $(shell date -R)
PKG_ARCH ?= $(shell dpkg-architecture -qDEB_HOST_ARCH)
PKG_MULTIARCH ?= $(shell dpkg-architecture -qDEB_HOST_MULTIARCH)
# prep: stage all debian files, then run the top-level "lic" target.
prep : $(DEBTARGETS)
$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
# build: build the sources, run debuild inside BUILDDIR and move the
# libnccl*.deb files it leaves in BUILDDIR's parent directory into PKGDIR.
build : prep
$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
@printf "Building Debian package\n"
(cd $(BUILDDIR); debuild -eLD_LIBRARY_PATH -uc -us -d -b)
mkdir -p $(PKGDIR)
mv $(BUILDDIR)/../libnccl*.deb $(PKGDIR)/
clean:
rm -Rf $(DEBPREPDIR) $(PKGDIR)
# Generate a staged file from its .in template by replacing the nccl:, cuda:
# and pkg: placeholder tokens with the values computed above.
$(DEBPREPDIR)/% : %.in
@printf "Generating %-35s > %s\n" $< $@
mkdir -p $(DEBPREPDIR)
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
-e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
-e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
-e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
-e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
-e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \
-e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
-e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \
$< > $@
# Files without a template are copied into the staging directory unchanged.
$(DEBPREPDIR)/% : %
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(DEBPREPDIR)
cp -f $< $@
nccl (${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}) trusty; urgency=medium
* Automatic Debian package from build
-- cudatools <cudatools@nvidia.com> ${pkg:Timestamp}
Source: nccl
Section: libs
Maintainer: cudatools <cudatools@nvidia.com>
Priority: optional
Build-depends: debhelper(>=9)
Standards-Version: 3.9.5
Package: libnccl${nccl:Major}
Section: libs
Architecture: ${pkg:Arch}
Depends: ${misc:Depends}, ${shlibs:Depends}
Description: NVIDIA Collective Communication Library (NCCL) Runtime
NCCL (pronounced "Nickel") is a stand-alone library of standard collective
communication routines for GPUs, implementing all-reduce, all-gather, reduce,
broadcast, and reduce-scatter.
It has been optimized to achieve high bandwidth on any platform using PCIe,
NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
sockets.
Package: libnccl-dev
Section: libdevel
Architecture: ${pkg:Arch}
Depends: ${misc:Depends}, ${shlibs:Depends}, libnccl${nccl:Major} (= ${binary:Version})
Description: NVIDIA Collective Communication Library (NCCL) Development Files
NCCL (pronounced "Nickel") is a stand-alone library of standard collective
communication routines for GPUs, implementing all-reduce, all-gather, reduce,
broadcast, and reduce-scatter.
It has been optimized to achieve high bandwidth on any platform using PCIe,
NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
sockets.
[DEFAULT]
debian-branch = master
upstream-branch = master
ignore-new = True
[git-buildpackage]
no-purge = True
include/nccl.h /usr/include
include/nccl_net.h /usr/include
lib/libnccl.so /usr/lib/${pkg:MultiArch}
lib/libnccl_static.a /usr/lib/${pkg:MultiArch}
lib/libnccl.so.${nccl:Major} /usr/lib/${pkg:MultiArch}
lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} /usr/lib/${pkg:MultiArch}
#!/usr/bin/make -f
# debhelper rules file: every target is forwarded to dh, except for the
# overrides defined below.
%:
dh $@ --parallel
# Installation runs dh_auto_install with PREFIX set to debian/tmp.
override_dh_auto_install:
PREFIX=debian/tmp dh_auto_install
# The automatic test and clean steps are replaced by recipes that hold only a
# comment.
override_dh_auto_test:
# Do not make test
override_dh_auto_clean:
# Do not make clean
#
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
include ../../makefiles/common.mk
include ../../makefiles/version.mk
# Staging directory for the generated spec file and output directory for the
# resulting RPM packages.
BUILDDIR ?= $(abspath ../../build)
RPMPREPDIR := $(BUILDDIR)/redhat
PKGDIR := $(BUILDDIR)/pkg/rpm/
# RPMGEN: names of the *.in templates with the .in suffix removed.
RPMGEN_IN := $(wildcard *.in)
RPMGEN := $(RPMGEN_IN:.in=)
RPMFILES := $(RPMGEN)
RPMTARGETS := $(patsubst %, $(RPMPREPDIR)/%, $(RPMFILES))
# Values substituted into the templates.
PKG_TIMESTAMP := $(shell date -R)
ARCH := $(shell uname -m)
PKG_ARCH ?= $(shell uname -m)
PKG_MULTIARCH ?= $(shell $(CXX) -print-multiarch)
ifeq ($(PKG_MULTIARCH),)
# Hardwire the PKG_MULTIARCH directory as the RHEL6 distribution agnostic compiler (gcc 4.8.3) doesn't set it
PKG_MULTIARCH := $(ARCH)-linux-gnu
endif
# prep: generate the staged files, then run the top-level "lic" target.
prep : $(RPMTARGETS)
$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
# build: build the sources and the txz package, then run rpmbuild with the
# txz output directory as _sourcedir and PKGDIR as the RPM output directory.
build : prep
$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
$(MAKE) -C ../txz build BUILDDIR=$(BUILDDIR)
@printf "Building Redhat package\n"
mkdir -p $(PKGDIR)
rpmbuild --define "_sourcedir $(BUILDDIR)/pkg/txz" \
--define "_rpmdir $(PKGDIR)" \
--define "_builddir $(PKGDIR)/build/" \
--define "_buildrootdir $(PKGDIR)/buildroot/" \
-bb $(BUILDDIR)/redhat/nccl.spec
clean:
rm -Rf $(RPMPREPDIR) $(PKGDIR)
# Generate a staged file from its .in template by replacing the nccl:, cuda:
# and pkg: placeholder tokens with the values computed above.
$(RPMPREPDIR)/% : %.in
@printf "Generating %-35s > %s\n" $< $@
mkdir -p $(RPMPREPDIR)
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
-e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
-e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
-e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
-e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
-e "s/\$${pkg:Timestamp}/$(PKG_TIMESTAMP)/g" \
-e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
-e "s/\$${pkg:MultiArch}/$(PKG_MULTIARCH)/g" \
$< > $@
# Files without a template are copied into the staging directory unchanged.
$(RPMPREPDIR)/% : %
@printf "Grabbing %-35s > %s\n" $< $@
mkdir -p $(RPMPREPDIR)
cp -f $< $@
Name: libnccl
Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}
Release: ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}
Summary: NVIDIA Collective Communication Library (NCCL) Runtime
Group: Development/Libraries
License: BSD
URL: http://developer.nvidia.com/nccl
Source0: nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch}.txz
Requires(pre,preun): /sbin/ldconfig
%description
NCCL (pronounced "Nickel") is a stand-alone library of standard collective
communication routines for GPUs, implementing all-reduce, all-gather, reduce,
broadcast, and reduce-scatter.
It has been optimized to achieve high bandwidth on any platform using PCIe,
NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP
sockets.
%package devel
Summary: NVIDIA Collective Communication Library (NCCL) Runtime
Group: Development/Libraries
%description devel
NCCL development files
%package static
Summary: NVIDIA Collective Communication Library (NCCL) Runtime
Group: Development/Libraries
%description static
NCCL static library
%define debug_package %{nil}
%prep
%setup -n nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch} -q
%build
%install
rm -rf $RPM_BUILD_ROOT
install -m 755 -d $RPM_BUILD_ROOT
install -m 755 -d $RPM_BUILD_ROOT/%{_libdir}
install -m 755 lib/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}
ln -s libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so.${nccl:Major}
# devel
install -m 755 -d $RPM_BUILD_ROOT/%{_includedir}
install -m 644 include/nccl.h $RPM_BUILD_ROOT/%{_includedir}
install -m 644 include/nccl_net.h $RPM_BUILD_ROOT/%{_includedir}
ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so
# static
install -m 644 lib/libnccl_static.a $RPM_BUILD_ROOT/%{_libdir}
%post -p /sbin/ldconfig
%postun -p /sbin/ldconfig
%post devel -p /sbin/ldconfig
%postun devel -p /sbin/ldconfig
%clean
rm -rf $RPM_BUILD_ROOT
%files devel
%doc LICENSE.txt
%defattr(-,root,root,-)
%{_includedir}/nccl.h
%{_includedir}/nccl_net.h
%{_libdir}/libnccl.so
%files static
%doc LICENSE.txt
%defattr(-,root,root,-)
%{_libdir}/libnccl_static.a
%files
%doc LICENSE.txt
%defattr(-,root,root,-)
%{_libdir}/libnccl.so.${nccl:Major}
%{_libdir}/libnccl.so.${nccl:Major}.${nccl:Minor}.${nccl:Patch}
%changelog
#
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
include ../../makefiles/common.mk
include ../../makefiles/version.mk
# Staging directory for the generated script and output directory for the
# source tarball.
BUILDDIR ?= $(abspath ../../build)
TXZPREPDIR := $(BUILDDIR)/srctxz
PKGDIR := $(BUILDDIR)/pkg/srctxz/
# TXZGEN: names of the *.in templates with the .in suffix removed.
TXZGEN_IN := $(wildcard *.in)
TXZGEN := $(TXZGEN_IN:.in=)
TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN))
# "?=" only takes effect when PKG_REVISION has not been defined before this
# line (for instance by one of the included files).
PKG_REVISION ?= 3
PKG_ARCH := $(shell uname -m)
prep: $(TXZTARGETS)
# build: clean the source tree, run the generated create_srctxz.sh from
# BUILDDIR and move the nccl-src*.txz archive it produces into PKGDIR.
build: prep
$(MAKE) -C ../../src clean
@printf "Building source tar.xz package\n"
(cd $(BUILDDIR); bash srctxz/create_srctxz.sh)
mkdir -p $(PKGDIR)
mv $(BUILDDIR)/../../nccl-src*.txz $(PKGDIR)
clean:
rm -Rf $(TXZPREPDIR) $(PKGDIR)
# Generate a staged file from its .in template by replacing the nccl: and
# pkg:Revision placeholder tokens.
$(TXZPREPDIR)/% : %.in
@printf "Generating %-35s > %s\n" $< $@
mkdir -p $(TXZPREPDIR)
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
-e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
-e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
$< > $@
#!/bin/bash
#
# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
# Template of the script that packs the source tree into a
# nccl-src_<version>-<revision>.txz archive placed next to the tree.
# The nccl:/pkg: placeholder tokens below are filled in when the final script
# is generated from this template.
#
# To run from $BUILDDIR/

# Move from the build directory to the top of the source tree. Abort if that
# fails: "git clean" and "tar" below must not run anywhere else.
cd .. || exit 1
NCCLDIR=$(basename "$PWD")

echo "Checking for unclean directory ..."
git clean -x -i
echo "Clean done."

echo "Checking for uncommited files ..."
if [ "$(git status -s | wc -l)" != "0" ]; then
  git status -s
  echo "Some changes are not committed yet. Continue ? (Ctrl-C to abort)"
  # The answer is discarded; any input line continues.
  read -r
fi

# The archive is created from the parent directory so that the tree's own
# directory name is the leading path component tar sees.
cd .. || exit 1
NCCL_MAJOR=${nccl:Major}
NCCL_MINOR=${nccl:Minor}
NCCL_PATCH=${nccl:Patch}
NCCL_SUFFIX=${nccl:Suffix}
NCCL_BUILD=${pkg:Revision}

NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}"

# --transform renames the leading directory component to the package name.
tar --exclude build \
    --exclude ".git*" \
    --exclude pkg/srctxz \
    --transform "s/^$NCCLDIR/$NCCLNAME/" -Jcf "$NCCLNAME.txz" --owner=0 --group=0 "$NCCLDIR"
#
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
include ../../makefiles/common.mk
include ../../makefiles/version.mk
# Staging directory for the generated script and output directory for the
# binary tarball.
BUILDDIR ?= $(abspath ../../build)
TXZPREPDIR := $(BUILDDIR)/txz
PKGDIR := $(BUILDDIR)/pkg/txz/
# TXZGEN: names of the *.in templates with the .in suffix removed.
TXZGEN_IN := $(wildcard *.in)
TXZGEN := $(TXZGEN_IN:.in=)
TXZTARGETS := $(patsubst %, $(TXZPREPDIR)/%, $(TXZGEN))
PKG_ARCH := $(shell uname -m)
# prep: generate the staged files, then run the top-level "lic" target.
prep: $(TXZTARGETS)
$(MAKE) -C ../.. lic BUILDDIR=$(BUILDDIR)
# build: build the sources, run the generated create_txz.sh from BUILDDIR and
# move the nccl*.txz archive found in BUILDDIR's parent directory into PKGDIR.
build: prep
$(MAKE) -C ../.. src.build BUILDDIR=$(BUILDDIR)
@printf "Building tar.xz package\n"
(cd $(BUILDDIR); bash txz/create_txz.sh)
mkdir -p $(PKGDIR)
mv $(BUILDDIR)/../nccl*.txz $(PKGDIR)
clean:
rm -Rf $(TXZPREPDIR) $(PKGDIR)
# Generate a staged file from its .in template by replacing the nccl:, cuda:
# and pkg: placeholder tokens.
$(TXZPREPDIR)/% : %.in
@printf "Generating %-35s > %s\n" $< $@
mkdir -p $(TXZPREPDIR)
sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \
-e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \
-e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \
-e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \
-e "s/\$${cuda:Major}/$(CUDA_MAJOR)/g" \
-e "s/\$${cuda:Minor}/$(CUDA_MINOR)/g" \
-e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \
-e "s/\$${pkg:Arch}/$(PKG_ARCH)/g" \
$< > $@
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment