Commit c0dad530 authored by wangkaixiong's avatar wangkaixiong 🚴🏼
Browse files

init

parents
#!/bin/bash
# Two-node RCCL all_reduce benchmark: 8 ranks on master + 8 on node1 via
# Open MPI, UCX pinned to mlx5_2:1, NCCL trace logging, GPU-direct RDMA
# reads enabled, topology loaded from a hand-edited XML file.
# Per-rank output is written to files prefixed "t1".
mpirun -H master:8,node1:8 --prefix /opt/mpi -np 16 --allow-run-as-root \
--output-filename t1 \
-x NCCL_DEBUG=TRACE \
-x UCX_NET_DEVICES=mlx5_2:1 \
-x NCCL_TOPO_FILE=/data1/sunzhq/rccl-tests-develop/topo-0507-115-real.xml \
-x NCCL_NET_GDR_READ=1 \
./build2/all_reduce_perf -b 7618 -e 1G -f 2 -g 1
#!/bin/bash
# Two-node RCCL alltoall benchmark over Open MPI: SSH on port 2222, TCP
# bootstrap over interface p14p2, one IB HCA listed per GPU.
export NCCL_TOPO_DUMP_FILE=./topo-dump.xml
#-x NCCL_TOPO_FILE=./topo.xml \
mpirun -H master:8,node1:8 --prefix /opt/mpi -np 16 --allow-run-as-root \
--mca plm_rsh_args "-p 2222" \
--mca btl_tcp_if_include p14p2 \
-x ROCM_PATH -x LD_LIBRARY_PATH \
-x NCCL_TOPO_FILE=./topo-0507-115-update.xml \
-x NCCL_DEBUG=WARN \
-x NCCL_SOCKET_IFNAME=p14p2 \
-x HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
-x NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_1:1,mlx5_10:1,mlx5_7:1,mlx5_8:1,mlx5_9:1 \
./build/alltoall_perf -b 7618 -e 1G -f 2 -g 1 -d half
# Leftover experiment knobs, kept for reference:
#-x NCCL_TOPO_FILE=/data1/sunzhq/rccl-tests-develop/topo-0507-115-real.xml \
#-x NCCL_IB_QP_PER_CONNECTION=4 \
#-x NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_1:1,mlx5_10:1,mlx5_7:1,mlx5_8:1,mlx5_9:1 \
# -x NCCL_IB_HCA=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_7,mlx5_8,mlx5_9,mlx5_10 \
# --mca plm rsh \
# --mca plm_rsh_agent ssh \
# -x NCCL_GRAPH_FILE=./topo.xml \
# -x NCCL_ALGO=tree,ring \
# --mca plm_rsh_args "-2 -o StrictHostKeyChecking=no" \ # launch over SSH
# -x NCCL_DEBUG=INFO # emit debug logs to localize problems
# -x NCCL_IB_GID_INDEX=3 # if the IB fabric has multiple subnets, pick the GID index
# Launch the per-rank environment wrapper (runrccl.sh) on both nodes.
mpirun --prefix /opt/mpi -np 16 -H master:8,node1:8 --allow-run-as-root --mca plm_rsh_args "-p 2222" --mca btl_tcp_if_include p14p2 -x ROCM_PATH -x LD_LIBRARY_PATH ./runrccl.sh
#!/bin/bash
# runrccl.sh — per-rank environment setup + benchmark run, invoked on every
# rank by the mpirun wrapper above. Exports RCCL/NCCL tuning knobs, then runs
# all_reduce_perf with one process driving all 8 local GPUs (-g 8).
#export NCCL_GRAPH_DUMP_FILE=./graph-dump.xml
#export NCCL_TOPO_DUMP_FILE=./topo-0515-dump.xml
#export NCCL_TOPO_DUMP_FILE=./topo-wkx-exp.xml
#export NCCL_MAX_NCHANNELS=24
export NCCL_MIN_NCHANNELS=32
export NCCL_MIN_P2P_NCHANNELS=32
#export NCCL_P2P_LEVEL=7
export NCCL_ALGO=Ring
#export NCCL_MIN_P2P_NCHANNELS=24
#export NCCL_NCHANNELS_PER_PEER=24
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_SOCKET_IFNAME=p14p2
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#export NCCL_TOPO_FILE=./topo-0507-115-update.xml
export NCCL_P2P_LEVEL=SYS
export NCCL_TOPO_FILE=./topo-BW-0520.xml
export RCCL_SDMA_COUNT_ENABLE=1
export RCCL_SDMA_COPY_ENABLE=0
# BUGFIX: removed a stray trailing "\" here — it spliced the next line into
# this export, silently swallowing whatever followed it.
export RCCL_COLL_XHCL_CHANNEL_NUM=28
#export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_7:1,mlx5_8:1,mlx5_9:1,mlx5_10:1
#export NCCL_TOPO_FILE=null
#export NCCL_GRAPH_FILE=./graph_debug.xml
#export NCCL_DEBUG=TRACE
#export NCCL_SHM_DISABLE=1
#export NCCL_P2P_DISABLE=1
#export NCCL_GRAPH_FILE=./graph_debug.xml
#export NCCL_IB_HCA=mlx5_7:1,mlx5_9:1,mlx5_10:1 # mlx5_10 carries no traffic
#export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_7:1,mlx5_8:1,mlx5_9:1,mlx5_10:1 # mlx5_10 carries no traffic
#./build/alltoall_perf -b 2 -e 2G -f 2 -g 1
./build/all_reduce_perf -b 256M -e 256M -f 2 -g 8
#./build/all_reduce_perf -b 2 -e 2G -f 2 -g 1
#mpirun -np 16 -H master:1,node1:8 --allow-run-as-root -x NCCL_TOPO_FILE=./topo-0507-115-real.xml --mca plm_rsh_args "-p 2222" -x NCCL_MAX_NCHANNELS=20 -x NCCL_MIN_NCHANNELS=20 -x NCCL_P2P_LEVEL=SYS -x NCCL_ALGO=Ring -x NCCL_MIN_P2P_NCHANNELS=20 -x NCCL_NCHANNELS_PER_PEER=20 -x HSA_FORCE_FINE_GRAIN_PCIE=1 -x HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./build/all_reduce_perf -b 2 -e 2G -f 2 -g 1
#mpirun -np 8 --allow-run-as-root -x NCCL_TOPO_FILE=./topo-0507-115-real.xml -x NCCL_MAX_NCHANNELS=20 -x NCCL_MIN_NCHANNELS=20 -x NCCL_P2P_LEVEL=SYS -x NCCL_ALGO=Ring -x HSA_FORCE_FINE_GRAIN_PCIE=1 -x HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./build/all_reduce_perf -b 2 -e 2G -f 2 -g 1
#-x NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_7:1,mlx5_8:1,mlx5_9:1,mlx5_10:1 \
#-x NCCL_TOPO_FILE=/data1/sunzhq/rccl-tests-develop/topo-0507-115-real.xml \
#-x NCCL_IB_QP_PER_CONNECTION=4 \
#-x NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_1:1,mlx5_10:1,mlx5_7:1,mlx5_8:1,mlx5_9:1 \
# -x NCCL_IB_HCA=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_7,mlx5_8,mlx5_9,mlx5_10 \
# --mca plm rsh \
# --mca plm_rsh_agent ssh \
# -x NCCL_GRAPH_FILE=./topo.xml \
# -x NCCL_ALGO=tree,ring \
# --mca plm_rsh_args "-2 -o StrictHostKeyChecking=no" \ # launch over SSH
# -x NCCL_DEBUG=INFO # emit debug logs to localize problems
# -x NCCL_IB_GID_INDEX=3 # if the IB fabric has multiple subnets, pick the GID index
#!/bin/bash
# Single-GPU-per-process variant of runrccl.sh (-g 1): same knobs, reduced
# channel counts (24), algorithm selection left to RCCL.
#export NCCL_GRAPH_DUMP_FILE=./graph-dump.xml
#export NCCL_TOPO_DUMP_FILE=./topo-0515-dump.xml
#export NCCL_TOPO_DUMP_FILE=topo-tujie-test.xml #./topo-wkx-exp.xml
#export NCCL_MAX_NCHANNELS=24
export NCCL_MIN_NCHANNELS=24
export NCCL_MIN_P2P_NCHANNELS=24
#export NCCL_P2P_LEVEL=7
#export NCCL_ALGO=Ring
#export NCCL_MIN_P2P_NCHANNELS=24
#export NCCL_NCHANNELS_PER_PEER=24
export HSA_FORCE_FINE_GRAIN_PCIE=1
export NCCL_SOCKET_IFNAME=p14p2
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#export NCCL_TOPO_FILE=./topo-0507-115-update.xml
export NCCL_P2P_LEVEL=SYS
export NCCL_TOPO_FILE=./topo-BW-0520.xml
#export NCCL_TOPO_FILE=/data1/sunzhq/tujie/rccl-test/rccl/topo-0507_0520.xml
#export RCCL_SDMA_COUNT_ENABLE=1
#export RCCL_SDMA_COPY_ENABLE=0
#export RCCL_COLL_XHCL_CHANNEL_NUM=28
#export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_7:1,mlx5_8:1,mlx5_9:1,mlx5_10:1
#export NCCL_TOPO_FILE=null
#export NCCL_GRAPH_FILE=./graph_debug.xml
#export NCCL_DEBUG=TRACE
#export NCCL_SHM_DISABLE=1
#export NCCL_P2P_DISABLE=1
#export NCCL_GRAPH_FILE=./graph_debug.xml
#export NCCL_IB_HCA=mlx5_7:1,mlx5_9:1,mlx5_10:1 # mlx5_10 carries no traffic
#export NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_7:1,mlx5_8:1,mlx5_9:1,mlx5_10:1 # mlx5_10 carries no traffic
#./build/alltoall_perf -b 2 -e 2G -f 2 -g 1
./build/all_reduce_perf -b 256M -e 256M -f 2 -g 1
#./build/sendrecv_perf -b 2 -e 1g -f 2 -g 1
#./build/all_reduce_perf -b 2 -e 2G -f 2 -g 1
#mpirun -np 16 -H master:1,node1:8 --allow-run-as-root -x NCCL_TOPO_FILE=./topo-0507-115-real.xml --mca plm_rsh_args "-p 2222" -x NCCL_MAX_NCHANNELS=20 -x NCCL_MIN_NCHANNELS=20 -x NCCL_P2P_LEVEL=SYS -x NCCL_ALGO=Ring -x NCCL_MIN_P2P_NCHANNELS=20 -x NCCL_NCHANNELS_PER_PEER=20 -x HSA_FORCE_FINE_GRAIN_PCIE=1 -x HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./build/all_reduce_perf -b 2 -e 2G -f 2 -g 1
#mpirun -np 8 --allow-run-as-root -x NCCL_TOPO_FILE=./topo-0507-115-real.xml -x NCCL_MAX_NCHANNELS=20 -x NCCL_MIN_NCHANNELS=20 -x NCCL_P2P_LEVEL=SYS -x NCCL_ALGO=Ring -x HSA_FORCE_FINE_GRAIN_PCIE=1 -x HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./build/all_reduce_perf -b 2 -e 2G -f 2 -g 1
#-x NCCL_IB_HCA=mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_7:1,mlx5_8:1,mlx5_9:1,mlx5_10:1 \
#-x NCCL_TOPO_FILE=/data1/sunzhq/rccl-tests-develop/topo-0507-115-real.xml \
#-x NCCL_IB_QP_PER_CONNECTION=4 \
#-x NCCL_IB_HCA=mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_1:1,mlx5_10:1,mlx5_7:1,mlx5_8:1,mlx5_9:1 \
# -x NCCL_IB_HCA=mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_7,mlx5_8,mlx5_9,mlx5_10 \
# --mca plm rsh \
# --mca plm_rsh_agent ssh \
# -x NCCL_GRAPH_FILE=./topo.xml \
# -x NCCL_ALGO=tree,ring \
# --mca plm_rsh_args "-2 -o StrictHostKeyChecking=no" \ # launch over SSH
# -x NCCL_DEBUG=INFO # emit debug logs to localize problems
# -x NCCL_IB_GID_INDEX=3 # if the IB fabric has multiple subnets, pick the GID index
# ########################################################################
# Copyright 2022 Advanced Micro Devices, Inc.
# ########################################################################
# Compile common object library
# Shared test-harness objects (common.cu, timer.cc, verifiable.cu) built once
# and linked into every per-collective benchmark.
# The .cu sources are forced to the CXX language so the configured HIP/C++
# compiler handles them instead of a CUDA toolchain.
set_property(SOURCE common.cu timer.cc ../verifiable/verifiable.cu PROPERTY LANGUAGE CXX)
add_library(rccl_common OBJECT common.cu timer.cc ../verifiable/verifiable.cu)
target_link_libraries(rccl_common roc::rccl hip::device)
if(USE_MPI)
target_link_libraries(rccl_common MPI::MPI_CXX)
endif()
# Register a CTest entry that launches the target's binary through a path
# relative to the current binary directory, resolving the target's output
# directory/name properties and falling back to "." / the target name when
# those properties are unset.
function(add_relative_test test_name test_target)
get_target_property(EXE_PATH ${test_target} RUNTIME_OUTPUT_DIRECTORY)
if(EXE_PATH STREQUAL "EXE_PATH-NOTFOUND")
set(EXE_PATH ".")
endif()
# Normalize a possibly-relative output dir against the binary dir.
get_filename_component(EXE_PATH "${EXE_PATH}" ABSOLUTE BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
get_target_property(EXE_NAME ${test_target} RUNTIME_OUTPUT_NAME)
if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
# RUNTIME_OUTPUT_NAME unset: try the generic OUTPUT_NAME, then the target name.
get_target_property(EXE_NAME ${test_target} OUTPUT_NAME)
if(EXE_NAME STREQUAL "EXE_NAME-NOTFOUND")
set(EXE_NAME "${test_target}")
endif()
endif()
file(RELATIVE_PATH rel_path "${CMAKE_CURRENT_BINARY_DIR}" "${EXE_PATH}/${EXE_NAME}")
add_test(NAME "${test_name}" COMMAND "./${rel_path}")
endfunction()
# Declare one benchmark: build <TEST>.cu into <TEST>_perf, link it against the
# shared harness, register it with CTest, and install it with an RPATH that
# finds the installed RCCL/ROCm libraries.
function(add_rccl_test TEST)
set(TEST_SOURCE "${TEST}.cu")
# Compile the .cu source with the C++ compiler (see rccl_common above).
set_property(SOURCE ${TEST_SOURCE} PROPERTY LANGUAGE CXX)
set(TEST_TARGET "${TEST}_perf")
add_executable(${TEST_TARGET} ${TEST_SOURCE})
target_link_libraries(
${TEST_TARGET}
PRIVATE
rccl_common
)
# All binaries land directly in the top-level build directory.
set_target_properties(
${TEST_TARGET}
PROPERTIES
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
# LINKER_LANGUAGE CXX
)
add_relative_test(${TEST} ${TEST_TARGET})
rocm_install(TARGETS ${TEST_TARGET})
# TODO: copy/install DLLs on Windows
set_target_properties(
${TEST_TARGET} PROPERTIES
INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib;${ROCM_PATH}/lib"
)
endfunction()
# One <name>_perf benchmark binary + CTest entry per collective.
add_rccl_test(all_gather)
add_rccl_test(all_reduce)
add_rccl_test(alltoall)
add_rccl_test(alltoallv)
add_rccl_test(broadcast)
add_rccl_test(gather)
add_rccl_test(hypercube)
add_rccl_test(reduce_scatter)
add_rccl_test(reduce)
add_rccl_test(scatter)
add_rccl_test(sendrecv)
#
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE.txt for license information
#
# ---- User-overridable configuration ------------------------------------
ROCM_PATH ?= /opt/rocm
MPI_HOME ?= /usr/lib/openmpi
PREFIX ?= /usr/local
VERBOSE ?= 0
DEBUG ?= 0
NCCL_HOME ?= ""
# hipcc compiles and links the .cu sources; plain C++ files reuse it as CXX.
HIPCC = $(ROCM_PATH)/bin/hipcc
CXX = $(HIPCC)
HIPCUFLAGS := -std=c++14
LDFLAGS :=
HIPLDFLAGS :=
# NOTE(review): the default is the literal two-character string "", so this
# branch only fires when NCCL_HOME is overridden with a real path.
ifneq ($(NCCL_HOME), "")
HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) -L$(NCCL_HOME)/lib
endif
HIPCUFLAGS += -I$(ROCM_PATH)/include
HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
LDFLAGS += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt -pthread
# DEBUG=1 switches from -O3 to an unoptimized, fully symbolized build.
ifeq ($(DEBUG), 0)
HIPCUFLAGS += -O3
else
HIPCUFLAGS += -O0 -g -ggdb3
endif
# VERBOSE=1 echoes every recipe line instead of running silently.
ifeq ($(VERBOSE), 0)
.SILENT:
endif
.PHONY: build clean
BUILDDIR ?= ../build
# MPI=1 selects the Open MPI layout; MPICH=1 selects the system MPICH layout.
ifeq ($(MPI), 1)
HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi
else ifeq ($(MPICH), 1)
HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich -I/usr/include/x86_64-linux-gnu/mpich
HIPLDFLAGS += -L/usr/lib -lmpich
endif
LIBRARIES += rccl
HIPLDFLAGS += $(LIBRARIES:%=-l%)
# One <name>_perf binary per collective, built into $(BUILDDIR).
DST_DIR := $(BUILDDIR)
SRC_FILES := $(wildcard *.cu)
OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv alltoallv
BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
build: ${BIN_FILES}
clean:
rm -rf ${DST_DIR}
# Shared self-verification helpers live in ../verifiable.
TEST_VERIFIABLE_SRCDIR := ../verifiable
TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable
include ../verifiable/verifiable.mk
# Compile each .cu; objects also depend on common.h and the verifiable headers.
${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS)
@printf "Compiling %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"
$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<
${DST_DIR}/timer.o: timer.cc timer.h
@printf "Compiling %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
$(CXX) $(CXXFLAGS) -o $@ -c timer.cc
# Link each benchmark from its own object plus common/timer/verifiable objects.
${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS)
@printf "Linking %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
echo "$(HIPCC) -o $@ $(HIPCUFLAGS) $^ ${HIPLDFLAGS}"
$(HIPCC) -o $@ $(HIPCUFLAGS) $^ ${HIPLDFLAGS}
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
#define ALIGN 4

/* Size the AllGather buffers for a nominal element count: each rank sends
 * `base` elements (count divided over nranks, rounded down to an ALIGN
 * multiple) and receives base*nranks elements. sendInplaceOffset is the
 * per-rank stride used to place this rank's send block inside the receive
 * buffer for in-place runs. */
void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  const size_t base = (count / (ALIGN * nranks)) * ALIGN;
  *paramcount = base;
  *sendcount = base;
  *recvcount = base * nranks;
  *sendInplaceOffset = base;
  *recvInplaceOffset = 0;
}
// Seed the device buffers and host-side expected result for one AllGather
// repetition. Each rank's send block is generated from seed (33*rep + rank);
// the expected receive buffer is every rank's block concatenated in rank order.
testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
// Total ranks across processes * threads * GPUs * ranks-per-GPU.
int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
int k=0;  // flat buffer index over (gpu, local rank)
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
// Global rank of this (proc, thread, gpu, local-rank) tuple.
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
// In-place AllGather: this rank's send data lives at its slot inside recvbuff.
void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
// Expected output: rank j's block at byte offset j*sendBytes, same seed scheme.
for (int j=0; j<nranks; j++) {
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
}
k++;
}
HIPCHECK(hipDeviceSynchronize());
}
return testSuccess;
}
/* Convert one timed AllGather into bandwidth figures (GB/s): algBw counts the
 * full gathered payload (count*typesize*nranks bytes) per second; busBw
 * applies the standard (nranks-1)/nranks correction that excludes each
 * rank's own block. */
void AllGatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  const double alg = (double)(count * typesize * nranks) / 1.0E9 / sec;
  *algBw = alg;
  *busBw = alg * (((double)(nranks - 1)) / ((double)nranks));
}
// Adapter invoked by the timing driver: issue a single ncclAllGather.
// `op` and `root` are ignored — AllGather has neither a reduction nor a root.
testResult_t AllGatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
NCCLCHECK(ncclAllGather(sendbuff, recvbuff, count, type, comm, stream));
return testSuccess;
}
// Dispatch table binding the generic perf driver to the AllGather helpers above.
struct testColl allGatherTest = {
"AllGather",
AllGatherGetCollByteCount,
AllGatherInitData,
AllGatherGetBw,
AllGatherRunColl
};
/* Report the send/recv buffer element counts to the driver; the remaining
 * byte-count outputs are scratch here. */
void AllGatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t scratchParam, scratchSendOff, scratchRecvOff;
  AllGatherGetCollByteCount(sendcount, recvcount, &scratchParam, &scratchSendOff, &scratchRecvOff, count, nranks);
}

/* Drive the timed AllGather runs: a single explicitly requested type, or a
 * sweep over the registered test types. AllGather has no reduction op or
 * root, so a dummy op and "none" are passed through. */
testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &allGatherTest;
  ncclDataType_t *typeList;
  const char **nameList;
  int numTypes;
  if ((int)type == -1) {
    // No explicit type requested: sweep the full registered set.
    numTypes = test_typenum;
    typeList = test_types;
    nameList = test_typenames;
  } else {
    numTypes = 1;
    typeList = &type;
    nameList = &typeName;
  }
  for (int t = 0; t < numTypes; t++) {
    TESTCHECK(TimeTest(args, typeList[t], nameList[t], (ncclRedOp_t)0, "none", -1));
  }
  return testSuccess;
}
// Entry hook consumed by the shared test driver; exactly one per test binary.
struct testEngine ncclTestEngine = {
AllGatherGetBuffSize,
AllGatherRunTest
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
/* AllReduce moves `count` elements end-to-end: send, receive and parameter
 * counts are identical and there are no in-place offsets. */
void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  *sendInplaceOffset = 0;
  *recvInplaceOffset = 0;
  *sendcount = count;
  *recvcount = count;
  *paramcount = count;
}
// Seed device buffers and the host-side expected result for one AllReduce
// repetition: per-rank input seeded by (rep, rank), and the expected buffer
// holds the reduction of all ranks' inputs.
testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
// Total ranks across processes * threads * GPUs * ranks-per-GPU.
int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
int k = 0;  // flat buffer index over (gpu, local rank)
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
// Global rank of this (proc, thread, gpu, local-rank) tuple.
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
// In-place AllReduce reads and writes the same buffer.
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
// Host-side reference: the op applied across all nranks inputs.
TESTCHECK(InitDataReduce(args->expected[k], recvcount, 0, type, op, rep, nranks));
k++;
}
HIPCHECK(hipDeviceSynchronize());
}
return testSuccess;
}
/* Convert one timed AllReduce into bandwidth figures (GB/s): algBw is payload
 * bytes per second; busBw applies the ring factor 2*(n-1)/n. */
void AllReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  const double alg = (double)(count * typesize) / 1.0E9 / sec;
  *algBw = alg;
  *busBw = alg * (((double)(2*(nranks - 1))) / ((double)nranks));
}
// Adapter invoked by the timing driver: issue a single ncclAllReduce.
// `root` is ignored — AllReduce has no root.
testResult_t AllReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
NCCLCHECK(ncclAllReduce(sendbuff, recvbuff, count, type, op, comm, stream));
return testSuccess;
}
// Dispatch table binding the generic perf driver to the AllReduce helpers above.
struct testColl allReduceTest = {
"AllReduce",
AllReduceGetCollByteCount,
AllReduceInitData,
AllReduceGetBw,
AllReduceRunColl
};
/* Report the send/recv buffer element counts to the driver; the remaining
 * byte-count outputs are scratch here. */
void AllReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t scratchParam, scratchSendOff, scratchRecvOff;
  AllReduceGetCollByteCount(sendcount, recvcount, &scratchParam, &scratchSendOff, &scratchRecvOff, count, nranks);
}

/* Drive the timed AllReduce runs over every requested (type, op) combination:
 * a single explicit type/op, or sweeps over the registered sets. */
testResult_t AllReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &allReduceTest;

  // Resolve the data-type sweep.
  ncclDataType_t *typeList;
  const char **typeNameList;
  int numTypes;
  if ((int)type == -1) {
    numTypes = test_typenum;
    typeList = test_types;
    typeNameList = test_typenames;
  } else {
    numTypes = 1;
    typeList = &type;
    typeNameList = &typeName;
  }

  // Resolve the reduction-op sweep the same way.
  ncclRedOp_t *opList;
  const char **opNameList;
  int numOps;
  if ((int)op == -1) {
    numOps = test_opnum;
    opList = test_ops;
    opNameList = test_opnames;
  } else {
    numOps = 1;
    opList = &op;
    opNameList = &opName;
  }

  // Time every (type, op) pair; AllReduce has no root (-1).
  for (int t = 0; t < numTypes; t++) {
    for (int o = 0; o < numOps; o++) {
      TESTCHECK(TimeTest(args, typeList[t], typeNameList[t], opList[o], opNameList[o], -1));
    }
  }
  return testSuccess;
}
// Entry hook consumed by the shared test driver; exactly one per test binary.
struct testEngine ncclTestEngine = {
AllReduceGetBuffSize,
AllReduceRunTest
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
/* AlltoAll sends/receives `count` rounded down to a multiple of nranks;
 * each peer gets an equal count/nranks slice (paramcount). */
void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  const size_t perPeer = count / nranks;
  *paramcount = perPeer;
  *sendcount = perPeer * nranks;
  *recvcount = perPeer * nranks;
  *sendInplaceOffset = 0;
  *recvInplaceOffset = 0;
}
// Seed device buffers and host-side expected result for one AlltoAll
// repetition. Sender data is seeded by (33*rep + rank); the expected buffer
// for this rank holds, for every peer j, peer j's slice addressed to us.
testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
// Total ranks across processes * threads * GPUs * ranks-per-GPU.
int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
int k=0;  // flat buffer index over (gpu, local rank)
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
// Global rank of this (proc, thread, gpu, local-rank) tuple.
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
// Expected: slot j holds the partcount-sized slice that rank j sends to us
// (offset rank*partcount inside rank j's send buffer, seed 33*rep + j).
for (int j=0; j<nranks; j++) {
size_t partcount = sendcount/nranks;
TESTCHECK(InitData(((char*)args->expected[k])+ j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0));
}
k++;
}
HIPCHECK(hipDeviceSynchronize());
}
// We don't support in-place alltoall
args->reportErrors = in_place ? 0 : 1;
return testSuccess;
}
/* Convert one timed AlltoAll into bandwidth figures (GB/s): algBw counts the
 * full exchanged payload (count*nranks*typesize bytes) per second; busBw
 * applies the (nranks-1)/nranks correction for the local slice. */
void AlltoAllGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  const double alg = (double)(count * nranks * typesize) / 1.0E9 / sec;
  *algBw = alg;
  *busBw = alg * (((double)(nranks-1)) / ((double)(nranks)));
}
// Adapter invoked by the timing driver: issue a single ncclAllToAll.
// NOTE(review): ncclAllToAll is an RCCL extension, not part of stock NCCL.
// `op` and `root` are ignored.
testResult_t AlltoAllRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
NCCLCHECK(ncclAllToAll(sendbuff, recvbuff, count, type, comm, stream));
return testSuccess;
}
// Dispatch table binding the generic perf driver to the AlltoAll helpers above.
struct testColl alltoAllTest = {
"AlltoAll",
AlltoAllGetCollByteCount,
AlltoAllInitData,
AlltoAllGetBw,
AlltoAllRunColl
};
/* Report the send/recv buffer element counts to the driver; the remaining
 * byte-count outputs are scratch here. */
void AlltoAllGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t scratchParam, scratchSendOff, scratchRecvOff;
  AlltoAllGetCollByteCount(sendcount, recvcount, &scratchParam, &scratchSendOff, &scratchRecvOff, count, nranks);
}

/* Drive the timed AlltoAll runs: a single explicitly requested type, or a
 * sweep over the registered test types. No reduction op or root. */
testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &alltoAllTest;
  ncclDataType_t *typeList;
  const char **nameList;
  int numTypes;
  if ((int)type == -1) {
    // No explicit type requested: sweep the full registered set.
    numTypes = test_typenum;
    typeList = test_types;
    nameList = test_typenames;
  } else {
    numTypes = 1;
    typeList = &type;
    nameList = &typeName;
  }
  for (int t = 0; t < numTypes; t++) {
    TESTCHECK(TimeTest(args, typeList[t], nameList[t], (ncclRedOp_t)0, "none", -1));
  }
  return testSuccess;
}
// Entry hook consumed by the shared test driver; exactly one per test binary.
struct testEngine ncclTestEngine = {
AlltoAllGetBuffSize,
AlltoAllRunTest
};
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
#define USE_RCCL_GATHER_SCATTER
/* AlltoAllv buffer sizing. Counts below nranks^2/2 cannot populate the
 * staircase schedule used by this test, so such steps are skipped with
 * all-zero sizes; otherwise behave like AlltoAll (count rounded down to a
 * multiple of nranks, equal per-peer parameter count). */
void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  size_t total = 0;
  size_t perPeer = 0;
  if (count >= (size_t)(nranks*nranks/2)) {
    perPeer = count / nranks;
    total = perPeer * nranks;
  }
  *sendcount = total;
  *recvcount = total;
  *paramcount = perPeer;
  *sendInplaceOffset = 0;
  *recvInplaceOffset = 0;
}
// Seed device buffers and host-side expected result for one AlltoAllv
// repetition. The variable-size "staircase" schedule mirrors the one computed
// in AlltoAllvRunColl: peer (j+rank)%nranks receives that many chunks of
// size chunksize, with the wrap-around peer absorbing the remainder.
testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
// Total ranks across processes * threads * GPUs * ranks-per-GPU.
int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
int k=0;  // flat buffer index over (gpu, local rank)
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
// Global rank of this (proc, thread, gpu, local-rank) tuple.
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep+rank, 1, 0));
// Debug aid: dump this rank's generated send buffer to stdout.
#if 0
int *dataHost = (int *)malloc(args->sendBytes);
hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost);
printf(" Rank [%d] Original: ", rank);
for(int j=0; j<sendcount; j++) {
printf("%d:%d ", j, dataHost[j]);
}
printf("\n");
free(dataHost);
#endif
size_t rdisp = 0;  // running receive displacement into expected[k]
// NOTE(review): data_count/chunksize must match the chunksize formula in
// AlltoAllvRunColl (count*2/nranks with count = sendcount/nranks) — confirm
// the two stay in sync if either schedule changes.
size_t data_count = sendcount*2/nranks;
size_t chunksize = data_count/nranks;
for (int j=0; j<nranks; j++) {
// rcount: number of elements rank j sends to this rank (staircase rule;
// the wrap-around peer absorbs the remainder so totals add up).
size_t scount = 0, rcount = ((j+rank)%nranks)*chunksize;
if ((j+rank)%nranks == 0)
rcount += (sendcount-chunksize*(nranks-1)*nranks/2);
// sdisp: offset of our slice inside rank j's send buffer — accumulate
// the sizes rank j sends to ranks 0..rank-1.
size_t sdisp = 0;
for (int kk=0; kk<nranks; kk++) {
scount = ((kk+j)%nranks)*chunksize;
if ((kk+j)%nranks == 0)
scount += (sendcount-chunksize*(nranks-1)*nranks/2);
if (kk == rank)
break;
sdisp += scount;
}
TESTCHECK(InitData(((char*)args->expected[k])+rdisp*wordSize(type), rcount, sdisp, type, ncclSum, 33*rep+j, 1, 0));
rdisp += rcount;
}
k++;
}
HIPCHECK(hipDeviceSynchronize());
}
// We don't support in-place alltoall
args->reportErrors = in_place ? 0 : 1;
return testSuccess;
}
/* Convert one timed AlltoAllv into bandwidth figures (GB/s); identical model
 * to AlltoAll: full payload per second, with the (nranks-1)/nranks factor. */
void AlltoAllvGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  const double alg = (double)(count * nranks * typesize) / 1.0E9 / sec;
  *algBw = alg;
  *busBw = alg * (((double)(nranks-1)) / ((double)(nranks)));
}
testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
int nranks;
NCCLCHECK(ncclCommCount(comm, &nranks));
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
if (count == 0) return testSuccess;
size_t *sendcounts, *recvcounts, *sdispls, *rdispls;
sendcounts = (size_t *)malloc(nranks*nranks*sizeof(size_t));
recvcounts = (size_t *)malloc(nranks*nranks*sizeof(size_t));
sdispls = (size_t *)malloc(nranks*nranks*sizeof(size_t));
rdispls = (size_t *)malloc(nranks*nranks*sizeof(size_t));
if (sendcounts == nullptr || recvcounts == nullptr || sdispls == nullptr || rdispls == nullptr) {
printf("failed to allocate buffers for alltoallv\n");
return testNcclError;
}
size_t disp = 0;
size_t chunksize = count*2/nranks;
for (int i = 0; i < nranks; i++) {
size_t scount = ((i+rank)%nranks)*chunksize;
if ((i+rank)%nranks == 0)
scount += (count*nranks-chunksize*(nranks-1)*nranks/2);
sendcounts[i+rank*nranks] = recvcounts[i+rank*nranks] = scount;
sdispls[i+rank*nranks] = rdispls[i+rank*nranks] = disp;
disp += scount;
//printf("%d->%d: sendcounts/recvcounts %lx sdispls/rdispls %lx\n", rank, i, sendcounts[i+rank*nranks]*wordSize(type), sdispls[i+rank*nranks]*wordSize(type));
}
#if NCCL_MAJOR < 2 || NCCL_MINOR < 7
printf("NCCL 2.7 or later is needed for alltoallv. This test was compiled with %d.%d.\n", NCCL_MAJOR, NCCL_MINOR);
return testNcclError;
#else
#if defined(RCCL_ALLTOALLV) && defined(USE_RCCL_GATHER_SCATTER)
NCCLCHECK(ncclAllToAllv(sendbuff, sendcounts+rank*nranks, sdispls+rank*nranks, recvbuff, recvcounts+rank*nranks, rdispls+rank*nranks, type, comm, stream));
#else
NCCLCHECK(ncclGroupStart());
for (int r=0; r<nranks; r++) {
if (sendcounts[r+rank*nranks] != 0) {
NCCLCHECK(ncclSend(
((char*)sendbuff) + sdispls[r+rank*nranks] * wordSize(type),
sendcounts[r+rank*nranks],
type,
r,
comm,
stream));
}
if (recvcounts[r+rank*nranks] != 0) {
NCCLCHECK(ncclRecv(
((char*)recvbuff) + rdispls[r+rank*nranks] * wordSize(type),
recvcounts[r+rank*nranks],
type,
r,
comm,
stream));
}
}
NCCLCHECK(ncclGroupEnd());
#endif
#endif
free(sendcounts);
free(recvcounts);
free(sdispls);
free(rdispls);
return testSuccess;
}
// Dispatch table binding the generic perf driver to the AlltoAllv helpers.
// (Reuses the identifier `alltoAllTest`; each collective builds into its own
// binary, so there is no clash with alltoall.cu.)
struct testColl alltoAllTest = {
"AlltoAllv",
AlltoAllvGetCollByteCount,
AlltoAllvInitData,
AlltoAllvGetBw,
AlltoAllvRunColl
};
/* Report the send/recv buffer element counts to the driver; the remaining
 * byte-count outputs are scratch here. */
void AlltoAllvGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t scratchParam, scratchSendOff, scratchRecvOff;
  AlltoAllvGetCollByteCount(sendcount, recvcount, &scratchParam, &scratchSendOff, &scratchRecvOff, count, nranks);
}
/* Drive the timed AlltoAllv runs: a single explicitly requested type, or a
 * sweep over the registered test types. No reduction op or root.
 * BUGFIX: the sweep bound was ncclNumTypes (the size of the whole enum);
 * every sibling RunTest sweeps test_typenum, the number of entries actually
 * populated in test_types/test_typenames, so the old bound could index past
 * the populated slots. */
testResult_t AlltoAllvRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &alltoAllTest;
  ncclDataType_t *run_types;
  const char **run_typenames;
  int type_count;
  if ((int)type != -1) {
    type_count = 1;
    run_types = &type;
    run_typenames = &typeName;
  } else {
    type_count = test_typenum;  // was: ncclNumTypes
    run_types = test_types;
    run_typenames = test_typenames;
  }
  for (int i=0; i<type_count; i++) {
    TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
  }
  return testSuccess;
}
// Entry hook consumed by the shared test driver; exactly one per test binary.
struct testEngine ncclTestEngine = {
AlltoAllvGetBuffSize,
AlltoAllvRunTest
};
/*************************************************************************
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
/* Broadcast moves `count` elements unmodified through every rank: send,
 * receive and parameter counts are identical, no in-place offsets. */
void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  *sendInplaceOffset = 0;
  *recvInplaceOffset = 0;
  *sendcount = count;
  *recvcount = count;
  *paramcount = count;
}
// Seed device buffers and host-side expected result for one Broadcast
// repetition: only the root rank's send buffer gets data (seed = rep);
// every rank's expected buffer is that same data.
testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
size_t sendcount = args->sendBytes / wordSize(type);
size_t recvcount = args->expectedBytes / wordSize(type);
int k=0;  // flat buffer index over (gpu, local rank)
for (int i=0; i<args->nGpus; i++) {
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
// Global rank of this (proc, thread, gpu, local-rank) tuple.
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
// In-place Broadcast uses the receive buffer as the root's source too.
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
TESTCHECK(InitData(args->expected[k], recvcount, 0, type, ncclSum, rep, 1, 0));
k++;
}
HIPCHECK(hipDeviceSynchronize());
}
return testSuccess;
}
/* Convert one timed Broadcast into bandwidth figures (GB/s): for broadcast
 * the bus-bandwidth factor is 1, so both outputs coincide. */
void BroadcastGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  const double alg = (double)(count * typesize) / 1.0E9 / sec;
  *algBw = alg;
  *busBw = alg;
}
testResult_t BroadcastRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
#if NCCL_MAJOR >= 2 && NCCL_MINOR >= 2
NCCLCHECK(ncclBroadcast(sendbuff, recvbuff, count, type, root, comm, stream));
#else
if (rank == root) {
NCCLCHECK(ncclBcast(sendbuff, count, type, root, comm, stream));
} else {
NCCLCHECK(ncclBcast(recvbuff, count, type, root, comm, stream));
}
#endif
return testSuccess;
}
// Dispatch table binding the generic perf driver to the Broadcast helpers.
struct testColl broadcastTest = {
"Broadcast",
BroadcastGetCollByteCount,
BroadcastInitData,
BroadcastGetBw,
BroadcastRunColl
};
/* Report the send/recv buffer element counts to the driver; the remaining
 * byte-count outputs are scratch here. */
void BroadcastGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t scratchParam, scratchSendOff, scratchRecvOff;
  BroadcastGetCollByteCount(sendcount, recvcount, &scratchParam, &scratchSendOff, &scratchRecvOff, count, nranks);
}

/* Drive the timed Broadcast runs over the requested (or full) type set and,
 * when root == -1, over every candidate root rank. */
testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &broadcastTest;
  ncclDataType_t *typeList;
  const char **nameList;
  int numTypes;
  if ((int)type == -1) {
    numTypes = test_typenum;
    typeList = test_types;
    nameList = test_typenames;
  } else {
    numTypes = 1;
    typeList = &type;
    nameList = &typeName;
  }
  int firstRoot, lastRoot;
  if (root == -1) {
    // NOTE(review): the root sweep covers nProcs*nThreads*nGpus ranks and does
    // not multiply by args->nRanks, unlike the rank arithmetic in InitData —
    // confirm whether multi-rank-per-GPU runs should sweep the extra roots.
    firstRoot = 0;
    lastRoot = args->nProcs*args->nThreads*args->nGpus-1;
  } else {
    firstRoot = lastRoot = root;
  }
  for (int t = 0; t < numTypes; t++) {
    for (int r = firstRoot; r <= lastRoot; r++) {
      TESTCHECK(TimeTest(args, typeList[t], nameList[t], (ncclRedOp_t)0, "none", r));
    }
  }
  return testSuccess;
}
// Entry hook consumed by the shared test driver; exactly one per test binary.
struct testEngine ncclTestEngine = {
BroadcastGetBuffSize,
BroadcastRunTest
};
This diff is collapsed.
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef __COMMON_H__
#define __COMMON_H__
#include "rccl/rccl.h"
#include <stdio.h>
#include <cstdint>
#include <algorithm>
#ifdef MPI_SUPPORT
#include "mpi.h"
#endif
#include <pthread.h>
#include "nccl1_compat.h"
#include "timer.h"
// For nccl.h < 2.13 since we define a weak fallback
extern "C" char const* ncclGetLastError(ncclComm_t comm);
// HIPCHECK: wrap a HIP runtime call; on failure print host/location/error and
// return testCudaError from the ENCLOSING function — so it may only be used
// inside functions returning testResult_t.
#define HIPCHECK(cmd) do { \
hipError_t e = cmd; \
if( e != hipSuccess ) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
printf("%s: Test HIP failure %s:%d '%s'\n", \
hostname, \
__FILE__,__LINE__,hipGetErrorString(e)); \
return testCudaError; \
} \
} while(0)
// NCCLCHECK: same pattern for NCCL/RCCL calls, returning testNcclError.
// The >= 2.13 variant additionally reports ncclGetLastError() for context.
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,13,0)
#define NCCLCHECK(cmd) do { \
ncclResult_t res = cmd; \
if (res != ncclSuccess) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
printf("%s: Test NCCL failure %s:%d " \
"'%s / %s'\n", \
hostname,__FILE__,__LINE__, \
ncclGetErrorString(res), \
ncclGetLastError(NULL)); \
return testNcclError; \
} \
} while(0)
#else
// Pre-2.13 fallback: ncclGetLastError is unavailable (see the weak
// declaration above), so only the error string is printed.
#define NCCLCHECK(cmd) do { \
ncclResult_t res = cmd; \
if (res != ncclSuccess) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
printf("%s: Test NCCL failure %s:%d '%s'\n", \
hostname, \
__FILE__,__LINE__,ncclGetErrorString(res)); \
return testNcclError; \
} \
} while(0)
#endif
typedef enum {
testSuccess = 0,
testInternalError = 1,
testCudaError = 2,
testNcclError = 3,
testTimeout = 4,
testNumResults = 5
} testResult_t;
// Relay errors up and trace
#define TESTCHECK(cmd) do { \
testResult_t r = cmd; \
if (r!= testSuccess) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
printf(" .. %s pid %d: Test failure %s:%d\n", \
hostname, getpid(), \
__FILE__,__LINE__); \
return r; \
} \
} while(0)
// Per-collective vtable: each <coll>.cpp fills one of these and the generic
// driver (common.cu) runs the test through its hooks.
struct testColl {
  const char name[20];
  // Translate a requested element count into send/recv/param counts and the
  // in-place offsets for this collective.
  void (*getCollByteCount)(
      size_t *sendcount, size_t *recvcount, size_t *paramcount,
      size_t *sendInplaceOffset, size_t *recvInplaceOffset,
      size_t count, int nranks);
  // Fill device buffers and the host-side expected result for one iteration.
  testResult_t (*initData)(struct threadArgs* args, ncclDataType_t type,
      ncclRedOp_t op, int root, int rep, int in_place);
  // Convert a measured time into algorithm and bus bandwidth.
  void (*getBw)(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks);
  // Enqueue the collective itself on `stream`.
  testResult_t (*runColl)(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type,
      ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream);
};

extern struct testColl allReduceTest;
extern struct testColl allGatherTest;
extern struct testColl reduceScatterTest;
extern struct testColl broadcastTest;
extern struct testColl reduceTest;
extern struct testColl alltoAllTest;

// Per-binary entry points; each collective source defines one ncclTestEngine.
struct testEngine {
  void (*getBuffSize)(size_t *sendcount, size_t *recvcount, size_t count, int nranks);
  testResult_t (*runTest)(struct threadArgs* args, int root, ncclDataType_t type,
      const char* typeName, ncclRedOp_t op, const char* opName);
};
extern struct testEngine ncclTestEngine;

// Everything one worker thread needs: problem-size sweep parameters, its
// process/thread/GPU coordinates, the buffers it owns, its communicators and
// streams, and the result accumulators it reports into.
struct threadArgs {
  size_t nbytes;
  size_t minbytes;
  size_t maxbytes;
  size_t stepbytes;
  size_t stepfactor;

  int totalProcs;
  int nProcs;
  int proc;
  int nThreads;
  int thread;
  int nGpus;
  int* gpus;          // device ids handled by this thread
  int localRank;
  int localNumDevices;
  int enable_multiranks;
  int enable_out_of_place;
  int nRanks;         // ranks per GPU (multirank mode)
  void** sendbuffs;
  size_t sendBytes;
  size_t sendInplaceOffset;
  void** recvbuffs;
  size_t recvInplaceOffset;
  ncclUniqueId ncclId;
  ncclComm_t* comms;
  hipStream_t* streams;

  void** expected;    // host-computed reference results, one per buffer slot
  size_t expectedBytes;
  int* errors;
  double* bw;
  int* bw_count;

  int reportErrors;

  struct testColl* collTest;  // vtable of the collective under test
};

typedef testResult_t (*threadFunc_t)(struct threadArgs* args);

// One worker thread plus its arguments and final result.
struct testThread {
  pthread_t thread;
  threadFunc_t func;
  struct threadArgs args;
  testResult_t ret;
};

// Provided by common.cu
extern void Barrier(struct threadArgs* args);
extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root);
extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks);
extern testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank);
extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks);
#include <unistd.h>
/* Copy this machine's hostname into `hostname`, truncated at the first '.'
 * so "node1.cluster" becomes "node1".
 *
 * Fix over the original: POSIX leaves the buffer unterminated when the name
 * is truncated by gethostname() (and unspecified on failure), so we clear the
 * buffer first and force NUL-termination before scanning for the dot. */
static void getHostName(char* hostname, int maxlen) {
  if (maxlen <= 0) return;
  hostname[0] = '\0';                 // defined contents even if gethostname fails
  gethostname(hostname, maxlen);
  hostname[maxlen-1] = '\0';          // gethostname may not terminate on truncation
  for (int i=0; i< maxlen; i++) {
    if (hostname[i] == '.') {
      hostname[i] = '\0';
      return;
    }
  }
}
#include <stdint.h>
/* DJB2a hash (h = h*33 ^ byte) over the first n bytes of `string`. */
static uint64_t getHash(const char* string, size_t n) {
  uint64_t h = 5381;
  const char* end = string + n;
  while (string < end) {
    h = (h * 33) ^ (uint64_t)(*string++);
  }
  return h;
}
/* Generate a hash of the unique identifying string for this host
* that will be unique for both bare-metal and container instances
* Equivalent of a hash of;
*
* $(hostname)$(cat /proc/sys/kernel/random/boot_id)
*
*/
#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
/* Hash a unique identifier for this host: the hostname concatenated with the
 * kernel boot id, so containers on the same machine hash equal while distinct
 * machines differ. Falls back to hashing the hostname alone when the boot-id
 * file is unreadable.
 *
 * Fixes over the original:
 *  - fclose(file) was called even when fopen() returned NULL, which is
 *    undefined behavior (crashes on glibc); now only opened files are closed.
 *  - strncpy() could leave hostHash unterminated before strlen() read it;
 *    termination is now guaranteed before every strlen(). */
static uint64_t getHostHash(const char* hostname) {
  char hostHash[1024];

  // Fall back is the hostname if something fails
  (void) strncpy(hostHash, hostname, sizeof(hostHash)-1);
  hostHash[sizeof(hostHash)-1]='\0';
  int offset = strlen(hostHash);

  FILE *file = fopen(HOSTID_FILE, "r");
  if (file != NULL) {
    char *p;
    if (fscanf(file, "%ms", &p) == 1) {   // %ms: glibc allocates p for us
      strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
      free(p);
    }
    fclose(file);                         // only close what was opened
  }

  // Make sure the string is terminated
  hostHash[sizeof(hostHash)-1]='\0';
  return getHash(hostHash, strlen(hostHash));
}
// Size in bytes of one element of the given NCCL datatype; 0 for unknown.
// The #if guards track which enumerators exist in the NCCL/RCCL headers
// this file is built against.
static size_t wordSize(ncclDataType_t type) {
  switch(type) {
    case ncclChar:
#if NCCL_MAJOR >= 2
    //case ncclInt8:
    case ncclUint8:
#endif
      return 1;
    case ncclHalf:
#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
    case ncclBfloat16:
#endif
    //case ncclFloat16:
      return 2;
    case ncclInt:
    case ncclFloat:
#if NCCL_MAJOR >= 2
    //case ncclInt32:
    case ncclUint32:
    //case ncclFloat32:
#endif
      return 4;
    case ncclInt64:
    case ncclUint64:
    case ncclDouble:
    //case ncclFloat64:
      return 8;
    default: return 0;
  }
}
extern int test_ncclVersion; // init'd with ncclGetVersion()

// Memory placement used for test buffers.
typedef enum { ncclCoarse = 0,
               ncclFine = 1,
               ncclHost = 2,
               ncclManaged = 3,
               nccl_NUM_MTYPES = 4 } ncclMemoryType_t;
extern const char *test_memorytypes[nccl_NUM_MTYPES];

// The op table gains one extra slot from NCCL 2.11 on (presumably ncclAvg —
// confirm against the NCCL headers).
constexpr int test_opNumMax = (int)ncclNumOps + (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) ? 1 : 0);
extern int test_opnum;
extern int test_typenum;
// Parallel arrays: test_types[i] is named by test_typenames[i]; likewise ops.
extern ncclDataType_t test_types[ncclNumTypes];
extern const char *test_typenames[ncclNumTypes];
extern ncclRedOp_t test_ops[];
extern const char *test_opnames[];
/* Map a datatype name to its index in test_typenames. "all" maps to -1
 * (sweep every type); unknown names warn and fall back to ncclFloat. */
static int ncclstringtotype(char *str) {
  int match = -2;
  for (int t = 0; t < ncclNumTypes && match == -2; t++) {
    if (strcmp(str, test_typenames[t]) == 0) match = t;
  }
  if (match != -2) return match;
  if (strcmp(str, "all") == 0) return -1;
  printf("invalid type %s, defaulting to %s .. \n", str, test_typenames[ncclFloat]);
  return ncclFloat;
}
/* Map a reduction-op name to its index in test_opnames. "all" maps to -1
 * (sweep every op); unknown names warn and fall back to ncclSum. */
static int ncclstringtoop (char *str) {
  int match = -2;
  for (int o = 0; o < test_opnum && match == -2; o++) {
    if (strcmp(str, test_opnames[o]) == 0) match = o;
  }
  if (match != -2) return match;
  if (strcmp(str, "all") == 0) return -1;
  printf("invalid op %s, defaulting to %s .. \n", str, test_opnames[ncclSum]);
  return ncclSum;
}
/* Map a memory-type name to its index in test_memorytypes; unknown names
 * warn and fall back to ncclCoarse. */
static int ncclstringtomtype (char *str) {
  int idx = 0;
  while (idx < nccl_NUM_MTYPES) {
    if (strcmp(str, test_memorytypes[idx]) == 0) return idx;
    idx++;
  }
  printf("invalid memorytype %s, defaulting to %s .. \n", str, test_memorytypes[ncclCoarse]);
  return ncclCoarse;
}
extern int is_main_proc;
extern thread_local int is_main_thread;
// Print only from the main thread so per-thread output does not interleave.
#define PRINT if (is_main_thread) printf
#endif
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "hip/hip_runtime.h"
#include "common.h"
/* Byte-count policy for Gather: each rank contributes a count/nranks chunk
 * and the root receives one chunk per rank. The in-place send chunk lives at
 * rank*chunk inside the receive buffer. */
void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  const size_t chunk = count / nranks;
  *paramcount = chunk;
  *sendcount = chunk;
  *recvcount = chunk * nranks;
  *sendInplaceOffset = chunk;
  *recvInplaceOffset = 0;
}
// Prepare send/recv/expected buffers for one Gather iteration: each rank's
// send chunk is seeded with its rank offset; the root's expected buffer is
// rebuilt as the concatenation of all ranks' chunks.
testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
  size_t sendcount = args->sendBytes / wordSize(type);
  size_t recvcount = args->expectedBytes / wordSize(type);
  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;

  int k=0;
  for (int i=0; i<args->nGpus; i++) {
    HIPCHECK(hipSetDevice(args->gpus[i]));
    for (int l=0; l<args->nRanks; l++) {
      // Global rank owning buffer slot k.
      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
      // In-place: this rank's chunk sits at its slot of the receive buffer.
      void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
      TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
      HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
      if (rank == root) {
        // NOTE(review): every iteration of this loop calls InitData with
        // identical arguments (j is unused), re-initializing the same
        // nranks*sendcount span nranks times. Possibly the offset/seed was
        // meant to depend on j — confirm against the upstream gather test.
        for (int j=0; j<nranks; j++) {
          TESTCHECK(InitData(((char*)args->expected[k]), nranks*sendcount, 0, type, ncclSum, rep, 1, 0));
        }
      }
      k++;
    }
    HIPCHECK(hipDeviceSynchronize());
  }
  return testSuccess;
}
/* Gather bandwidth: algorithm BW counts the full gathered payload
 * (count*nranks elements of typesize bytes); bus BW applies the standard
 * (nranks-1)/nranks correction since the root's own chunk crosses no link. */
void GatherGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  double wireBw = (double)(count * nranks * typesize) / 1.0E9 / sec;
  double correction = ((double)(nranks-1))/((double)(nranks));
  *algBw = wireBw;
  *busBw = wireBw * correction;
}
/* Issue one Gather as grouped point-to-point calls: every rank sends its
 * chunk to the root; the root posts one receive per rank into the matching
 * slot of recvbuff. A zero-size count is a no-op. */
testResult_t GatherRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  int commSize, myRank;
  NCCLCHECK(ncclCommCount(comm, &commSize));
  NCCLCHECK(ncclCommUserRank(comm, &myRank));
  if (count == 0) return testSuccess;

  size_t chunkBytes = count * wordSize(type);
  NCCLCHECK(ncclGroupStart());
  NCCLCHECK(ncclSend(sendbuff, count, type, root, comm, stream));
  if (myRank == root) {
    for (int peer = 0; peer < commSize; peer++) {
      char* dst = (char*)recvbuff + (size_t)peer * chunkBytes;
      NCCLCHECK(ncclRecv(dst, count, type, peer, comm, stream));
    }
  }
  NCCLCHECK(ncclGroupEnd());
  return testSuccess;
}
// Gather test vtable consumed by the generic driver.
struct testColl gatherTest = {
  "Gather",
  GatherGetCollByteCount,
  GatherInitData,
  GatherGetBw,
  GatherRunColl
};

// Buffer-size query: delegate to GatherGetCollByteCount, dropping the
// outputs the driver does not need here.
void GatherGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
  GatherGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
// Gather ignores the reduction op; sweep the requested (or all) datatypes
// and the requested (or all) roots, timing each combination.
testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &gatherTest;

  // type == -1 means "sweep every registered datatype".
  ncclDataType_t* typeList = &type;
  const char** typeNameList = &typeName;
  int nTypes = 1;
  if ((int)type == -1) {
    nTypes = test_typenum;
    typeList = test_types;
    typeNameList = test_typenames;
  }

  // root == -1 means "try every rank as root".
  int firstRoot = root, lastRoot = root;
  if (root == -1) {
    firstRoot = 0;
    lastRoot = args->nProcs*args->nThreads*args->nGpus - 1;
  }

  for (int t = 0; t < nTypes; t++) {
    for (int r = firstRoot; r <= lastRoot; r++) {
      TESTCHECK(TimeTest(args, typeList[t], typeNameList[t], (ncclRedOp_t)0, "none", r));
    }
  }
  return testSuccess;
}
// Entry points exported to the generic test driver for the Gather binary.
struct testEngine ncclTestEngine = {
  GatherGetBuffSize,
  GatherRunTest
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "hip/hip_runtime.h"
#include "common.h"
#define ALIGN 4
/* Byte-count policy for the hypercube AllGather: the per-rank chunk is
 * count/nranks rounded down to a multiple of ALIGN elements; the receive
 * buffer holds one chunk per rank. */
void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  size_t chunk = (count/(ALIGN*nranks))*ALIGN;
  *paramcount = chunk;
  *sendcount = chunk;
  *recvcount = chunk*nranks;
  *sendInplaceOffset = chunk;
  *recvInplaceOffset = 0;
}
// Prepare buffers for the hypercube AllGather: every rank's send chunk is
// seeded from (33*rep + rank); the expected buffer is the concatenation of
// all ranks' chunks in rank order.
testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
  size_t sendcount = args->sendBytes / wordSize(type);
  size_t recvcount = args->expectedBytes / wordSize(type);
  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;

  int k=0;
  for (int i=0; i<args->nGpus; i++) {
    HIPCHECK(hipSetDevice(args->gpus[i]));
    for (int l=0; l<args->nRanks; l++) {
      // Global rank owning buffer slot k.
      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
      // In-place: this rank's chunk sits at its slot of the receive buffer.
      void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
      TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
      // Expected chunk j carries the data rank j was seeded with.
      for (int j=0; j<nranks; j++) {
        TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
      }
      k++;
    }
    HIPCHECK(hipDeviceSynchronize());
  }
  return testSuccess;
}
/* AllGather-style bandwidth: each rank receives (nranks-1) chunks of
 * count elements; the bus factor for this pattern is 1. */
void HyperCubeGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  double bw = (double)(count * typesize * (nranks - 1)) / 1.0E9 / sec;
  *algBw = bw;
  *busBw = bw;
}
// One AllGather implemented as a log2(nRanks)-step hypercube exchange: after
// step k each rank holds a contiguous block of 2^(k+1) chunks, swapped with
// the partner rank^mask each step. Assumes nRanks is a power of two (the
// runner checks this before calling).
testResult_t HyperCubeRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  char* sbuff = (char*)sendbuff;
  char* rbuff = (char*)recvbuff;
  int nRanks;
  NCCLCHECK(ncclCommCount(comm, &nRanks));
  int rank;
  NCCLCHECK(ncclCommUserRank(comm, &rank));
  size_t rankSize = count * wordSize(type);

  // Seed our own chunk into its slot of the receive buffer (skip when the
  // in-place layout already has it there).
  if (rbuff+rank*rankSize != sbuff) HIPCHECK(hipMemcpyAsync(rbuff+rank*rankSize, sbuff, rankSize, hipMemcpyDeviceToDevice, stream));

  // Hypercube AllGather
  for (int mask=1; mask<nRanks; mask<<=1) {
    NCCLCHECK(ncclGroupStart());
    // s: start of the mask-sized block this rank currently owns;
    // r: start of the partner's block, received this step.
    int s = rank & ~(mask-1);
    int r = s ^ mask;
    NCCLCHECK(ncclSend(rbuff+s*rankSize, count*mask, type, rank^mask, comm, stream));
    NCCLCHECK(ncclRecv(rbuff+r*rankSize, count*mask, type, rank^mask, comm, stream));
    NCCLCHECK(ncclGroupEnd());
  }
  return testSuccess;
}
// HyperCube test vtable consumed by the generic driver.
struct testColl hyperCubeTest = {
  "HyperCube",
  HyperCubeGetCollByteCount,
  HyperCubeInitData,
  HyperCubeGetBw,
  HyperCubeRunColl
};

// Buffer-size query: delegate to HyperCubeGetCollByteCount, dropping the
// outputs the driver does not need here.
void HyperCubeGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
  HyperCubeGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
// Sweep the requested (or all) datatypes; the hypercube exchange only works
// for a power-of-two rank count, so other geometries are skipped.
testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &hyperCubeTest;

  ncclDataType_t* typeList = &type;
  const char** typeNameList = &typeName;
  int nTypes = 1;
  if ((int)type == -1) {
    nTypes = test_typenum;
    typeList = test_types;
    typeNameList = test_typenames;
  }

  // Power-of-two check: exactly one bit set.
  int nRanks = args->nProcs*args->nThreads*args->nGpus;
  if (!(nRanks && !(nRanks & (nRanks - 1)))) {
    printf("nRanks %d is not a power of 2, skipping\n", nRanks);
    return testSuccess;
  }

  for (int t = 0; t < nTypes; t++) {
    TESTCHECK(TimeTest(args, typeList[t], typeNameList[t], (ncclRedOp_t)0, "", -1));
  }
  return testSuccess;
}
// Entry points exported to the generic test driver for the HyperCube binary.
struct testEngine ncclTestEngine = {
  HyperCubeGetBuffSize,
  HyperCubeRunTest
};
/*************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef NCCL1_COMPAT_H
#define NCCL1_COMPAT_H

// Compatibility shims so the tests (written against the NCCL 2.x size_t
// API) also build against NCCL 1.x, whose calls take int counts.
#ifndef NCCL_MAJOR // NCCL 1.x
#define NCCL_MAJOR 1
#define NCCL_MINOR 0

#define ncclNumOps nccl_NUM_OPS
#define ncclNumTypes nccl_NUM_TYPES

// NCCL 1.x has no group semantics; make the group calls no-ops.
static ncclResult_t ncclGroupStart() { return ncclSuccess; }
static ncclResult_t ncclGroupEnd() { return ncclSuccess; }

// NCCL 1.x counts are int; reject sizes that do not fit.
#define CHECKCOUNT(count) if (count > INT_MAX) return ncclInvalidArgument;

// size_t-count overloads forwarding to the library's int-count entry points.
// NOTE(review): this relies on C++ overload resolution picking the library's
// int prototype for the inner call (and, for ncclAllGather, on the 1.x
// argument order sendbuff/count/type/recvbuff) — confirm against the NCCL
// 1.x headers.
static ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
    ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  CHECKCOUNT(count);
  return ncclReduce(sendbuff, recvbuff, (int)count, datatype, op, root, comm, stream);
}
static ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, hipStream_t stream) {
  CHECKCOUNT(count);
  return ncclAllReduce(sendbuff, recvbuff, (int)count, datatype, op, comm, stream);
}
static ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
    ncclComm_t comm, hipStream_t stream) {
  CHECKCOUNT(count);
  return ncclBcast(buff, (int)count, datatype, root, comm, stream);
}
static ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
    hipStream_t stream) {
  CHECKCOUNT(recvcount);
  return ncclReduceScatter(sendbuff, recvbuff, (int)recvcount, datatype, op, comm, stream);
}
static ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
    ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
  CHECKCOUNT(sendcount);
  return ncclAllGather(sendbuff, (int)sendcount, datatype, recvbuff, comm, stream);
}
#endif

#endif
/**
* MIT License
*
* Copyright 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/*!\file
* \brief rccl_bfloat16.h provides struct for rccl_bfloat16 typedef
*/
#ifndef _RCCL_BFLOAT16_H_
#define _RCCL_BFLOAT16_H_
#if __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
// If this is a C compiler, C++ compiler below C++11, or a host-only compiler, we only
// include a minimal definition of rccl_bfloat16
#include <stdint.h>
/*! \brief Struct to represent a 16 bit brain floating point number. */
// Layout-compatible with the full C++ definition below (single uint16_t).
typedef struct
{
    uint16_t data;
} rccl_bfloat16;
#else // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__) && !defined(__HIP_PLATFORM_HCC__))
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <hip/hip_runtime.h>
#include <ostream>
#include <type_traits>
// 16-bit brain-float: the top 16 bits of an IEEE-754 binary32 value.
// Conversions to/from float are bit manipulations on the underlying uint16_t.
struct rccl_bfloat16
{
    uint16_t data;

    // Tag type selecting the truncating (non-rounding) constructor.
    enum truncate_t
    {
        truncate
    };

    __host__ __device__ rccl_bfloat16() = default;

    // round upper 16 bits of IEEE float to convert to bfloat16
    explicit __host__ __device__ rccl_bfloat16(float f)
        : data(float_to_bfloat16(f))
    {
    }

    // Truncating conversion: drop the low 16 bits (preserving SNaN).
    explicit __host__ __device__ rccl_bfloat16(float f, truncate_t)
        : data(truncate_float_to_bfloat16(f))
    {
    }

    // zero extend lower 16 bits of bfloat16 to convert to IEEE float
    __host__ __device__ operator float() const
    {
        union
        {
            uint32_t int32;
            float    fp32;
        } u = {uint32_t(data) << 16};
        return u.fp32;
    }

private:
    // Round-to-nearest-even conversion; see the long comment below for the
    // carry/overflow cases (subnormal→normal, mantissa overflow→Inf).
    static __host__ __device__ uint16_t float_to_bfloat16(float f)
    {
        union
        {
            float    fp32;
            uint32_t int32;
        } u = {f};
        if(~u.int32 & 0x7f800000)
        {
            // When the exponent bits are not all 1s, then the value is zero, normal,
            // or subnormal. We round the bfloat16 mantissa up by adding 0x7FFF, plus
            // 1 if the least significant bit of the bfloat16 mantissa is 1 (odd).
            // This causes the bfloat16's mantissa to be incremented by 1 if the 16
            // least significant bits of the float mantissa are greater than 0x8000,
            // or if they are equal to 0x8000 and the least significant bit of the
            // bfloat16 mantissa is 1 (odd). This causes it to be rounded to even when
            // the lower 16 bits are exactly 0x8000. If the bfloat16 mantissa already
            // has the value 0x7f, then incrementing it causes it to become 0x00 and
            // the exponent is incremented by one, which is the next higher FP value
            // to the unrounded bfloat16 value. When the bfloat16 value is subnormal
            // with an exponent of 0x00 and a mantissa of 0x7F, it may be rounded up
            // to a normal value with an exponent of 0x01 and a mantissa of 0x00.
            // When the bfloat16 value has an exponent of 0xFE and a mantissa of 0x7F,
            // incrementing it causes it to become an exponent of 0xFF and a mantissa
            // of 0x00, which is Inf, the next higher value to the unrounded value.
            u.int32 += 0x7fff + ((u.int32 >> 16) & 1); // Round to nearest, round to even
        }
        else if(u.int32 & 0xffff)
        {
            // When all of the exponent bits are 1, the value is Inf or NaN.
            // Inf is indicated by a zero mantissa. NaN is indicated by any nonzero
            // mantissa bit. Quiet NaN is indicated by the most significant mantissa
            // bit being 1. Signaling NaN is indicated by the most significant
            // mantissa bit being 0 but some other bit(s) being 1. If any of the
            // lower 16 bits of the mantissa are 1, we set the least significant bit
            // of the bfloat16 mantissa, in order to preserve signaling NaN in case
            // the bloat16's mantissa bits are all 0.
            u.int32 |= 0x10000; // Preserve signaling NaN
        }
        return uint16_t(u.int32 >> 16);
    }

    // Truncate instead of rounding, preserving SNaN
    static __host__ __device__ uint16_t truncate_float_to_bfloat16(float f)
    {
        union
        {
            float    fp32;
            uint32_t int32;
        } u = {f};
        return uint16_t(u.int32 >> 16) | (!(~u.int32 & 0x7f800000) && (u.int32 & 0xffff));
    }
};
// C-compatible mirror of rccl_bfloat16, used only to verify below that the
// C++ class keeps the same size and layout.
typedef struct
{
    uint16_t data;
} rccl_bfloat16_public;

static_assert(std::is_standard_layout<rccl_bfloat16>{},
              "rccl_bfloat16 is not a standard layout type, and thus is "
              "incompatible with C.");

static_assert(std::is_trivial<rccl_bfloat16>{},
              "rccl_bfloat16 is not a trivial type, and thus is "
              "incompatible with C.");

static_assert(sizeof(rccl_bfloat16) == sizeof(rccl_bfloat16_public)
                  && offsetof(rccl_bfloat16, data) == offsetof(rccl_bfloat16_public, data),
              "internal rccl_bfloat16 does not match public rccl_bfloat16");

// Stream output: print as the equivalent float.
inline std::ostream& operator<<(std::ostream& os, const rccl_bfloat16& bf16)
{
    return os << float(bf16);
}
// Unary plus/minus. Negation flips the sign bit directly — no float
// round-trip, so it is exact for every value including NaN.
inline __host__ __device__ rccl_bfloat16 operator+(rccl_bfloat16 a)
{
    return a;
}
inline __host__ __device__ rccl_bfloat16 operator-(rccl_bfloat16 a)
{
    a.data ^= 0x8000;
    return a;
}

// Binary arithmetic: computed in float and rounded back to bfloat16.
inline __host__ __device__ rccl_bfloat16 operator+(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return rccl_bfloat16(float(a) + float(b));
}
inline __host__ __device__ rccl_bfloat16 operator-(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return rccl_bfloat16(float(a) - float(b));
}
inline __host__ __device__ rccl_bfloat16 operator*(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return rccl_bfloat16(float(a) * float(b));
}
inline __host__ __device__ rccl_bfloat16 operator/(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return rccl_bfloat16(float(a) / float(b));
}

// Comparisons: < and == compare as float; the rest are derived from them,
// so NaN compares unordered exactly as it does for float.
inline __host__ __device__ bool operator<(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return float(a) < float(b);
}
inline __host__ __device__ bool operator==(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return float(a) == float(b);
}
inline __host__ __device__ bool operator>(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return b < a;
}
inline __host__ __device__ bool operator<=(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return !(a > b);
}
inline __host__ __device__ bool operator!=(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return !(a == b);
}
inline __host__ __device__ bool operator>=(rccl_bfloat16 a, rccl_bfloat16 b)
{
    return !(a < b);
}

// Compound assignment, defined in terms of the binary operators above.
inline __host__ __device__ rccl_bfloat16& operator+=(rccl_bfloat16& a, rccl_bfloat16 b)
{
    return a = a + b;
}
inline __host__ __device__ rccl_bfloat16& operator-=(rccl_bfloat16& a, rccl_bfloat16 b)
{
    return a = a - b;
}
inline __host__ __device__ rccl_bfloat16& operator*=(rccl_bfloat16& a, rccl_bfloat16 b)
{
    return a = a * b;
}
inline __host__ __device__ rccl_bfloat16& operator/=(rccl_bfloat16& a, rccl_bfloat16 b)
{
    return a = a / b;
}

// Pre/post increment and decrement (step by 1.0).
inline __host__ __device__ rccl_bfloat16& operator++(rccl_bfloat16& a)
{
    return a += rccl_bfloat16(1.0f);
}
inline __host__ __device__ rccl_bfloat16& operator--(rccl_bfloat16& a)
{
    return a -= rccl_bfloat16(1.0f);
}
inline __host__ __device__ rccl_bfloat16 operator++(rccl_bfloat16& a, int)
{
    rccl_bfloat16 orig = a;
    ++a;
    return orig;
}
inline __host__ __device__ rccl_bfloat16 operator--(rccl_bfloat16& a, int)
{
    rccl_bfloat16 orig = a;
    --a;
    return orig;
}
// std:: specializations so generic code can classify and call math functions
// on rccl_bfloat16. Classification works on the raw bits: exponent 0x7f80
// all-ones with zero/nonzero mantissa distinguishes Inf from NaN.
namespace std
{
    constexpr __host__ __device__ bool isinf(rccl_bfloat16 a)
    {
        return !(~a.data & 0x7f80) && !(a.data & 0x7f);
    }
    constexpr __host__ __device__ bool isnan(rccl_bfloat16 a)
    {
        return !(~a.data & 0x7f80) && +(a.data & 0x7f);
    }
    // True for both +0 and -0 (ignores the sign bit).
    constexpr __host__ __device__ bool iszero(rccl_bfloat16 a)
    {
        return !(a.data & 0x7fff);
    }

    // Transcendentals: evaluate in float, round back to bfloat16.
    inline rccl_bfloat16 sin(rccl_bfloat16 a)
    {
        return rccl_bfloat16(sinf(float(a)));
    }
    inline rccl_bfloat16 cos(rccl_bfloat16 a)
    {
        return rccl_bfloat16(cosf(float(a)));
    }
}
#endif // __cplusplus < 201103L || (!defined(__HCC__) && !defined(__HIPCC__))
#endif // _RCCL_BFLOAT16_H_
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
/* Byte-count policy for Reduce: the full count is sent, reduced and received
 * on the root; both in-place offsets are zero. */
void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  *sendcount = count;
  *recvcount = count;
  *paramcount = count;
  *sendInplaceOffset = 0;
  *recvInplaceOffset = 0;
}
// Prepare buffers for Reduce: each rank contributes a (rep, rank)-seeded
// vector; only the root's expected buffer holds the reduced reference
// (non-root results are not checked).
testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
  size_t sendcount = args->sendBytes / wordSize(type);
  size_t recvcount = args->expectedBytes / wordSize(type);
  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;

  int k=0;
  for (int i=0; i<args->nGpus; i++) {
    HIPCHECK(hipSetDevice(args->gpus[i]));
    for (int l=0; l<args->nRanks; l++) {
      // Global rank owning buffer slot k.
      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
      // In-place Reduce reads and writes the receive buffer.
      void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
      TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
      HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
      if (rank == root) TESTCHECK(InitDataReduce(args->expected[k], recvcount, 0, type, op, rep, nranks));
      k++;
    }
    HIPCHECK(hipDeviceSynchronize());
  }
  return testSuccess;
}
/* Reduce moves count*typesize bytes end-to-end; algorithm and bus
 * bandwidth coincide (bus factor 1). */
void ReduceGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  double bw = (double)(count * typesize) / 1.0E9 / sec;
  *algBw = bw;
  *busBw = bw;
}
// Enqueue a single ncclReduce on the given stream.
testResult_t ReduceRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  NCCLCHECK(ncclReduce(sendbuff, recvbuff, count, type, op, root, comm, stream));
  return testSuccess;
}

// Reduce test vtable consumed by the generic driver.
struct testColl reduceTest = {
  "Reduce",
  ReduceGetCollByteCount,
  ReduceInitData,
  ReduceGetBw,
  ReduceRunColl
};
// Buffer-size query: delegate to ReduceGetCollByteCount, dropping the
// outputs the driver does not need here.
void ReduceGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
  ReduceGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
// Sweep the requested (or all) datatypes, reduction ops and roots, timing
// each combination.
testResult_t ReduceRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &reduceTest;

  // type == -1 means "sweep every registered datatype".
  ncclDataType_t* typeList = &type;
  const char** typeNameList = &typeName;
  int nTypes = 1;
  if ((int)type == -1) {
    nTypes = test_typenum;
    typeList = test_types;
    typeNameList = test_typenames;
  }

  // op == -1 means "sweep every registered reduction op".
  ncclRedOp_t* opList = &op;
  const char** opNameList = &opName;
  int nOps = 1;
  if ((int)op == -1) {
    nOps = test_opnum;
    opList = test_ops;
    opNameList = test_opnames;
  }

  // root == -1 means "try every rank as root".
  int firstRoot = root, lastRoot = root;
  if (root == -1) {
    firstRoot = 0;
    lastRoot = args->nProcs*args->nThreads*args->nGpus - 1;
  }

  for (int t = 0; t < nTypes; t++) {
    for (int o = 0; o < nOps; o++) {
      for (int r = firstRoot; r <= lastRoot; r++) {
        TESTCHECK(TimeTest(args, typeList[t], typeNameList[t], opList[o], opNameList[o], r));
      }
    }
  }
  return testSuccess;
}
// Entry points exported to the generic test driver for the Reduce binary.
struct testEngine ncclTestEngine = {
  ReduceGetBuffSize,
  ReduceRunTest
};
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <hip/hip_runtime.h>
#include "common.h"
#define ALIGN 4
/* Byte-count policy for ReduceScatter: the per-rank result (base) is
 * count/nranks rounded down to a multiple of ALIGN; the send buffer holds
 * base*nranks elements. In place, each rank's result lands at offset base
 * inside its send region. */
void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
  size_t base = (count/(ALIGN*nranks))*ALIGN;
  *paramcount = base;
  *recvcount = base;
  *sendcount = base*nranks;
  *recvInplaceOffset = base;
  *sendInplaceOffset = 0;
}
// Prepare buffers for ReduceScatter: each rank contributes a (rep, rank)-
// seeded vector; each rank's expected buffer holds the reduced slice it will
// receive (offset rank*recvcount within the full reduction).
testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
  size_t sendcount = args->sendBytes / wordSize(type);
  size_t recvcount = args->expectedBytes / wordSize(type);
  int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;

  int k=0;
  for (int i=0; i<args->nGpus; i++) {
    HIPCHECK(hipSetDevice(args->gpus[i]));
    for (int l=0; l<args->nRanks; l++) {
      // Global rank owning buffer slot k.
      int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
      HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
      // In-place ReduceScatter reads and writes the receive buffer.
      void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
      TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
      HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
      TESTCHECK(InitDataReduce(args->expected[k], recvcount, rank*recvcount, type, op, rep, nranks));
      k++;
    }
    HIPCHECK(hipDeviceSynchronize());
  }
  return testSuccess;
}
/* ReduceScatter bandwidth: count is the per-rank receive count, so the total
 * payload is count*nranks elements; bus BW applies the (nranks-1)/nranks
 * correction since each rank's own contribution crosses no link. */
void ReduceScatterGetBw(size_t count, int typesize, double sec, double* algBw, double* busBw, int nranks) {
  double wireBw = (double)(count * typesize * nranks) / 1.0E9 / sec;
  double correction = ((double)(nranks - 1))/((double)nranks);
  *algBw = wireBw;
  *busBw = wireBw * correction;
}
// Enqueue a single ncclReduceScatter on the given stream (root is unused).
testResult_t ReduceScatterRunColl(void* sendbuff, void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, hipStream_t stream) {
  NCCLCHECK(ncclReduceScatter(sendbuff, recvbuff, count, type, op, comm, stream));
  return testSuccess;
}

// ReduceScatter test vtable consumed by the generic driver.
struct testColl reduceScatterTest = {
  "ReduceScatter",
  ReduceScatterGetCollByteCount,
  ReduceScatterInitData,
  ReduceScatterGetBw,
  ReduceScatterRunColl
};
// Buffer-size query: delegate to ReduceScatterGetCollByteCount, dropping the
// outputs the driver does not need here.
void ReduceScatterGetBuffSize(size_t *sendcount, size_t *recvcount, size_t count, int nranks) {
  size_t paramcount, sendInplaceOffset, recvInplaceOffset;
  ReduceScatterGetCollByteCount(sendcount, recvcount, &paramcount, &sendInplaceOffset, &recvInplaceOffset, count, nranks);
}
// Sweep the requested (or all) datatypes and reduction ops; ReduceScatter
// has no root, so TimeTest is always called with root = -1.
testResult_t ReduceScatterRunTest(struct threadArgs* args, int root, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName) {
  args->collTest = &reduceScatterTest;

  // type == -1 means "sweep every registered datatype".
  ncclDataType_t* typeList = &type;
  const char** typeNameList = &typeName;
  int nTypes = 1;
  if ((int)type == -1) {
    nTypes = test_typenum;
    typeList = test_types;
    typeNameList = test_typenames;
  }

  // op == -1 means "sweep every registered reduction op".
  ncclRedOp_t* opList = &op;
  const char** opNameList = &opName;
  int nOps = 1;
  if ((int)op == -1) {
    nOps = test_opnum;
    opList = test_ops;
    opNameList = test_opnames;
  }

  for (int t = 0; t < nTypes; t++) {
    for (int o = 0; o < nOps; o++) {
      TESTCHECK(TimeTest(args, typeList[t], typeNameList[t], opList[o], opNameList[o], -1));
    }
  }
  return testSuccess;
}
// Entry points exported to the generic test driver for the ReduceScatter
// binary.
struct testEngine ncclTestEngine = {
  ReduceScatterGetBuffSize,
  ReduceScatterRunTest
};
/* (GitLab page-footer artifacts captured with the diff — "Markdown is
   supported", attachment/discussion/registration prompts — not part of the
   sources.) */