Unverified Commit 023ce416 authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

TransferBench v1.63 (#193)



* Fixing issue with P memory type and use of DMA subexecutor
* CMake builds require explicit opt-in by setting NIC_EXEC_ENABLE=1
* Removing self-GPU check for DMA engine copies
* [BUILD] Add new GPU targets and switch to amdclang++ (#187)
* [BUILD] Add gfx950, gfx1150, and gfx1151 targets
* [BUILD] Modify CMake to use amdclang++
* [BUILD] Modify Makefile to use amdclang++
* [GIT] Updated CHANGELOG and .gitignore
* Adding HBM testing to healthcheck preset
* Tweaking HBM tests to occur first, and provide more info during VERBOSE=1
* Fixing timing reporting issues with NUM_SUBITERATIONS
* [BUILD] Simplify Makefile (#190)
* Combines steps for compilation and linking
* Does not rebuild if no change to source code

* Updating CHANGELOG

---------
Co-authored-by: Nilesh M Negi <Nilesh.Negi@amd.com>
parent a4709f4b
......@@ -7,3 +7,4 @@ _templates/
_toc.yml
docBin/
TransferBench
*.o
......@@ -3,6 +3,19 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.63.00
### Added
- Added `gfx950`, `gfx1150`, and `gfx1151` to default GPU targets list in CMake builds
### Modified
- Removing self-GPU check for DMA engine copies
- Switched to amdclang++ as primary compiler
- healthcheck preset adds HBM testing and support for more MI3XX variants
### Fixed
- Fixed issue when using "P" memory type and specific DMA subengines
- Fixed issue with subiteration timing reports
## v1.62.00
### Added
- Adding GFX_TEMPORAL to allow for use of non-temporal loads/stores
......
# Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
if (DEFINED ENV{ROCM_PATH})
set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE STRING "ROCm install directory")
else()
set(ROCM_PATH "/opt/rocm" CACHE STRING "ROCm install directory")
cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
# CMake Toolchain file to define compilers and path to ROCm
#==================================================================================================
if (NOT CMAKE_TOOLCHAIN_FILE)
set(CMAKE_TOOLCHAIN_FILE "${CMAKE_CURRENT_SOURCE_DIR}/toolchain-linux.cmake")
message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}")
endif()
cmake_minimum_required(VERSION 3.5)
project(TransferBench VERSION 1.62.00 LANGUAGES CXX)
set(VERSION_STRING "1.63.00")
project(TransferBench VERSION ${VERSION_STRING} LANGUAGES CXX)
## Load CMake modules
#==================================================================================================
include(CheckIncludeFiles)
include(CheckSymbolExists)
include(cmake/Dependencies.cmake) # rocm-cmake, rocm_local_targets
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
# Build options
#==================================================================================================
option(BUILD_LOCAL_GPU_TARGET_ONLY "Build only for GPUs detected on this machine" OFF)
option(ENABLE_NIC_EXEC "Enable RDMA NIC Executor in TransferBench" OFF)
# Default GPU architectures to build
#==================================================================================================
......@@ -16,15 +32,18 @@ set(DEFAULT_GPUS
gfx908
gfx90a
gfx942
gfx950
gfx1030
gfx1100
gfx1101
gfx1102
gfx1150
gfx1151
gfx1200
gfx1201)
# Build only for local GPU architecture
if (BUILD_LOCAL_GPU_TARGET_ONLY)
## Build only for local GPU architecture
if(BUILD_LOCAL_GPU_TARGET_ONLY)
message(STATUS "Building only for local GPU target")
if (COMMAND rocm_local_targets)
rocm_local_targets(DEFAULT_GPUS)
......@@ -33,10 +52,10 @@ if (BUILD_LOCAL_GPU_TARGET_ONLY)
endif()
endif()
# Determine which GPU architectures to build for
## Determine which GPU architectures to build for
set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.")
# Check if clang compiler can offload to GPU_TARGETS
## Check if clang compiler can offload to GPU_TARGETS
if (COMMAND rocm_check_target_ids)
message(STATUS "Checking for ROCm support for GPU targets: " "${GPU_TARGETS}")
rocm_check_target_ids(SUPPORTED_GPUS TARGETS ${GPU_TARGETS})
......@@ -45,50 +64,124 @@ else()
set(SUPPORTED_GPUS ${DEFAULT_GPUS})
endif()
set(COMPILING_TARGETS "${SUPPORTED_GPUS}" CACHE STRING "GPU targets to compile for.")
message(STATUS "Compiling for ${COMPILING_TARGETS}")
set(GPU_TARGETS "${SUPPORTED_GPUS}")
message(STATUS "Compiling for ${GPU_TARGETS}")
foreach(target ${COMPILING_TARGETS})
list(APPEND static_link_flags --offload-arch=${target})
endforeach()
list(JOIN static_link_flags " " flags_str)
set( CMAKE_CXX_FLAGS "${flags_str} ${CMAKE_CXX_FLAGS}")
## NOTE: Reload rocm-cmake in order to update GPU_TARGETS
include(cmake/Dependencies.cmake) # Reloading to use desired GPU_TARGETS instead of defaults
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib")
include_directories(${ROCM_PATH}/include)
find_library(IBVERBS_LIBRARY ibverbs)
find_path(IBVERBS_INCLUDE_DIR infiniband/verbs.h)
if (DEFINED ENV{DISABLE_NIC_EXEC})
message(STATUS "Disabling NIC Executor support")
elseif(IBVERBS_LIBRARY AND IBVERBS_INCLUDE_DIR)
message(STATUS "Found ibverbs: ${IBVERBS_LIBRARY}. Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable")
add_definitions(-DNIC_EXEC_ENABLED)
link_libraries(ibverbs)
# Check for required dependencies
#==================================================================================================
## Try to establish ROCM_PATH (for find_package)
if(NOT DEFINED ROCM_PATH)
# Guess default location
set(ROCM_PATH "/opt/rocm")
message(WARNING "Unable to find ROCM_PATH: Falling back to ${ROCM_PATH}")
else()
if (NOT IBVERBS_LIBRARY)
message(STATUS "ROCM_PATH found: ${ROCM_PATH}")
endif()
set(ENV{ROCM_PATH} ${ROCM_PATH})
## Set CMAKE flags
if (NOT DEFINED CMAKE_CXX_STANDARD)
set(CMAKE_CXX_STANDARD 17)
endif()
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
list(APPEND CMAKE_PREFIX_PATH # Add ROCM_PATH to CMake search paths for finding HIP / HSA
${ROCM_PATH}
${ROCM_PATH}/llvm
${ROCM_PATH}/hip
/opt/rocm
/opt/rocm/llvm
/opt/rocm/hip)
## Check for HIP
find_package(hip REQUIRED CONFIG PATHS ${CMAKE_PREFIX_PATH})
message(STATUS "HIP compiler: ${HIP_COMPILER}")
## Ensuring that CXX compiler meets expectations
if(NOT (("${CMAKE_CXX_COMPILER}" MATCHES ".*hipcc") OR ("${CMAKE_CXX_COMPILER}" MATCHES ".*clang\\+\\+")))
message(FATAL_ERROR "On ROCm platform 'hipcc' or HIP-aware Clang must be used as C++ compiler.")
endif()
## Check for Threads
find_package(Threads REQUIRED)
set(THREADS_PREFER_PTHREAD_FLAG ON)
## Check for numa support
find_library(NUMA_LIBRARY numa)
find_path(NUMA_INCLUDE_DIR numa.h)
if(NUMA_LIBRARY AND NUMA_INCLUDE_DIR)
add_library(numa SHARED IMPORTED)
set_target_properties(numa PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${NUMA_INCLUDE_DIR}" IMPORTED_LOCATION "${NUMA_LIBRARY}" INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${NUMA_INCLUDE_DIR}")
endif()
## Check for hsa support
find_library(HSA_LIBRARY hsa-runtime64 PATHS ${ROCM_PATH} ${ROCM_PATH}/lib)
find_path(HSA_INCLUDE_DIR hsa.h PATHS ${ROCM_PATH}/include ${ROCM_PATH}/include/hsa)
if(HSA_LIBRARY AND HSA_INCLUDE_DIR)
add_library(hsa-runtime64 SHARED IMPORTED)
set_target_properties(hsa-runtime64 PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${HSA_INCLUDE_DIR}" IMPORTED_LOCATION "${HSA_LIBRARY}" INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HSA_INCLUDE_DIR}")
endif()
## Check for infiniband verbs support
if(DEFINED ENV{ENABLE_NIC_EXEC} OR DEFINED ENABLE_NIC_EXEC)
message(STATUS "For CMake builds, NIC executor also requires explicit opt-in by setting CMake flag -DENABLE_NIC_EXEC=1 or environment flag ENABLE_NIC_EXEC=1")
find_library(IBVERBS_LIBRARY ibverbs)
find_path(IBVERBS_INCLUDE_DIR infiniband/verbs.h)
if(IBVERBS_LIBRARY AND IBVERBS_INCLUDE_DIR)
add_library(ibverbs SHARED IMPORTED)
set_target_properties(ibverbs PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${IBVERBS_INCLUDE_DIR}" IMPORTED_LOCATION "${IBVERBS_LIBRARY}" INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${IBVERBS_INCLUDE_DIR}")
set(IBVERBS_FOUND 1)
message(STATUS "Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable")
else()
if(NOT IBVERBS_LIBRARY)
message(WARNING "IBVerbs library not found")
elseif (NOT IBVERBS_INCLUDE_DIR)
elseif(NOT IBVERBS_INCLUDE_DIR)
message(WARNING "infiniband/verbs.h not found")
endif()
message(WARNING "Building without NIC executor support. To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed")
endif()
else()
message(STATUS "Disabling NIC Executor support")
message(WARNING "For CMake builds, NIC executor requires explicit opt-in by setting ENABLE_NIC_EXEC=1")
endif()
link_libraries(numa hsa-runtime64 pthread)
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY .)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY .)
add_executable(TransferBench src/client/Client.cpp)
target_include_directories(TransferBench PRIVATE src/header src/client src/client/Presets)
find_package(ROCM 0.8 REQUIRED PATHS ${ROCM_PATH})
include(ROCMInstallTargets)
include(ROCMCreatePackage)
set(ROCMCHECKS_WARN_TOOLCHAIN_VAR OFF)
target_include_directories(TransferBench PRIVATE src/header)
target_include_directories(TransferBench PRIVATE src/client)
target_include_directories(TransferBench PRIVATE src/client/Presets)
target_include_directories(TransferBench PRIVATE ${NUMA_INCLUDE_DIR})
target_include_directories(TransferBench PRIVATE ${HSA_INCLUDE_DIR})
if(IBVERBS_FOUND)
target_include_directories(TransferBench PRIVATE ${IBVERBS_INCLUDE_DIR})
target_link_libraries(TransferBench PRIVATE ${IBVERBS_LIBRARY})
target_compile_definitions(TransferBench PRIVATE NIC_EXEC_ENABLED)
endif()
set(PACKAGE_NAME TB)
set(LIBRARY_NAME TransferBench)
target_link_libraries(TransferBench PRIVATE -fgpu-rdc) # Required when linking relocatable device code
target_link_libraries(TransferBench PRIVATE Threads::Threads)
target_link_libraries(TransferBench INTERFACE hip::host)
target_link_libraries(TransferBench PRIVATE hip::device)
target_link_libraries(TransferBench PRIVATE dl)
target_link_libraries(TransferBench PRIVATE ${NUMA_LIBRARY})
target_link_libraries(TransferBench PRIVATE ${HSA_LIBRARY})
rocm_install(TARGETS TransferBench COMPONENT devel)
rocm_setup_version(VERSION ${VERSION_STRING})
rocm_package_add_dependencies(DEPENDS numactl hsa-rocr)
# Package specific CPACK vars
rocm_package_add_dependencies(DEPENDS "numactl" "hsa-rocr")
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md")
set(CPACK_RPM_PACKAGE_LICENSE "MIT")
set(PACKAGE_NAME TB)
set(LIBRARY_NAME TransferBench)
rocm_create_package(
NAME ${LIBRARY_NAME}
......
......@@ -6,57 +6,81 @@
ROCM_PATH ?= /opt/rocm
CUDA_PATH ?= /usr/local/cuda
HIPCC=$(ROCM_PATH)/bin/hipcc
NVCC=$(CUDA_PATH)/bin/nvcc
HIPCC ?= $(ROCM_PATH)/bin/amdclang++
NVCC ?= $(CUDA_PATH)/bin/nvcc
# Compile TransferBenchCuda if nvcc detected
ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
# This can be a space separated string of multiple GPU targets
# Default is the native GPU target
GPU_TARGETS ?= native
DEBUG ?= 0
ifeq ($(filter clean,$(MAKECMDGOALS)),)
# Compile TransferBenchCuda if nvcc detected
ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
EXE=TransferBenchCuda
CXX=$(NVCC)
else
else
EXE=TransferBench
ifeq ("$(shell test -e $(HIPCC) && echo found)", "found")
CXX=$(HIPCC)
endif
else ifeq ("$(shell test -e $(ROCM_PATH)/bin/hipcc && echo found)", "found")
CXX=$(ROCM_PATH)/bin/hipcc
$(warning "Could not find $(HIPCC). Using fallback to $(CXX)")
else
$(error "Could not find $(HIPCC) or $(ROCM_PATH)/bin/hipcc. Check if the path is correct if you want to build $(EXE)")
endif
GPU_TARGETS_FLAGS = $(foreach target,$(GPU_TARGETS),"--offload-arch=$(target)")
endif
CXXFLAGS = -I$(ROCM_PATH)/include -I$(ROCM_PATH)/include/hip -I$(ROCM_PATH)/include/hsa
HIPLDFLAGS= -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64 -lamdhip64
HIPFLAGS = -x hip -D__HIP_PLATFORM_AMD__ -D__HIPCC__ $(GPU_TARGETS_FLAGS)
NVFLAGS = -x cu -lnuma -arch=native
ifeq ($(DEBUG), 0)
COMMON_FLAGS += -O3
else
COMMON_FLAGS += -O0 -g -ggdb3
endif
COMMON_FLAGS += -I./src/header -I./src/client -I./src/client/Presets
CXXFLAGS = -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
NVFLAGS = -x cu -lnuma -arch=native
COMMON_FLAGS = -O3 -I./src/header -I./src/client -I./src/client/Presets
LDFLAGS += -lpthread
# Compile RDMA executor if
# 1) DISABLE_NIC_EXEC is not set to 1
# 2) IBVerbs is found in the Dynamic Linker cache
# 3) infiniband/verbs.h is found in the default include path
NIC_ENABLED = 0
ifneq ($(DISABLE_NIC_EXEC),1)
LDFLAGS += -lpthread
# Compile RDMA executor if
# 1) DISABLE_NIC_EXEC is not set to 1
# 2) IBVerbs is found in the Dynamic Linker cache
# 3) infiniband/verbs.h is found in the default include path
DISABLE_NIC_EXEC ?= 0
ifneq ($(DISABLE_NIC_EXEC),1)
ifeq ("$(shell ldconfig -p | grep -c ibverbs)", "0")
$(info lib IBVerbs not found)
else ifeq ("$(shell echo '#include <infiniband/verbs.h>' | $(CXX) -E - 2>/dev/null | grep -c 'infiniband/verbs.h')", "0")
$(info infiniband/verbs.h not found)
else
LDFLAGS += -libverbs -DNIC_EXEC_ENABLED
NVFLAGS += -libverbs -DNIC_EXEC_ENABLED
CXXFLAGS += -DNIC_EXEC_ENABLED
LDFLAGS += -libverbs
NIC_ENABLED = 1
endif
ifeq ($(NIC_ENABLED), 0)
$(info Building without NIC executor support)
$(info To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed)
else
$(info Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable)
endif
endif
endif
.PHONY : all clean
all: $(EXE)
TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp") NicStatus
$(HIPCC) $(CXXFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)
TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) $(COMMON_FLAGS) $< -o $@ $(HIPLDFLAGS) $(LDFLAGS)
TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp") NicStatus
TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
$(NVCC) $(NVFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)
clean:
rm -f *.o ./TransferBench ./TransferBenchCuda
rm -f ./TransferBench ./TransferBenchCuda
NicStatus:
ifeq ($(NIC_ENABLED), 1)
$(info Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable)
else
$(info Building without NIC executor support)
endif
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# Test dependencies
include(FetchContent)
set(ROCM_WARN_TOOLCHAIN_VAR OFF CACHE BOOL "")
# Find or download/install rocm-cmake project
# Resolution order: installed ROCmCMakeBuildTools -> deprecated ROCM package
# -> download rocm-cmake from GitHub via FetchContent.
find_package(ROCmCMakeBuildTools 0.11.0 CONFIG QUIET PATHS "${ROCM_PATH}")
if((NOT ROCmCMakeBuildTools_FOUND) OR INSTALL_DEPENDENCIES)
message(STATUS "ROCmCMakeBuildTools not found. Checking for ROCM (deprecated)")
find_package(ROCM 0.7.3 CONFIG QUIET PATHS "${ROCM_PATH}") # deprecated fallback
if((NOT ROCM_FOUND) OR INSTALL_DEPENDENCIES)
message(STATUS "ROCM (deprecated) not found. Downloading and building ROCmCMakeBuildTools")
set(PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern)
set(rocm_cmake_tag "rocm-6.4.0" CACHE STRING "rocm-cmake tag to download")
FetchContent_Declare(
rocm-cmake
GIT_REPOSITORY https://github.com/ROCm/rocm-cmake.git
GIT_TAG ${rocm_cmake_tag}
# Bogus subdir on purpose: fetch sources only, do not add to the build
SOURCE_SUBDIR "DISABLE ADDING TO BUILD"
)
FetchContent_MakeAvailable(rocm-cmake)
message(STATUS "rocm-cmake_SOURCE_DIR: ${rocm-cmake_SOURCE_DIR}")
find_package(ROCmCMakeBuildTools CONFIG REQUIRED NO_DEFAULT_PATH PATHS "${rocm-cmake_SOURCE_DIR}")
message(STATUS "Found ROCmCmakeBuildTools version: ${ROCmCMakeBuildTools_VERSION}")
endif()
elseif(ROCmCMakeBuildTools_FOUND)
message(STATUS "Found ROCmCmakeBuildTools version: ${ROCmCMakeBuildTools_VERSION}")
endif()
# Find available local ROCM targets
# NOTE: This will eventually be part of ROCm-CMake and should be removed at that time
#
# Queries rocm_agent_enumerator for the GPU agents present on this machine.
#   VARIABLE - name of the caller-scope variable that receives the de-duplicated
#              list of gfx targets, or NOTFOUND when none could be detected.
function(rocm_local_targets VARIABLE)
  set(${VARIABLE} "NOTFOUND" PARENT_SCOPE)
  find_program(_rocm_agent_enumerator rocm_agent_enumerator HINTS /opt/rocm/bin ENV ROCM_PATH)
  # find_program() yields a false value on failure; no need to match "-NOTFOUND" by string
  if(_rocm_agent_enumerator)
    execute_process(
      COMMAND "${_rocm_agent_enumerator}"
      RESULT_VARIABLE _found_agents
      OUTPUT_VARIABLE _rocm_agents
      ERROR_QUIET
      # Without this the trailing newline becomes an empty list element below
      OUTPUT_STRIP_TRAILING_WHITESPACE
    )
    if(_found_agents EQUAL 0)
      string(REPLACE "\n" ";" _rocm_agents "${_rocm_agents}")
      unset(result)
      foreach(agent IN LISTS _rocm_agents)
        # gfx000 is the CPU agent and is not a compilable GPU target
        if(NOT agent STREQUAL "gfx000")
          list(APPEND result "${agent}")
        endif()
      endforeach()
      if(result)
        list(REMOVE_DUPLICATES result)
        set(${VARIABLE} "${result}" PARENT_SCOPE)
      endif()
    endif()
  endif()
endfunction()
include(ROCMSetupVersion)
include(ROCMCreatePackage)
include(ROCMInstallTargets)
include(ROCMPackageConfigHelpers)
include(ROCMInstallSymlinks)
include(ROCMCheckTargetIds)
include(ROCMClients)
include(ROCMHeaderWrapper)
......@@ -131,9 +131,8 @@ public:
int defaultGfxUnroll = 4;
if (archName == "gfx906") defaultGfxUnroll = 8;
else if (archName == "gfx90a") defaultGfxUnroll = 8;
else if (archName == "gfx940") defaultGfxUnroll = 6;
else if (archName == "gfx941") defaultGfxUnroll = 6;
else if (archName == "gfx942") defaultGfxUnroll = 4;
else if (archName == "gfx950") defaultGfxUnroll = 4;
alwaysValidate = GetEnvVar("ALWAYS_VALIDATE" , 0);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
......
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......@@ -20,57 +20,150 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// Identifiers for the HBM bandwidth tests run by the healthcheck preset.
// NOTE(review): this declares a variable ("HbmTests") of an anonymous enum
// type; only the enumerator values appear to be used — confirm.
enum {
HBM_READ = 0,
HBM_WRITE = 1,
HBM_COPY = 2,
HBM_ADD = 3,
NUM_HBM_TESTS = 4
} HbmTests;
void HealthCheckPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
struct HbmTestConfig
{
// Check for supported platforms
#if defined(__NVCC__)
printf("[WARN] healthcheck preset not supported on NVIDIA hardware\n");
return;
#endif
bool hasFail = false;
// Force use of single stream
ev.useSingleStream = 1;
std::string name;
int numInputs;
int numOutputs;
};
// Per-test buffer layout, indexed by the HBM_* enumerators:
// display name, number of source buffers, number of destination buffers.
HbmTestConfig HbmTestConfigs[NUM_HBM_TESTS] =
{ {"READ", 1, 0},
{"WRITE", 0, 1},
{"COPY", 1, 1},
{"ADD", 2, 1}
};
// Pass/fail thresholds and kernel tuning parameters for one hardware model.
// All bandwidth limits are in GB/s and are multiplied by SFACTOR before use.
typedef struct
{
double unidirHostToDeviceCopyLimit;  // minimum host->device copy bandwidth
double unidirDeviceToHostCopyLimit;  // minimum device->host copy bandwidth
double bidirDmaCopyLimit;            // minimum summed bidirectional DMA bandwidth
int a2aUnrollFactor;                 // GFX kernel unroll for the all-to-all test
int a2aNumSubExecs;                  // subexecutors per all-to-all transfer
double a2aCopyLimit;                 // minimum per-pair all-to-all bandwidth
// HBM arrays are indexed by the HBM_* test ids (READ/WRITE/COPY/ADD)
int hbmBlockSize [NUM_HBM_TESTS];
int hbmUnrollFactor[NUM_HBM_TESTS];
int hbmTemporalMode[NUM_HBM_TESTS];
double hbmLimit [NUM_HBM_TESTS];
} TestConfig;
// Hardware models with tuned healthcheck thresholds: 8-GPU gfx942 systems
// with either 304 or 64 subexecutors (CUs) per GPU.
typedef enum
{
MODEL_08_GFX0942_304 = 0,
MODEL_08_GFX0942_064 = 1,
NUM_SUPPORTED_MODELS = 2
} ModelEnum;
// All limits are scaled by this factor
#define SFACTOR 0.97
// Thresholds for 8 x gfx942 with 304 subexecutors per GPU
// NOTE(review): presumably an MI300X-class part — confirm against DetectModel()
TestConfig Config_08_GFX0942_304 = {
.unidirHostToDeviceCopyLimit = 50,
.unidirDeviceToHostCopyLimit = 50,
.bidirDmaCopyLimit = 90,
.a2aUnrollFactor = 2,
.a2aNumSubExecs = 8,
.a2aCopyLimit = 45,
// HBM tuning/limits ordered READ, WRITE, COPY, ADD
.hbmBlockSize = { 384, 256, 320, 256},
.hbmUnrollFactor = { 7, 4, 8, 7},
.hbmTemporalMode = { 3, 3, 3, 3},
.hbmLimit = {4980, 4850, 2045, 1405},
};
// Thresholds for 8 x gfx942 with 64 subexecutors per GPU
// NOTE(review): presumably a smaller MI3XX variant — confirm
TestConfig Config_08_GFX0942_064 = {
.unidirHostToDeviceCopyLimit = 50,
.unidirDeviceToHostCopyLimit = 50,
.bidirDmaCopyLimit = 90,
.a2aUnrollFactor = 2,
.a2aNumSubExecs = 8,
.a2aCopyLimit = 45,
// HBM tuning/limits ordered READ, WRITE, COPY, ADD
.hbmBlockSize = { 448, 448, 448, 384},
.hbmUnrollFactor = { 8, 3, 8, 7},
.hbmTemporalMode = { 3, 3, 3, 3},
.hbmLimit = {4180, 2800, 1400, 1055},
};
// Model lookup table, indexed by ModelEnum values.
TestConfig TestConfigs[NUM_SUPPORTED_MODELS] =
{
Config_08_GFX0942_304,
Config_08_GFX0942_064,
};
TransferBench::TestResults results;
int DetectModel()
{
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
if (numGpuDevices != 8) {
printf("[WARN] healthcheck preset is currently only supported on 8-GPU MI300X hardware\n");
exit(1);
}
std::string archName = "";
int numSubExecutors = 0;
// Loop over all GPUs and determine if they are identical
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
// Check that arch name is identical
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, gpuId));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
if (!(archName == "gfx940" || archName == "gfx941" || archName == "gfx942"))
{
printf("[WARN] healthcheck preset is currently only supported on 8-GPU MI300X hardware\n");
std::string currArchName = fullName.substr(0, fullName.find(':'));
if (archName != "" && archName != currArchName) {
printf("[WARN] healthcheck preset is currently only supported when all GPUs are identical\n");
printf(" Detected both %s and %s\n", archName.c_str(), currArchName.c_str());
exit(1);
}
archName = currArchName;
// Check number of subexecutors
int currNumSubExecutors = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, gpuId});
if (numSubExecutors != 0 && numSubExecutors != currNumSubExecutors) {
printf("[WARN] healthcheck preset is currently only supported when all GPUs are identical\n");
printf(" Detected different subexecutor counts: %d and %d\n", numSubExecutors, currNumSubExecutors);
exit(1);
}
numSubExecutors = currNumSubExecutors;
}
// Pass limits
double udirLimit = getenv("LIMIT_UDIR") ? atof(getenv("LIMIT_UDIR")) : (int)(48 * 0.95);
double bdirLimit = getenv("LIMIT_BDIR") ? atof(getenv("LIMIT_BDIR")) : (int)(96 * 0.95);
double a2aLimit = getenv("LIMIT_A2A") ? atof(getenv("LIMIT_A2A")) : (int)(45 * 0.95);
// Classify based on detected configuration
if (numGpuDevices == 8) {
if (archName == "gfx942") {
switch (numSubExecutors) {
case 304: return MODEL_08_GFX0942_304;
case 64: return MODEL_08_GFX0942_064;
}
}
}
printf("[WARN] healthcheck preset is currently not supported on this hardware\n");
printf(" Detected %d x [%s] with [%d] subexecutors per GPU\n", numGpuDevices, archName.c_str(), numSubExecutors);
exit(1);
}
int TestUnidir(int modelId, bool verbose)
{
TestConfig const& testConfig = TestConfigs[modelId];
TransferBench::ConfigOptions cfg;
TransferBench::TestResults results;
// Run CPU to GPU
int hasFail = 0;
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
cfg.dma.useHsaCopy = 1;
// Run unidirectional read from CPU to GPU
printf("Testing unidirectional reads from CPU ");
// Run unidirectional host to device copy
printf("Testing unidirectional host to device copy%c", verbose ? '\n' : ' ');
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
double limit = testConfig.unidirHostToDeviceCopyLimit * SFACTOR;
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
if (!verbose) printf(".");
fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
......@@ -80,48 +173,43 @@ void HealthCheckPreset(EnvVars& ev,
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeDevice = {EXE_GPU_GFX, gpuId};
t.numBytes = 64*1024*1024;
t.exeDevice = {EXE_GPU_DMA, gpuId};
t.numBytes = 256*1024*1024;
t.srcs = {{MEM_CPU, memIndex}};
t.dsts = {};
t.dsts = {{MEM_GPU, gpuId}};
t.numSubExecs = 1;
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t.numSubExecs = cu;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
bestResult = std::max(bestResult, results.tfrResults[0].avgBandwidthGbPerSec);
double measuredBw = results.tfrResults[0].avgBandwidthGbPerSec;
if (measuredBw < limit) {
fails.push_back(std::make_pair(gpuId, measuredBw));
}
if (verbose) printf(" GPU %02d: Measured %6.2f Limit %6.2f\n", gpuId, measuredBw, limit);
} else {
PrintErrors(results.errResults);
}
if (results.tfrResults[0].avgBandwidthGbPerSec >= udirLimit) {
passed = true;
break;
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
hasFail = 1;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, limit);
}
}
}
// Run unidirectional write from GPU to CPU
printf("Testing unidirectional writes to CPU ");
// Run unidirectional device to host copy
printf("Testing unidirectional device to host copy%c", verbose ? '\n' : ' ');
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
double limit = testConfig.unidirDeviceToHostCopyLimit * SFACTOR;
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
if (!verbose) printf(".");
fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
......@@ -131,48 +219,53 @@ void HealthCheckPreset(EnvVars& ev,
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeDevice = {EXE_GPU_GFX, gpuId};
t.numBytes = 64*1024*1024;
t.srcs = {};
t.exeDevice = {EXE_GPU_DMA, gpuId};
t.numBytes = 256*1024*1024;
t.srcs = {{MEM_GPU, gpuId}};
t.dsts = {{MEM_CPU, memIndex}};
t.numSubExecs = 1;
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t.numSubExecs = cu;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
bestResult = std::max(bestResult, results.tfrResults[0].avgBandwidthGbPerSec);
double measuredBw = results.tfrResults[0].avgBandwidthGbPerSec;
if (measuredBw < limit) {
fails.push_back(std::make_pair(gpuId, measuredBw));
}
if (verbose) printf(" GPU %02d: Measured %6.2f Limit %6.2f\n", gpuId, measuredBw, limit);
} else {
PrintErrors(results.errResults);
}
if (results.tfrResults[0].avgBandwidthGbPerSec >= udirLimit) {
passed = true;
break;
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
hasFail = 1;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, limit);
}
}
}
return hasFail;
}
int TestBidir(int modelId, bool verbose)
{
TestConfig const& testConfig = TestConfigs[modelId];
TransferBench::ConfigOptions cfg;
int hasFail = 0;
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
// Run bidirectional tests
printf("Testing bidirectional reads + writes ");
printf("Testing bidirectional host<->device copies%c", verbose ? '\n' : ' ');
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
double limit = testConfig.bidirDmaCopyLimit * SFACTOR;
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
if (!verbose) printf(".");
fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
......@@ -184,62 +277,65 @@ void HealthCheckPreset(EnvVars& ev,
Transfer& t0 = transfers[0];
Transfer& t1 = transfers[1];
t0.exeDevice = {EXE_GPU_GFX, gpuId};
t0.numBytes = 64*1024*1024;
t0.srcs = {{MEM_CPU, memIndex}};
t0.dsts = {};
t0.exeDevice = {EXE_GPU_DMA, gpuId};
t0.numBytes = 256*1024*1024;
t0.srcs = {{MEM_GPU, gpuId}};
t0.dsts = {{MEM_CPU, memIndex}};
t0.numSubExecs = 1;
t1.exeDevice = {EXE_GPU_GFX, gpuId};
t1.numBytes = 64*1024*1024;
t1.srcs = {};
t1.dsts = {{MEM_CPU, memIndex}};
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t0.numSubExecs = cu;
t1.numSubExecs = cu;
t1.exeDevice = {EXE_GPU_DMA, gpuId};
t1.numBytes = 256*1024*1024;
t1.srcs = {{MEM_CPU, memIndex}};
t1.dsts = {{MEM_GPU, gpuId}};
t1.numSubExecs = 1;
TransferBench::TestResults results;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
double sum = (results.tfrResults[0].avgBandwidthGbPerSec +
double measuredBw = (results.tfrResults[0].avgBandwidthGbPerSec +
results.tfrResults[1].avgBandwidthGbPerSec);
bestResult = std::max(bestResult, sum);
if (sum >= bdirLimit) {
passed = true;
break;
if (measuredBw < limit) {
fails.push_back(std::make_pair(gpuId, measuredBw));
}
if (verbose) printf(" GPU %02d: Measured %6.2f Limit %6.2f\n", gpuId, measuredBw, limit);
} else {
PrintErrors(results.errResults);
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
hasFail = 1;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, bdirLimit);
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, limit);
}
}
}
return hasFail;
}
// Run XGMI tests:
printf("Testing all-to-all XGMI copies "); fflush(stdout);
int TestAllToAll(int modelId, bool verbose)
{
TestConfig const& testConfig = TestConfigs[modelId];
TransferBench::ConfigOptions cfg;
cfg.gfx.unrollFactor = testConfig.a2aUnrollFactor;
int numSubExecs = testConfig.a2aNumSubExecs;
int hasFail = 0;
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
printf("Testing all-to-all XGMI copies %c", verbose ? '\n' : ' '); fflush(stdout);
{
// Force GFX unroll to 2 for MI300
ev.gfxUnroll = 2;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
double limit = testConfig.a2aCopyLimit * SFACTOR;
std::vector<Transfer> transfers;
for (int i = 0; i < numGpuDevices; i++) {
for (int j = 0; j < numGpuDevices; j++) {
if (i == j) continue;
Transfer t;
t.numBytes = 64*1024*1024;
t.numSubExecs = 8;
t.numBytes = 256*1024*1024;
t.numSubExecs = numSubExecs;
t.exeDevice = {EXE_GPU_GFX, i};
t.srcs = {{MEM_GPU_FINE, i}};
t.dsts = {{MEM_GPU_FINE, j}};
......@@ -247,17 +343,18 @@ void HealthCheckPreset(EnvVars& ev,
}
}
std::vector<std::pair<std::pair<int,int>, double>> fails;
TransferBench::TestResults results;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
int transferIdx = 0;
for (int i = 0; i < numGpuDevices; i++) {
printf("."); fflush(stdout);
if (!verbose) printf("."); fflush(stdout);
for (int j = 0; j < numGpuDevices; j++) {
if (i == j) continue;
double bw = results.tfrResults[transferIdx].avgBandwidthGbPerSec;
if (bw < a2aLimit) {
if (bw < limit) {
fails.push_back(std::make_pair(std::make_pair(i,j), bw));
}
if (verbose) printf(" GPU %02d to GPU %02d: : Measured %6.2f Limit %6.2f\n", i, j, bw, limit);
transferIdx++;
}
}
......@@ -265,12 +362,111 @@ void HealthCheckPreset(EnvVars& ev,
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
hasFail = 1;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d to GPU %02d: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first.first, p.first.second, p.second, a2aLimit);
printf(" GPU %02d to GPU %02d: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first.first, p.first.second, p.second, limit);
}
}
}
exit(hasFail ? 1 : 0);
return hasFail;
}
// Runs the four local HBM bandwidth tests (READ/WRITE/COPY/ADD) on every GPU
// and compares each measured bandwidth against the per-model limit.
//
//   modelId - index into TestConfigs (a ModelEnum value)
//   verbose - print per-GPU measurements and kernel tuning info instead of dots
// Returns 1 if any GPU fell below its limit on any test, otherwise 0.
int TestHbmPerformance(int modelId, bool verbose)
{
  TestConfig const& testConfig = TestConfigs[modelId];
  int hasFail = 0;
  int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);

  char testname[50];
  for (int testId = 0; testId < NUM_HBM_TESTS; testId++) {
    // Apply per-test kernel tuning from the model's config
    TransferBench::ConfigOptions cfg;
    cfg.general.numIterations = 1000;
    cfg.general.numWarmups    = 50;
    cfg.gfx.blockSize         = testConfig.hbmBlockSize[testId];
    cfg.gfx.unrollFactor      = testConfig.hbmUnrollFactor[testId];
    cfg.gfx.temporalMode      = testConfig.hbmTemporalMode[testId];

    // snprintf (not sprintf) so a long test name can never overflow testname
    snprintf(testname, sizeof(testname), "Testing HBM performance [%s]",
             HbmTestConfigs[testId].name.c_str());
    if (verbose) printf("[Blocksize: %d Unroll: %d TemporalMode: %d]\n",
                        cfg.gfx.blockSize, cfg.gfx.unrollFactor, cfg.gfx.temporalMode);
    printf("%-42s%c", testname, verbose ? '\n' : ' ');
    fflush(stdout);

    int    numInputs  = HbmTestConfigs[testId].numInputs;
    int    numOutputs = HbmTestConfigs[testId].numOutputs;
    double limit      = testConfig.hbmLimit[testId] * SFACTOR;

    // One GPU-local transfer per device, sized proportionally to its CU count
    std::vector<std::pair<int, double>> fails;
    TransferBench::TestResults results;
    std::vector<Transfer> transfers;
    for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
      Transfer t;
      t.numSubExecs = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, gpuId});
      t.numBytes    = 16777216ULL * t.numSubExecs;  // 16 MiB per subexecutor
      t.exeDevice   = {EXE_GPU_GFX, gpuId};
      for (int i = 0; i < numInputs; i++)  t.srcs.push_back({MEM_GPU, gpuId});
      for (int i = 0; i < numOutputs; i++) t.dsts.push_back({MEM_GPU, gpuId});
      transfers.push_back(t);
    }

    if (TransferBench::RunTransfers(cfg, transfers, results)) {
      for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
        if (!verbose) printf(".");
        fflush(stdout);
        double measuredBw = results.tfrResults[gpuId].avgBandwidthGbPerSec;
        if (measuredBw < limit) {
          fails.push_back(std::make_pair(gpuId, measuredBw));
        }
        if (verbose) printf(" GPU %02d: Measured %6.2f Limit %6.2f\n", gpuId, measuredBw, limit);
      }
    } else {
      PrintErrors(results.errResults);
    }

    if (fails.size() == 0) {
      printf("PASS\n");
    } else {
      hasFail = 1;
      printf("FAIL (%lu test(s))\n", fails.size());
      for (auto p : fails) {
        printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, limit);
      }
    }
  }
  return hasFail;
}
void HealthCheckPreset(EnvVars& ev,
                       size_t const numBytesPerTransfer,
                       std::string const presetName)
{
  // Runs the experimental healthcheck suite (HBM, unidirectional,
  // bidirectional, and all-to-all bandwidth tests) and terminates the process:
  // exit code 1 when any test group reported a failure, 0 otherwise.
#if defined(__NVCC__)
  // Healthcheck relies on AMD-specific paths; bail out on NVIDIA builds
  printf("[WARN] healthcheck preset not supported on NVIDIA hardware\n");
  return;
#endif
  // Experimental-feature disclaimer banner
  printf("Disclaimer:\n");
  printf("==================================================================\n");
  printf("NOTE: This is an experimental feature and may be subject to change\n");
  printf(" Failures do not necessarily indicate hardware issues, as other factors\n");
  printf(" such as simultaneous workloads may influence results\n");
  printf("\n");

  // VERBOSE=1 makes the individual tests print per-GPU measurement details
  int const verboseMode = EnvVars::GetEnvVar("VERBOSE", 0);

  // Identify the hardware model so each test picks the matching limits
  int const modelId = DetectModel();

  // Run every test group and accumulate the number of failing groups
  int failCount = 0;
  failCount += TestHbmPerformance(modelId, verboseMode);
  failCount += TestUnidir(modelId, verboseMode);
  failCount += TestBidir(modelId, verboseMode);
  failCount += TestAllToAll(modelId, verboseMode);

  exit(failCount == 0 ? 0 : 1);
}
......@@ -66,7 +66,7 @@ namespace TransferBench
using std::set;
using std::vector;
constexpr char VERSION[] = "1.62";
constexpr char VERSION[] = "1.63";
/**
* Enumeration of supported Executor types
......@@ -516,7 +516,7 @@ namespace TransferBench
//==========================================================================================
// Macro for collecting CU/SM GFX kernel is running on
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1200__) || defined(__gfx1201__)
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1150__) || defined(__gfx1151__) || defined(__gfx1200__) || defined(__gfx1201__)
#define GetHwId(hwId) hwId = 0
#elif defined(__NVCC__)
#define GetHwId(hwId) asm("mov.u32 %0, %smid;" : "=r"(hwId))
......@@ -525,7 +525,7 @@ namespace TransferBench
#endif
// Macro for collecting XCC GFX kernel is running on
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
#if defined(__gfx942__) || defined(__gfx950__)
#define GetXccId(val) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val));
#else
#define GetXccId(val) val = 0
......@@ -755,7 +755,7 @@ namespace {
#if defined (__NVCC__)
return {ERR_FATAL, "Fine-grained CPU memory not supported on NVIDIA platform"};
#else
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser));
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocCoherent));
#endif
} else if (memType == MEM_CPU || memType == MEM_CPU_CLOSEST) {
#if defined (__NVCC__)
......@@ -895,6 +895,8 @@ namespace {
// Get the hsa_agent_t associated with a MemDevice
static ErrResult GetHsaAgent(MemDevice const& memDevice, hsa_agent_t& agent)
{
if (memDevice.memType == MEM_CPU_CLOSEST)
return GetHsaAgent({EXE_CPU, GetClosestCpuNumaToGpu(memDevice.memIndex)}, agent);
if (IsCpuMemType(memDevice.memType)) return GetHsaAgent({EXE_CPU, memDevice.memIndex}, agent);
if (IsGpuMemType(memDevice.memType)) return GetHsaAgent({EXE_GPU_GFX, memDevice.memIndex}, agent);
return {ERR_FATAL,
......@@ -1191,6 +1193,8 @@ namespace {
if (err.errType == ERR_FATAL) break;
}
// Skip check of engine Id mask for self copies
if (srcAgent.handle != dstAgent.handle) {
uint32_t engineIdMask = 0;
err = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &engineIdMask);
if (err.errType != ERR_NONE) {
......@@ -1203,6 +1207,7 @@ namespace {
"Transfer %d: DMA %d.%d does not exist or cannot copy between src/dst",
i, t.exeDevice.exeIndex, t.exeSubIndex});
}
}
#endif
}
......@@ -2624,7 +2629,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
childThreads.clear();
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
double deltaMsec = (std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0) / cfg.general.numSubIterations;
if (iteration >= 0) {
rss.totalDurationMsec += deltaMsec;
......@@ -2654,7 +2659,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
asyncTransfer.join();
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0)
exeInfo.totalDurationMsec += deltaMsec;
return ERR_NONE;
......@@ -2692,20 +2698,24 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
if (numaNode != -1)
numa_run_on_node(numaNode);
}
auto transferCount = exeInfo.resources.size();
std::vector<double> totalTimeMsec(transferCount, 0.0);
int subIterations = 0;
do {
auto cpuStart = std::chrono::high_resolution_clock::now();
size_t completedTransfers = 0;
auto transferCount = exeInfo.resources.size();
std::vector<uint8_t> receivedQPs(transferCount);
std::vector<std::chrono::high_resolution_clock::time_point> transferTimers(transferCount);
do {
std::vector<uint8_t> receivedQPs(transferCount, 0);
// post the sends
for (auto i = 0; i < transferCount; i++) {
transferTimers[i] = std::chrono::high_resolution_clock::now();
ERR_CHECK(ExecuteNicTransfer(iteration, cfg, exeIndex, exeInfo.resources[i]));
}
// poll for completions
do {
size_t completedTransfers = 0;
while (completedTransfers < transferCount) {
for (auto i = 0; i < transferCount; i++) {
if(receivedQPs[i] < exeInfo.resources[i].qpCount) {
auto& rss = exeInfo.resources[i];
......@@ -2725,20 +2735,28 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
auto cpuDelta = std::chrono::high_resolution_clock::now() - transferTimers[i];
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0) {
rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration)
rss.perIterMsec.push_back(deltaMsec);
totalTimeMsec[i] += deltaMsec;
}
completedTransfers++;
}
}
}
} while(completedTransfers < transferCount);
}
} while(++subIterations < cfg.general.numSubIterations);
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0)
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) {
exeInfo.totalDurationMsec += deltaMsec;
} while(++subIterations < cfg.general.numSubIterations);
for (int i = 0; i < transferCount; i++) {
auto& rss = exeInfo.resources[i];
double transferTimeMsec = totalTimeMsec[i] / cfg.general.numSubIterations;
rss.totalDurationMsec += transferTimeMsec;
if (cfg.general.recordPerIteration)
rss.perIterMsec.push_back(transferTimeMsec);
}
}
return ERR_NONE;
}
#endif
......@@ -3077,14 +3095,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
ERR_CHECK(hipStreamSynchronize(stream));
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) {
double deltaMsec = cpuDeltaMsec;
if (startEvent != NULL) {
float gpuDeltaMsec;
ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
deltaMsec = gpuDeltaMsec;
deltaMsec = gpuDeltaMsec / cfg.general.numSubIterations;
}
rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) {
......@@ -3154,12 +3172,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
ERR_CHECK(hipStreamSynchronize(stream));
}
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0
/ cfg.general.numSubIterations;
if (iteration >= 0) {
if (cfg.gfx.useHipEvents && !cfg.gfx.useMultiStream) {
float gpuDeltaMsec;
ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, exeInfo.startEvents[0], exeInfo.stopEvents[0]));
gpuDeltaMsec /= cfg.general.numSubIterations;
exeInfo.totalDurationMsec += gpuDeltaMsec;
} else {
exeInfo.totalDurationMsec += cpuDeltaMsec;
......@@ -3182,7 +3202,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
}
}
double deltaMsec = (maxStopCycle - minStartCycle) / (double)(exeInfo.wallClockRate);
deltaMsec /= cfg.general.numSubIterations;
rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) {
rss.perIterMsec.push_back(deltaMsec);
......@@ -3249,14 +3269,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
#endif
}
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) {
double deltaMsec = cpuDeltaMsec;
if (!useSubIndices && !cfg.dma.useHsaCopy && cfg.dma.useHipEvents) {
float gpuDeltaMsec;
ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
deltaMsec = gpuDeltaMsec;
deltaMsec = gpuDeltaMsec / cfg.general.numSubIterations;
}
resources.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration)
......@@ -3291,7 +3311,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
ERR_CHECK(asyncTransfer.get());
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0)
exeInfo.totalDurationMsec += deltaMsec;
return ERR_NONE;
......@@ -3493,7 +3513,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Stop CPU timing for this iteration
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() / cfg.general.numSubIterations;
if (cfg.data.alwaysValidate) {
ERR_APPEND(ValidateAllTransfers(cfg, transfers, transferResources, dstReference, outputBuffer),
......@@ -3528,7 +3548,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
results.tfrResults.resize(transfers.size());
results.numTimedIterations = numTimedIterations;
results.totalBytesTransferred = 0;
results.avgTotalDurationMsec = (totalCpuTimeSec * 1000.0) / (numTimedIterations * cfg.general.numSubIterations);
results.avgTotalDurationMsec = (totalCpuTimeSec * 1000.0) / numTimedIterations;
results.overheadMsec = results.avgTotalDurationMsec;
for (auto& exeInfoPair : executorMap) {
ExeDevice const& exeDevice = exeInfoPair.first;
......@@ -3537,7 +3557,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Copy over executor results
ExeResult& exeResult = results.exeResults[exeDevice];
exeResult.numBytes = exeInfo.totalBytes;
exeResult.avgDurationMsec = exeInfo.totalDurationMsec / (numTimedIterations * cfg.general.numSubIterations);
exeResult.avgDurationMsec = exeInfo.totalDurationMsec / numTimedIterations;
exeResult.avgBandwidthGbPerSec = (exeResult.numBytes / 1.0e6) / exeResult.avgDurationMsec;
exeResult.sumBandwidthGbPerSec = 0.0;
exeResult.transferIdx.clear();
......
# Locate the ROCm installation.  The ROCM_PATH environment variable overrides
# the default of /opt/rocm; the result is cached so users may also set it via
# -DROCM_PATH=... on the command line.
if (DEFINED ENV{ROCM_PATH})
  set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE PATH "Path to the ROCm installation.")
else()
  set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to the ROCm installation.")
endif()
# FIX: derive the tools directory from the (possibly pre-cached) ROCM_PATH value.
# A set(... CACHE ...) does not override an existing cache entry, so computing
# rocm_bin directly from $ENV{ROCM_PATH} / the literal default could diverge
# from the ROCM_PATH actually in effect.
set(rocm_bin "${ROCM_PATH}/bin")

# Compiler selection: honor $CXX when set; otherwise prefer amdclang++ from the
# ROCm bin directory, then the ROCm llvm tree, and finally plain clang++.
if (DEFINED ENV{CXX})
  set(CMAKE_CXX_COMPILER "$ENV{CXX}" CACHE PATH "Path to the C++ compiler")
elseif(EXISTS "${rocm_bin}/amdclang++")
  set(CMAKE_CXX_COMPILER "${rocm_bin}/amdclang++" CACHE PATH "Path to the C++ compiler")
elseif(EXISTS "${ROCM_PATH}/llvm/bin/amdclang++")
  set(rocm_bin "${ROCM_PATH}/llvm/bin")
  set(CMAKE_CXX_COMPILER "${rocm_bin}/amdclang++" CACHE PATH "Path to the C++ compiler")
elseif(EXISTS "${ROCM_PATH}/llvm/bin/clang++")
  set(rocm_bin "${ROCM_PATH}/llvm/bin")
  set(CMAKE_CXX_COMPILER "${rocm_bin}/clang++" CACHE PATH "Path to the C++ compiler")
endif()

# Default per-configuration optimization flags unless the user supplied CXXFLAGS.
if (NOT DEFINED ENV{CXXFLAGS})
  set(CMAKE_CXX_FLAGS_DEBUG "-g -O1")
  set(CMAKE_CXX_FLAGS_RELEASE "-O3")
endif()

# Default to a Release build when none was requested, and advertise the valid
# choices to cmake-gui/ccmake via the STRINGS cache property.
if(NOT CMAKE_BUILD_TYPE)
  message(STATUS "Setting build type to 'Release' as none was specified.")
  set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE)
endif()
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "RelWithDebInfo" "MinSizeRel")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment