Unverified Commit 023ce416 authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

TransferBench v1.63 (#193)



* Fixing issue with P memory type and use of DMA subexecutor
* CMake builds require explicit opt-in by setting NIC_EXEC_ENABLE=1
* Removing self-GPU check for DMA engine copies
* [BUILD] Add new GPU targets and switch to amdclang++ (#187)
* [BUILD] Add gfx950, gfx1150, and gfx1151 targets
* [BUILD] Modify CMake to use amdclang++
* [BUILD] Modify Makefile to use amdclang++
* [GIT] Updated CHANGELOG and .gitignore
* Adding HBM testing to healthcheck preset
* Tweaking HBM tests to occur first, and provide more info during VERBOSE=1
* Fixing timing reporting issues with NUM_SUBITERATIONS
* [BUILD] Simplify Makefile (#190)
* Combines steps for compilation and linking
* Does not rebuild if no change to source code

* Updating CHANGELOG

---------
Co-authored-by: default avatarNilesh M Negi <Nilesh.Negi@amd.com>
parent a4709f4b
...@@ -7,3 +7,4 @@ _templates/ ...@@ -7,3 +7,4 @@ _templates/
_toc.yml _toc.yml
docBin/ docBin/
TransferBench TransferBench
*.o
...@@ -3,6 +3,19 @@ ...@@ -3,6 +3,19 @@
Documentation for TransferBench is available at Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench). [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.63.00
### Added
- Added `gfx950`, `gfx1150`, and `gfx1151` to default GPU targets list in CMake builds
### Modified
- Removing self-GPU check for DMA engine copies
- Switched to amdclang++ as primary compiler
- healthcheck preset adds HBM testing and support for more MI3XX variants
### Fixed
- Fixed issue when using "P" memory type and specific DMA subengines
- Fixed issue with subiteration timing reports
## v1.62.00 ## v1.62.00
### Added ### Added
- Adding GFX_TEMPORAL to allow for use for use of non-temporal loads/stores - Adding GFX_TEMPORAL to allow for use for use of non-temporal loads/stores
......
# Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. # Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
if (DEFINED ENV{ROCM_PATH}) cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE STRING "ROCm install directory")
else() # CMake Toolchain file to define compilers and path to ROCm
set(ROCM_PATH "/opt/rocm" CACHE STRING "ROCm install directory") #==================================================================================================
if (NOT CMAKE_TOOLCHAIN_FILE)
set(CMAKE_TOOLCHAIN_FILE "${CMAKE_CURRENT_SOURCE_DIR}/toolchain-linux.cmake")
message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}")
endif() endif()
cmake_minimum_required(VERSION 3.5)
project(TransferBench VERSION 1.62.00 LANGUAGES CXX) set(VERSION_STRING "1.63.00")
project(TransferBench VERSION ${VERSION_STRING} LANGUAGES CXX)
## Load CMake modules
#==================================================================================================
include(CheckIncludeFiles)
include(CheckSymbolExists)
include(cmake/Dependencies.cmake) # rocm-cmake, rocm_local_targets
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
# Build options
#==================================================================================================
option(BUILD_LOCAL_GPU_TARGET_ONLY "Build only for GPUs detected on this machine" OFF)
option(ENABLE_NIC_EXEC "Enable RDMA NIC Executor in TransferBench" OFF)
# Default GPU architectures to build # Default GPU architectures to build
#================================================================================================== #==================================================================================================
...@@ -16,15 +32,18 @@ set(DEFAULT_GPUS ...@@ -16,15 +32,18 @@ set(DEFAULT_GPUS
gfx908 gfx908
gfx90a gfx90a
gfx942 gfx942
gfx950
gfx1030 gfx1030
gfx1100 gfx1100
gfx1101 gfx1101
gfx1102 gfx1102
gfx1150
gfx1151
gfx1200 gfx1200
gfx1201) gfx1201)
# Build only for local GPU architecture ## Build only for local GPU architecture
if (BUILD_LOCAL_GPU_TARGET_ONLY) if(BUILD_LOCAL_GPU_TARGET_ONLY)
message(STATUS "Building only for local GPU target") message(STATUS "Building only for local GPU target")
if (COMMAND rocm_local_targets) if (COMMAND rocm_local_targets)
rocm_local_targets(DEFAULT_GPUS) rocm_local_targets(DEFAULT_GPUS)
...@@ -33,10 +52,10 @@ if (BUILD_LOCAL_GPU_TARGET_ONLY) ...@@ -33,10 +52,10 @@ if (BUILD_LOCAL_GPU_TARGET_ONLY)
endif() endif()
endif() endif()
# Determine which GPU architectures to build for ## Determine which GPU architectures to build for
set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.") set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.")
# Check if clang compiler can offload to GPU_TARGETS ## Check if clang compiler can offload to GPU_TARGETS
if (COMMAND rocm_check_target_ids) if (COMMAND rocm_check_target_ids)
message(STATUS "Checking for ROCm support for GPU targets: " "${GPU_TARGETS}") message(STATUS "Checking for ROCm support for GPU targets: " "${GPU_TARGETS}")
rocm_check_target_ids(SUPPORTED_GPUS TARGETS ${GPU_TARGETS}) rocm_check_target_ids(SUPPORTED_GPUS TARGETS ${GPU_TARGETS})
...@@ -45,53 +64,127 @@ else() ...@@ -45,53 +64,127 @@ else()
set(SUPPORTED_GPUS ${DEFAULT_GPUS}) set(SUPPORTED_GPUS ${DEFAULT_GPUS})
endif() endif()
set(COMPILING_TARGETS "${SUPPORTED_GPUS}" CACHE STRING "GPU targets to compile for.") set(GPU_TARGETS "${SUPPORTED_GPUS}")
message(STATUS "Compiling for ${COMPILING_TARGETS}") message(STATUS "Compiling for ${GPU_TARGETS}")
foreach(target ${COMPILING_TARGETS}) ## NOTE: Reload rocm-cmake in order to update GPU_TARGETS
list(APPEND static_link_flags --offload-arch=${target}) include(cmake/Dependencies.cmake) # Reloading to use desired GPU_TARGETS instead of defaults
endforeach()
list(JOIN static_link_flags " " flags_str)
set( CMAKE_CXX_FLAGS "${flags_str} ${CMAKE_CXX_FLAGS}")
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib") # Check for required dependencies
include_directories(${ROCM_PATH}/include) #==================================================================================================
find_library(IBVERBS_LIBRARY ibverbs) ## Try to establish ROCM_PATH (for find_package)
find_path(IBVERBS_INCLUDE_DIR infiniband/verbs.h) if(NOT DEFINED ROCM_PATH)
if (DEFINED ENV{DISABLE_NIC_EXEC}) # Guess default location
message(STATUS "Disabling NIC Executor support") set(ROCM_PATH "/opt/rocm")
elseif(IBVERBS_LIBRARY AND IBVERBS_INCLUDE_DIR) message(WARNING "Unable to find ROCM_PATH: Falling back to ${ROCM_PATH}")
message(STATUS "Found ibverbs: ${IBVERBS_LIBRARY}. Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable")
add_definitions(-DNIC_EXEC_ENABLED)
link_libraries(ibverbs)
else() else()
if (NOT IBVERBS_LIBRARY) message(STATUS "ROCM_PATH found: ${ROCM_PATH}")
message(WARNING "IBVerbs library not found") endif()
elseif (NOT IBVERBS_INCLUDE_DIR) set(ENV{ROCM_PATH} ${ROCM_PATH})
message(WARNING "infiniband/verbs.h not found")
## Set CMAKE flags
if (NOT DEFINED CMAKE_CXX_STANDARD)
set(CMAKE_CXX_STANDARD 17)
endif()
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
list(APPEND CMAKE_PREFIX_PATH # Add ROCM_PATH to CMake search paths for finding HIP / HSA
${ROCM_PATH}
${ROCM_PATH}/llvm
${ROCM_PATH}/hip
/opt/rocm
/opt/rocm/llvm
/opt/rocm/hip)
## Check for HIP
find_package(hip REQUIRED CONFIG PATHS ${CMAKE_PREFIX_PATH})
message(STATUS "HIP compiler: ${HIP_COMPILER}")
## Ensuring that CXX compiler meets expectations
if(NOT (("${CMAKE_CXX_COMPILER}" MATCHES ".*hipcc") OR ("${CMAKE_CXX_COMPILER}" MATCHES ".*clang\\+\\+")))
message(FATAL_ERROR "On ROCm platform 'hipcc' or HIP-aware Clang must be used as C++ compiler.")
endif()
## Check for Threads
find_package(Threads REQUIRED)
set(THREADS_PREFER_PTHREAD_FLAG ON)
## Check for numa support
find_library(NUMA_LIBRARY numa)
find_path(NUMA_INCLUDE_DIR numa.h)
if(NUMA_LIBRARY AND NUMA_INCLUDE_DIR)
add_library(numa SHARED IMPORTED)
set_target_properties(numa PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${NUMA_INCLUDE_DIR}" IMPORTED_LOCATION "${NUMA_LIBRARY}" INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${NUMA_INCLUDE_DIR}")
endif()
## Check for hsa support
find_library(HSA_LIBRARY hsa-runtime64 PATHS ${ROCM_PATH} ${ROCM_PATH}/lib)
find_path(HSA_INCLUDE_DIR hsa.h PATHS ${ROCM_PATH}/include ${ROCM_PATH}/include/hsa)
if(HSA_LIBRARY AND HSA_INCLUDE_DIR)
add_library(hsa-runtime64 SHARED IMPORTED)
set_target_properties(hsa-runtime64 PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${HSA_INCLUDE_DIR}" IMPORTED_LOCATION "${HSA_LIBRARY}" INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HSA_INCLUDE_DIR}")
endif()
## Check for infiniband verbs support
if(DEFINED ENV{ENABLE_NIC_EXEC} OR DEFINED ENABLE_NIC_EXEC)
message(STATUS "For CMake builds, NIC executor also requires explicit opt-in by setting CMake flag -DENABLE_NIC_EXEC=1 or environment flag ENABLE_NIC_EXEC=1")
find_library(IBVERBS_LIBRARY ibverbs)
find_path(IBVERBS_INCLUDE_DIR infiniband/verbs.h)
if(IBVERBS_LIBRARY AND IBVERBS_INCLUDE_DIR)
add_library(ibverbs SHARED IMPORTED)
set_target_properties(ibverbs PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${IBVERBS_INCLUDE_DIR}" IMPORTED_LOCATION "${IBVERBS_LIBRARY}" INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${IBVERBS_INCLUDE_DIR}")
set(IBVERBS_FOUND 1)
message(STATUS "Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable")
else()
if(NOT IBVERBS_LIBRARY)
message(WARNING "IBVerbs library not found")
elseif(NOT IBVERBS_INCLUDE_DIR)
message(WARNING "infiniband/verbs.h not found")
endif()
message(WARNING "Building without NIC executor support. To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed")
endif() endif()
message(WARNING "Building without NIC executor support. To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed") else()
message(STATUS "Disabling NIC Executor support")
message(WARNING "For CMake builds, NIC executor requires explicit opt-in by setting ENABLE_NIC_EXEC=1")
endif() endif()
link_libraries(numa hsa-runtime64 pthread) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY .)
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY .)
add_executable(TransferBench src/client/Client.cpp) add_executable(TransferBench src/client/Client.cpp)
target_include_directories(TransferBench PRIVATE src/header src/client src/client/Presets)
find_package(ROCM 0.8 REQUIRED PATHS ${ROCM_PATH}) target_include_directories(TransferBench PRIVATE src/header)
include(ROCMInstallTargets) target_include_directories(TransferBench PRIVATE src/client)
include(ROCMCreatePackage) target_include_directories(TransferBench PRIVATE src/client/Presets)
set(ROCMCHECKS_WARN_TOOLCHAIN_VAR OFF) target_include_directories(TransferBench PRIVATE ${NUMA_INCLUDE_DIR})
target_include_directories(TransferBench PRIVATE ${HSA_INCLUDE_DIR})
if(IBVERBS_FOUND)
target_include_directories(TransferBench PRIVATE ${IBVERBS_INCLUDE_DIR})
target_link_libraries(TransferBench PRIVATE ${IBVERBS_LIBRARY})
target_compile_definitions(TransferBench PRIVATE NIC_EXEC_ENABLED)
endif()
set(PACKAGE_NAME TB) target_link_libraries(TransferBench PRIVATE -fgpu-rdc) # Required when linking relocatable device code
set(LIBRARY_NAME TransferBench) target_link_libraries(TransferBench PRIVATE Threads::Threads)
target_link_libraries(TransferBench INTERFACE hip::host)
target_link_libraries(TransferBench PRIVATE hip::device)
target_link_libraries(TransferBench PRIVATE dl)
target_link_libraries(TransferBench PRIVATE ${NUMA_LIBRARY})
target_link_libraries(TransferBench PRIVATE ${HSA_LIBRARY})
rocm_install(TARGETS TransferBench COMPONENT devel) rocm_install(TARGETS TransferBench COMPONENT devel)
rocm_setup_version(VERSION ${VERSION_STRING})
rocm_package_add_dependencies(DEPENDS numactl hsa-rocr) # Package specific CPACK vars
rocm_package_add_dependencies(DEPENDS "numactl" "hsa-rocr")
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md")
set(CPACK_RPM_PACKAGE_LICENSE "MIT")
set(PACKAGE_NAME TB)
set(LIBRARY_NAME TransferBench)
rocm_create_package( rocm_create_package(
NAME ${LIBRARY_NAME} NAME ${LIBRARY_NAME}
DESCRIPTION "TransferBench package" DESCRIPTION "TransferBench package"
MAINTAINER "RCCL Team <gilbert.lee@amd.com>" MAINTAINER "RCCL Team <gilbert.lee@amd.com>"
) )
...@@ -6,57 +6,81 @@ ...@@ -6,57 +6,81 @@
ROCM_PATH ?= /opt/rocm ROCM_PATH ?= /opt/rocm
CUDA_PATH ?= /usr/local/cuda CUDA_PATH ?= /usr/local/cuda
HIPCC=$(ROCM_PATH)/bin/hipcc HIPCC ?= $(ROCM_PATH)/bin/amdclang++
NVCC=$(CUDA_PATH)/bin/nvcc NVCC ?= $(CUDA_PATH)/bin/nvcc
# Compile TransferBenchCuda if nvcc detected # This can be a space separated string of multiple GPU targets
ifeq ("$(shell test -e $(NVCC) && echo found)", "found") # Default is the native GPU target
EXE=TransferBenchCuda GPU_TARGETS ?= native
CXX=$(NVCC)
else DEBUG ?= 0
EXE=TransferBench
CXX=$(HIPCC) ifeq ($(filter clean,$(MAKECMDGOALS)),)
endif # Compile TransferBenchCuda if nvcc detected
ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
EXE=TransferBenchCuda
CXX=$(NVCC)
else
EXE=TransferBench
ifeq ("$(shell test -e $(HIPCC) && echo found)", "found")
CXX=$(HIPCC)
else ifeq ("$(shell test -e $(ROCM_PATH)/bin/hipcc && echo found)", "found")
CXX=$(ROCM_PATH)/bin/hipcc
$(warning "Could not find $(HIPCC). Using fallback to $(CXX)")
else
$(error "Could not find $(HIPCC) or $(ROCM_PATH)/bin/hipcc. Check if the path is correct if you want to build $(EXE)")
endif
GPU_TARGETS_FLAGS = $(foreach target,$(GPU_TARGETS),"--offload-arch=$(target)")
endif
CXXFLAGS = -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64 CXXFLAGS = -I$(ROCM_PATH)/include -I$(ROCM_PATH)/include/hip -I$(ROCM_PATH)/include/hsa
NVFLAGS = -x cu -lnuma -arch=native HIPLDFLAGS= -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64 -lamdhip64
COMMON_FLAGS = -O3 -I./src/header -I./src/client -I./src/client/Presets HIPFLAGS = -x hip -D__HIP_PLATFORM_AMD__ -D__HIPCC__ $(GPU_TARGETS_FLAGS)
LDFLAGS += -lpthread NVFLAGS = -x cu -lnuma -arch=native
# Compile RDMA executor if ifeq ($(DEBUG), 0)
# 1) DISABLE_NIC_EXEC is not set to 1 COMMON_FLAGS += -O3
# 2) IBVerbs is found in the Dynamic Linker cache
# 3) infiniband/verbs.h is found in the default include path
NIC_ENABLED = 0
ifneq ($(DISABLE_NIC_EXEC),1)
ifeq ("$(shell ldconfig -p | grep -c ibverbs)", "0")
$(info lib IBVerbs not found)
else ifeq ("$(shell echo '#include <infiniband/verbs.h>' | $(CXX) -E - 2>/dev/null | grep -c 'infiniband/verbs.h')", "0")
$(info infiniband/verbs.h not found)
else else
LDFLAGS += -libverbs -DNIC_EXEC_ENABLED COMMON_FLAGS += -O0 -g -ggdb3
NVFLAGS += -libverbs -DNIC_EXEC_ENABLED
NIC_ENABLED = 1
endif endif
ifeq ($(NIC_ENABLED), 0) COMMON_FLAGS += -I./src/header -I./src/client -I./src/client/Presets
$(info To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed)
LDFLAGS += -lpthread
# Compile RDMA executor if
# 1) DISABLE_NIC_EXEC is not set to 1
# 2) IBVerbs is found in the Dynamic Linker cache
# 3) infiniband/verbs.h is found in the default include path
DISABLE_NIC_EXEC ?= 0
ifneq ($(DISABLE_NIC_EXEC),1)
ifeq ("$(shell ldconfig -p | grep -c ibverbs)", "0")
$(info lib IBVerbs not found)
else ifeq ("$(shell echo '#include <infiniband/verbs.h>' | $(CXX) -E - 2>/dev/null | grep -c 'infiniband/verbs.h')", "0")
$(info infiniband/verbs.h not found)
else
CXXFLAGS += -DNIC_EXEC_ENABLED
LDFLAGS += -libverbs
NIC_ENABLED = 1
endif
ifeq ($(NIC_ENABLED), 0)
$(info Building without NIC executor support)
$(info To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed)
else
$(info Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable)
endif
endif endif
endif endif
.PHONY : all clean
all: $(EXE) all: $(EXE)
TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp") NicStatus TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
$(HIPCC) $(CXXFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS) $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) $(COMMON_FLAGS) $< -o $@ $(HIPLDFLAGS) $(LDFLAGS)
TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp") NicStatus TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
$(NVCC) $(NVFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS) $(NVCC) $(NVFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)
clean: clean:
rm -f *.o ./TransferBench ./TransferBenchCuda rm -f ./TransferBench ./TransferBenchCuda
NicStatus:
ifeq ($(NIC_ENABLED), 1)
$(info Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable)
else
$(info Building without NIC executor support)
endif
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# Test dependencies
include(FetchContent)
set(ROCM_WARN_TOOLCHAIN_VAR OFF CACHE BOOL "")
# Find or download/install rocm-cmake project
find_package(ROCmCMakeBuildTools 0.11.0 CONFIG QUIET PATHS "${ROCM_PATH}")
if((NOT ROCmCMakeBuildTools_FOUND) OR INSTALL_DEPENDENCIES)
message(STATUS "ROCmCMakeBuildTools not found. Checking for ROCM (deprecated)")
find_package(ROCM 0.7.3 CONFIG QUIET PATHS "${ROCM_PATH}") # deprecated fallback
if((NOT ROCM_FOUND) OR INSTALL_DEPENDENCIES)
message(STATUS "ROCM (deprecated) not found. Downloading and building ROCmCMakeBuildTools")
set(PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern)
set(rocm_cmake_tag "rocm-6.4.0" CACHE STRING "rocm-cmake tag to download")
FetchContent_Declare(
rocm-cmake
GIT_REPOSITORY https://github.com/ROCm/rocm-cmake.git
GIT_TAG ${rocm_cmake_tag}
SOURCE_SUBDIR "DISABLE ADDING TO BUILD"
)
FetchContent_MakeAvailable(rocm-cmake)
message(STATUS "rocm-cmake_SOURCE_DIR: ${rocm-cmake_SOURCE_DIR}")
find_package(ROCmCMakeBuildTools CONFIG REQUIRED NO_DEFAULT_PATH PATHS "${rocm-cmake_SOURCE_DIR}")
message(STATUS "Found ROCmCmakeBuildTools version: ${ROCmCMakeBuildTools_VERSION}")
endif()
elseif(ROCmCMakeBuildTools_FOUND)
message(STATUS "Found ROCmCmakeBuildTools version: ${ROCmCMakeBuildTools_VERSION}")
endif()
# Find available local ROCM targets
# NOTE: This will eventually be part of ROCm-CMake and should be removed at that time
function(rocm_local_targets VARIABLE)
set(${VARIABLE} "NOTFOUND" PARENT_SCOPE)
find_program(_rocm_agent_enumerator rocm_agent_enumerator HINTS /opt/rocm/bin ENV ROCM_PATH)
if(NOT _rocm_agent_enumerator STREQUAL "_rocm_agent_enumerator-NOTFOUND")
execute_process(
COMMAND "${_rocm_agent_enumerator}"
RESULT_VARIABLE _found_agents
OUTPUT_VARIABLE _rocm_agents
ERROR_QUIET
)
if (_found_agents EQUAL 0)
string(REPLACE "\n" ";" _rocm_agents "${_rocm_agents}")
unset(result)
foreach (agent IN LISTS _rocm_agents)
if (NOT agent STREQUAL "gfx000")
list(APPEND result "${agent}")
endif()
endforeach()
if(result)
list(REMOVE_DUPLICATES result)
set(${VARIABLE} "${result}" PARENT_SCOPE)
endif()
endif()
endif()
endfunction()
include(ROCMSetupVersion)
include(ROCMCreatePackage)
include(ROCMInstallTargets)
include(ROCMPackageConfigHelpers)
include(ROCMInstallSymlinks)
include(ROCMCheckTargetIds)
include(ROCMClients)
include(ROCMHeaderWrapper)
...@@ -131,9 +131,8 @@ public: ...@@ -131,9 +131,8 @@ public:
int defaultGfxUnroll = 4; int defaultGfxUnroll = 4;
if (archName == "gfx906") defaultGfxUnroll = 8; if (archName == "gfx906") defaultGfxUnroll = 8;
else if (archName == "gfx90a") defaultGfxUnroll = 8; else if (archName == "gfx90a") defaultGfxUnroll = 8;
else if (archName == "gfx940") defaultGfxUnroll = 6;
else if (archName == "gfx941") defaultGfxUnroll = 6;
else if (archName == "gfx942") defaultGfxUnroll = 4; else if (archName == "gfx942") defaultGfxUnroll = 4;
else if (archName == "gfx950") defaultGfxUnroll = 4;
alwaysValidate = GetEnvVar("ALWAYS_VALIDATE" , 0); alwaysValidate = GetEnvVar("ALWAYS_VALIDATE" , 0);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256); blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
......
/* /*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal of this software and associated documentation files (the "Software"), to deal
...@@ -20,57 +20,150 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN ...@@ -20,57 +20,150 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE. THE SOFTWARE.
*/ */
enum {
HBM_READ = 0,
HBM_WRITE = 1,
HBM_COPY = 2,
HBM_ADD = 3,
NUM_HBM_TESTS = 4
} HbmTests;
void HealthCheckPreset(EnvVars& ev, struct HbmTestConfig
size_t const numBytesPerTransfer,
std::string const presetName)
{ {
// Check for supported platforms std::string name;
#if defined(__NVCC__) int numInputs;
printf("[WARN] healthcheck preset not supported on NVIDIA hardware\n"); int numOutputs;
return; };
#endif
HbmTestConfig HbmTestConfigs[NUM_HBM_TESTS] =
bool hasFail = false; { {"READ", 1, 0},
{"WRITE", 0, 1},
// Force use of single stream {"COPY", 1, 1},
ev.useSingleStream = 1; {"ADD", 2, 1}
};
typedef struct
{
double unidirHostToDeviceCopyLimit;
double unidirDeviceToHostCopyLimit;
double bidirDmaCopyLimit;
int a2aUnrollFactor;
int a2aNumSubExecs;
double a2aCopyLimit;
int hbmBlockSize [NUM_HBM_TESTS];
int hbmUnrollFactor[NUM_HBM_TESTS];
int hbmTemporalMode[NUM_HBM_TESTS];
double hbmLimit [NUM_HBM_TESTS];
} TestConfig;
typedef enum
{
MODEL_08_GFX0942_304 = 0,
MODEL_08_GFX0942_064 = 1,
NUM_SUPPORTED_MODELS = 2
} ModelEnum;
// All limits are scaled by this factor
#define SFACTOR 0.97
TestConfig Config_08_GFX0942_304 = {
.unidirHostToDeviceCopyLimit = 50,
.unidirDeviceToHostCopyLimit = 50,
.bidirDmaCopyLimit = 90,
.a2aUnrollFactor = 2,
.a2aNumSubExecs = 8,
.a2aCopyLimit = 45,
.hbmBlockSize = { 384, 256, 320, 256},
.hbmUnrollFactor = { 7, 4, 8, 7},
.hbmTemporalMode = { 3, 3, 3, 3},
.hbmLimit = {4980, 4850, 2045, 1405},
};
TestConfig Config_08_GFX0942_064 = {
.unidirHostToDeviceCopyLimit = 50,
.unidirDeviceToHostCopyLimit = 50,
.bidirDmaCopyLimit = 90,
.a2aUnrollFactor = 2,
.a2aNumSubExecs = 8,
.a2aCopyLimit = 45,
.hbmBlockSize = { 448, 448, 448, 384},
.hbmUnrollFactor = { 8, 3, 8, 7},
.hbmTemporalMode = { 3, 3, 3, 3},
.hbmLimit = {4180, 2800, 1400, 1055},
};
TestConfig TestConfigs[NUM_SUPPORTED_MODELS] =
{
Config_08_GFX0942_304,
Config_08_GFX0942_064,
};
TransferBench::TestResults results; int DetectModel()
{
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX); int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
if (numGpuDevices != 8) { std::string archName = "";
printf("[WARN] healthcheck preset is currently only supported on 8-GPU MI300X hardware\n"); int numSubExecutors = 0;
exit(1);
}
// Loop over all GPUs and determine if they are identical
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) { for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
// Check that arch name is identical
hipDeviceProp_t prop; hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, gpuId)); HIP_CALL(hipGetDeviceProperties(&prop, gpuId));
std::string fullName = prop.gcnArchName; std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':')); std::string currArchName = fullName.substr(0, fullName.find(':'));
if (!(archName == "gfx940" || archName == "gfx941" || archName == "gfx942")) if (archName != "" && archName != currArchName) {
{ printf("[WARN] healthcheck preset is currently only supported when all GPUs are identical\n");
printf("[WARN] healthcheck preset is currently only supported on 8-GPU MI300X hardware\n"); printf(" Detected both %s and %s\n", archName.c_str(), currArchName.c_str());
exit(1); exit(1);
} }
archName = currArchName;
// Check number of subexecutors
int currNumSubExecutors = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, gpuId});
if (numSubExecutors != 0 && numSubExecutors != currNumSubExecutors) {
printf("[WARN] healthcheck preset is currently only supported when all GPUs are identical\n");
printf(" Detected different subexecutor counts: %d and %d\n", numSubExecutors, currNumSubExecutors);
exit(1);
}
numSubExecutors = currNumSubExecutors;
} }
// Pass limits // Classify based on detected configuration
double udirLimit = getenv("LIMIT_UDIR") ? atof(getenv("LIMIT_UDIR")) : (int)(48 * 0.95); if (numGpuDevices == 8) {
double bdirLimit = getenv("LIMIT_BDIR") ? atof(getenv("LIMIT_BDIR")) : (int)(96 * 0.95); if (archName == "gfx942") {
double a2aLimit = getenv("LIMIT_A2A") ? atof(getenv("LIMIT_A2A")) : (int)(45 * 0.95); switch (numSubExecutors) {
case 304: return MODEL_08_GFX0942_304;
case 64: return MODEL_08_GFX0942_064;
}
}
}
// Run CPU to GPU printf("[WARN] healthcheck preset is currently not supported on this hardware\n");
printf(" Detected %d x [%s] with [%d] subexecutors per GPU\n", numGpuDevices, archName.c_str(), numSubExecutors);
exit(1);
}
// Run unidirectional read from CPU to GPU int TestUnidir(int modelId, bool verbose)
printf("Testing unidirectional reads from CPU "); {
TestConfig const& testConfig = TestConfigs[modelId];
TransferBench::ConfigOptions cfg;
TransferBench::TestResults results;
int hasFail = 0;
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
cfg.dma.useHsaCopy = 1;
// Run unidirectional host to device copy
printf("Testing unidirectional host to device copy%c", verbose ? '\n' : ' ');
{ {
ev.gfxUnroll = 4; double limit = testConfig.unidirHostToDeviceCopyLimit * SFACTOR;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<std::pair<int, double>> fails; std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) { for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout); if (!verbose) printf(".");
fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId); int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) { if (memIndex == -1) {
...@@ -79,49 +172,44 @@ void HealthCheckPreset(EnvVars& ev, ...@@ -79,49 +172,44 @@ void HealthCheckPreset(EnvVars& ev,
} }
std::vector<Transfer> transfers(1); std::vector<Transfer> transfers(1);
Transfer& t = transfers[0]; Transfer& t = transfers[0];
t.exeDevice = {EXE_GPU_GFX, gpuId}; t.exeDevice = {EXE_GPU_DMA, gpuId};
t.numBytes = 64*1024*1024; t.numBytes = 256*1024*1024;
t.srcs = {{MEM_CPU, memIndex}}; t.srcs = {{MEM_CPU, memIndex}};
t.dsts = {}; t.dsts = {{MEM_GPU, gpuId}};
t.numSubExecs = 1;
// Loop over number of CUs to use
bool passed = false; if (TransferBench::RunTransfers(cfg, transfers, results)) {
double bestResult = 0; double measuredBw = results.tfrResults[0].avgBandwidthGbPerSec;
for (int cu = 7; cu <= 10; cu++) { if (measuredBw < limit) {
t.numSubExecs = cu; fails.push_back(std::make_pair(gpuId, measuredBw));
if (TransferBench::RunTransfers(cfg, transfers, results)) {
bestResult = std::max(bestResult, results.tfrResults[0].avgBandwidthGbPerSec);
} else {
PrintErrors(results.errResults);
}
if (results.tfrResults[0].avgBandwidthGbPerSec >= udirLimit) {
passed = true;
break;
} }
if (verbose) printf(" GPU %02d: Measured %6.2f Limit %6.2f\n", gpuId, measuredBw, limit);
} else {
PrintErrors(results.errResults);
} }
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
} }
if (fails.size() == 0) { if (fails.size() == 0) {
printf("PASS\n"); printf("PASS\n");
} else { } else {
hasFail = true; hasFail = 1;
printf("FAIL (%lu test(s))\n", fails.size()); printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) { for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit); printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, limit);
} }
} }
} }
// Run unidirectional write from GPU to CPU // Run unidirectional device to host copy
printf("Testing unidirectional writes to CPU "); printf("Testing unidirectional device to host copy%c", verbose ? '\n' : ' ');
{ {
ev.gfxUnroll = 4; double limit = testConfig.unidirDeviceToHostCopyLimit * SFACTOR;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<std::pair<int, double>> fails; std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) { for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout); if (!verbose) printf(".");
fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId); int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) { if (memIndex == -1) {
...@@ -130,49 +218,54 @@ void HealthCheckPreset(EnvVars& ev, ...@@ -130,49 +218,54 @@ void HealthCheckPreset(EnvVars& ev,
} }
std::vector<Transfer> transfers(1); std::vector<Transfer> transfers(1);
Transfer& t = transfers[0]; Transfer& t = transfers[0];
t.exeDevice = {EXE_GPU_GFX, gpuId}; t.exeDevice = {EXE_GPU_DMA, gpuId};
t.numBytes = 64*1024*1024; t.numBytes = 256*1024*1024;
t.srcs = {}; t.srcs = {{MEM_GPU, gpuId}};
t.dsts = {{MEM_CPU, memIndex}}; t.dsts = {{MEM_CPU, memIndex}};
t.numSubExecs = 1;
// Loop over number of CUs to use
bool passed = false; if (TransferBench::RunTransfers(cfg, transfers, results)) {
double bestResult = 0; double measuredBw = results.tfrResults[0].avgBandwidthGbPerSec;
for (int cu = 7; cu <= 10; cu++) { if (measuredBw < limit) {
t.numSubExecs = cu; fails.push_back(std::make_pair(gpuId, measuredBw));
if (TransferBench::RunTransfers(cfg, transfers, results)) {
bestResult = std::max(bestResult, results.tfrResults[0].avgBandwidthGbPerSec);
} else {
PrintErrors(results.errResults);
}
if (results.tfrResults[0].avgBandwidthGbPerSec >= udirLimit) {
passed = true;
break;
} }
if (verbose) printf(" GPU %02d: Measured %6.2f Limit %6.2f\n", gpuId, measuredBw, limit);
} else {
PrintErrors(results.errResults);
} }
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
} }
if (fails.size() == 0) { if (fails.size() == 0) {
printf("PASS\n"); printf("PASS\n");
} else { } else {
hasFail = true; hasFail = 1;
printf("FAIL (%lu test(s))\n", fails.size()); printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) { for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit); printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, limit);
} }
} }
} }
return hasFail;
}
// Run bidirectional tests int TestBidir(int modelId, bool verbose)
printf("Testing bidirectional reads + writes "); {
TestConfig const& testConfig = TestConfigs[modelId];
TransferBench::ConfigOptions cfg;
int hasFail = 0;
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
printf("Testing bidirectional host<->device copies%c", verbose ? '\n' : ' ');
{ {
ev.gfxUnroll = 4; double limit = testConfig.bidirDmaCopyLimit * SFACTOR;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<std::pair<int, double>> fails; std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) { for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout); if (!verbose) printf(".");
fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId); int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) { if (memIndex == -1) {
...@@ -184,62 +277,65 @@ void HealthCheckPreset(EnvVars& ev, ...@@ -184,62 +277,65 @@ void HealthCheckPreset(EnvVars& ev,
Transfer& t0 = transfers[0]; Transfer& t0 = transfers[0];
Transfer& t1 = transfers[1]; Transfer& t1 = transfers[1];
t0.exeDevice = {EXE_GPU_GFX, gpuId}; t0.exeDevice = {EXE_GPU_DMA, gpuId};
t0.numBytes = 64*1024*1024; t0.numBytes = 256*1024*1024;
t0.srcs = {{MEM_CPU, memIndex}}; t0.srcs = {{MEM_GPU, gpuId}};
t0.dsts = {}; t0.dsts = {{MEM_CPU, memIndex}};
t0.numSubExecs = 1;
t1.exeDevice = {EXE_GPU_GFX, gpuId};
t1.numBytes = 64*1024*1024; t1.exeDevice = {EXE_GPU_DMA, gpuId};
t1.srcs = {}; t1.numBytes = 256*1024*1024;
t1.dsts = {{MEM_CPU, memIndex}}; t1.srcs = {{MEM_CPU, memIndex}};
t1.dsts = {{MEM_GPU, gpuId}};
// Loop over number of CUs to use t1.numSubExecs = 1;
bool passed = false;
double bestResult = 0; TransferBench::TestResults results;
for (int cu = 7; cu <= 10; cu++) { if (TransferBench::RunTransfers(cfg, transfers, results)) {
t0.numSubExecs = cu; double measuredBw = (results.tfrResults[0].avgBandwidthGbPerSec +
t1.numSubExecs = cu; results.tfrResults[1].avgBandwidthGbPerSec);
if (measuredBw < limit) {
if (TransferBench::RunTransfers(cfg, transfers, results)) { fails.push_back(std::make_pair(gpuId, measuredBw));
double sum = (results.tfrResults[0].avgBandwidthGbPerSec +
results.tfrResults[1].avgBandwidthGbPerSec);
bestResult = std::max(bestResult, sum);
if (sum >= bdirLimit) {
passed = true;
break;
}
} else {
PrintErrors(results.errResults);
} }
if (verbose) printf(" GPU %02d: Measured %6.2f Limit %6.2f\n", gpuId, measuredBw, limit);
} else {
PrintErrors(results.errResults);
} }
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
} }
if (fails.size() == 0) { if (fails.size() == 0) {
printf("PASS\n"); printf("PASS\n");
} else { } else {
hasFail = true; hasFail = 1;
printf("FAIL (%lu test(s))\n", fails.size()); printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) { for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, bdirLimit); printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, limit);
} }
} }
} }
return hasFail;
}
// Run XGMI tests: int TestAllToAll(int modelId, bool verbose)
printf("Testing all-to-all XGMI copies "); fflush(stdout); {
TestConfig const& testConfig = TestConfigs[modelId];
TransferBench::ConfigOptions cfg;
cfg.gfx.unrollFactor = testConfig.a2aUnrollFactor;
int numSubExecs = testConfig.a2aNumSubExecs;
int hasFail = 0;
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
printf("Testing all-to-all XGMI copies %c", verbose ? '\n' : ' '); fflush(stdout);
{ {
// Force GFX unroll to 2 for MI300 double limit = testConfig.a2aCopyLimit * SFACTOR;
ev.gfxUnroll = 2;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<Transfer> transfers; std::vector<Transfer> transfers;
for (int i = 0; i < numGpuDevices; i++) { for (int i = 0; i < numGpuDevices; i++) {
for (int j = 0; j < numGpuDevices; j++) { for (int j = 0; j < numGpuDevices; j++) {
if (i == j) continue; if (i == j) continue;
Transfer t; Transfer t;
t.numBytes = 64*1024*1024; t.numBytes = 256*1024*1024;
t.numSubExecs = 8; t.numSubExecs = numSubExecs;
t.exeDevice = {EXE_GPU_GFX, i}; t.exeDevice = {EXE_GPU_GFX, i};
t.srcs = {{MEM_GPU_FINE, i}}; t.srcs = {{MEM_GPU_FINE, i}};
t.dsts = {{MEM_GPU_FINE, j}}; t.dsts = {{MEM_GPU_FINE, j}};
...@@ -247,17 +343,18 @@ void HealthCheckPreset(EnvVars& ev, ...@@ -247,17 +343,18 @@ void HealthCheckPreset(EnvVars& ev,
} }
} }
std::vector<std::pair<std::pair<int,int>, double>> fails; std::vector<std::pair<std::pair<int,int>, double>> fails;
TransferBench::TestResults results;
if (TransferBench::RunTransfers(cfg, transfers, results)) { if (TransferBench::RunTransfers(cfg, transfers, results)) {
int transferIdx = 0; int transferIdx = 0;
for (int i = 0; i < numGpuDevices; i++) { for (int i = 0; i < numGpuDevices; i++) {
printf("."); fflush(stdout); if (!verbose) printf("."); fflush(stdout);
for (int j = 0; j < numGpuDevices; j++) { for (int j = 0; j < numGpuDevices; j++) {
if (i == j) continue; if (i == j) continue;
double bw = results.tfrResults[transferIdx].avgBandwidthGbPerSec; double bw = results.tfrResults[transferIdx].avgBandwidthGbPerSec;
if (bw < a2aLimit) { if (bw < limit) {
fails.push_back(std::make_pair(std::make_pair(i,j), bw)); fails.push_back(std::make_pair(std::make_pair(i,j), bw));
} }
if (verbose) printf(" GPU %02d to GPU %02d: : Measured %6.2f Limit %6.2f\n", i, j, bw, limit);
transferIdx++; transferIdx++;
} }
} }
...@@ -265,12 +362,111 @@ void HealthCheckPreset(EnvVars& ev, ...@@ -265,12 +362,111 @@ void HealthCheckPreset(EnvVars& ev,
if (fails.size() == 0) { if (fails.size() == 0) {
printf("PASS\n"); printf("PASS\n");
} else { } else {
hasFail = true; hasFail = 1;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d to GPU %02d: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first.first, p.first.second, p.second, limit);
}
}
}
return hasFail;
}
// Runs an HBM bandwidth sweep: for each configured HBM test variant, launches a
// local GPU-memory-to-GPU-memory transfer on every GFX executor and compares the
// measured bandwidth per GPU against a model-specific limit (hbmLimit * SFACTOR).
//   modelId: index into TestConfigs for the detected hardware model
//   verbose: non-zero prints per-GPU measurements; zero prints progress dots
// Returns non-zero if any GPU failed to meet its bandwidth criteria.
int TestHbmPerformance(int modelId, bool verbose)
{
TestConfig const& testConfig = TestConfigs[modelId];
int hasFail = 0;
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
char testname[50];
// Each HBM test variant carries its own kernel tuning (block size / unroll /
// temporal mode) pulled from the model's TestConfig entry.
for (int testId = 0; testId < NUM_HBM_TESTS; testId++) {
TransferBench::ConfigOptions cfg;
cfg.general.numIterations = 1000;
cfg.general.numWarmups = 50;
cfg.gfx.blockSize = testConfig.hbmBlockSize[testId];
cfg.gfx.unrollFactor = testConfig.hbmUnrollFactor[testId];
cfg.gfx.temporalMode = testConfig.hbmTemporalMode[testId];
// NOTE(review): sprintf into a fixed 50-char buffer; a long variant name would
// overflow testname — consider snprintf(testname, sizeof(testname), ...)
sprintf(testname, "Testing HBM performance [%s]", HbmTestConfigs[testId].name.c_str());
if (verbose) printf("[Blocksize: %d Unroll: %d TemporalMode: %d]\n", cfg.gfx.blockSize, cfg.gfx.unrollFactor, cfg.gfx.temporalMode);
printf("%-42s%c", testname, verbose ? '\n' : ' ');
fflush(stdout);
int numInputs = HbmTestConfigs[testId].numInputs;
int numOutputs = HbmTestConfigs[testId].numOutputs;
// Pass/fail threshold for this variant, scaled by the global safety factor
double limit = testConfig.hbmLimit[testId] * SFACTOR;
std::vector<std::pair<int, double>> fails;
TransferBench::TestResults results;
std::vector<Transfer> transfers;
// One transfer per GPU; size scales with the subexecutor count so every
// subexecutor moves 16 MiB
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
Transfer t;
t.numSubExecs = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, gpuId});
t.numBytes = 16777216ULL * t.numSubExecs;
t.exeDevice = {EXE_GPU_GFX, gpuId};
for (int i = 0; i < numInputs; i++) t.srcs.push_back({MEM_GPU, gpuId});
for (int i = 0; i < numOutputs; i++) t.dsts.push_back({MEM_GPU, gpuId});
transfers.push_back(t);
}
if (TransferBench::RunTransfers(cfg, transfers, results)) {
// Transfers were built in gpuId order, so tfrResults[gpuId] maps 1:1
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
if (!verbose) printf(".");
fflush(stdout);
double measuredBw = results.tfrResults[gpuId].avgBandwidthGbPerSec;
if (measuredBw < limit) {
fails.push_back(std::make_pair(gpuId, measuredBw));
}
if (verbose) printf(" GPU %02d: Measured %6.2f Limit %6.2f\n", gpuId, measuredBw, limit);
}
} else {
PrintErrors(results.errResults);
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = 1;
// NOTE(review): the lines below appear duplicated/garbled by a side-by-side
// diff export; left byte-identical — restore from the repository before use
printf("FAIL (%lu test(s))\n", fails.size()); printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) { for (auto p : fails) {
printf(" GPU %02d to GPU %02d: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first.first, p.first.second, p.second, a2aLimit); printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, limit);
} }
} }
} }
exit(hasFail ? 1 : 0); return hasFail;
}
// Entry point for the "healthcheck" preset: prints a disclaimer, detects the
// hardware model, runs the HBM / unidirectional / bidirectional / all-to-all
// test suites, and terminates the process with exit code 1 if any test failed
// (0 otherwise). Does not return on supported platforms.
// NOTE(review): ev, numBytesPerTransfer and presetName are unused in the body
// visible here — presumably kept for preset-dispatch signature compatibility;
// confirm against the caller.
void HealthCheckPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
// Check for supported platforms — healthcheck is AMD-only; bail out (return,
// not exit) when compiled for NVIDIA
#if defined(__NVCC__)
printf("[WARN] healthcheck preset not supported on NVIDIA hardware\n");
return;
#endif
printf("Disclaimer:\n");
printf("==================================================================\n");
printf("NOTE: This is an experimental feature and may be subject to change\n");
printf(" Failures do not necessarily indicate hardware issues, as other factors\n");
printf(" such as simultaneous workloads may influence results\n");
printf("\n");
// Collect custom env vars for this preset (VERBOSE=1 enables per-GPU detail)
int verbose = EnvVars::GetEnvVar("VERBOSE", 0);
// Determine if this is a supported model; modelId indexes TestConfigs
int modelId = DetectModel();
// Run through all tests — HBM first, then link tests; each returns 0/1
int numFails = 0;
numFails += TestHbmPerformance(modelId, verbose);
numFails += TestUnidir(modelId, verbose);
numFails += TestBidir(modelId, verbose);
numFails += TestAllToAll(modelId, verbose);
// Process exit code communicates overall pass/fail to scripts
exit(numFails ? 1 : 0);
} }
...@@ -66,7 +66,7 @@ namespace TransferBench ...@@ -66,7 +66,7 @@ namespace TransferBench
using std::set; using std::set;
using std::vector; using std::vector;
constexpr char VERSION[] = "1.62"; constexpr char VERSION[] = "1.63";
/** /**
* Enumeration of supported Executor types * Enumeration of supported Executor types
...@@ -516,7 +516,7 @@ namespace TransferBench ...@@ -516,7 +516,7 @@ namespace TransferBench
//========================================================================================== //==========================================================================================
// Macro for collecting CU/SM GFX kernel is running on // Macro for collecting CU/SM GFX kernel is running on
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1200__) || defined(__gfx1201__) #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1150__) || defined(__gfx1151__) || defined(__gfx1200__) || defined(__gfx1201__)
#define GetHwId(hwId) hwId = 0 #define GetHwId(hwId) hwId = 0
#elif defined(__NVCC__) #elif defined(__NVCC__)
#define GetHwId(hwId) asm("mov.u32 %0, %smid;" : "=r"(hwId)) #define GetHwId(hwId) asm("mov.u32 %0, %smid;" : "=r"(hwId))
...@@ -525,7 +525,7 @@ namespace TransferBench ...@@ -525,7 +525,7 @@ namespace TransferBench
#endif #endif
// Macro for collecting XCC GFX kernel is running on // Macro for collecting XCC GFX kernel is running on
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__) #if defined(__gfx942__) || defined(__gfx950__)
#define GetXccId(val) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val)); #define GetXccId(val) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val));
#else #else
#define GetXccId(val) val = 0 #define GetXccId(val) val = 0
...@@ -755,7 +755,7 @@ namespace { ...@@ -755,7 +755,7 @@ namespace {
#if defined (__NVCC__) #if defined (__NVCC__)
return {ERR_FATAL, "Fine-grained CPU memory not supported on NVIDIA platform"}; return {ERR_FATAL, "Fine-grained CPU memory not supported on NVIDIA platform"};
#else #else
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser)); ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocCoherent));
#endif #endif
} else if (memType == MEM_CPU || memType == MEM_CPU_CLOSEST) { } else if (memType == MEM_CPU || memType == MEM_CPU_CLOSEST) {
#if defined (__NVCC__) #if defined (__NVCC__)
...@@ -895,6 +895,8 @@ namespace { ...@@ -895,6 +895,8 @@ namespace {
// Get the hsa_agent_t associated with a MemDevice // Get the hsa_agent_t associated with a MemDevice
static ErrResult GetHsaAgent(MemDevice const& memDevice, hsa_agent_t& agent) static ErrResult GetHsaAgent(MemDevice const& memDevice, hsa_agent_t& agent)
{ {
if (memDevice.memType == MEM_CPU_CLOSEST)
return GetHsaAgent({EXE_CPU, GetClosestCpuNumaToGpu(memDevice.memIndex)}, agent);
if (IsCpuMemType(memDevice.memType)) return GetHsaAgent({EXE_CPU, memDevice.memIndex}, agent); if (IsCpuMemType(memDevice.memType)) return GetHsaAgent({EXE_CPU, memDevice.memIndex}, agent);
if (IsGpuMemType(memDevice.memType)) return GetHsaAgent({EXE_GPU_GFX, memDevice.memIndex}, agent); if (IsGpuMemType(memDevice.memType)) return GetHsaAgent({EXE_GPU_GFX, memDevice.memIndex}, agent);
return {ERR_FATAL, return {ERR_FATAL,
...@@ -1191,17 +1193,20 @@ namespace { ...@@ -1191,17 +1193,20 @@ namespace {
if (err.errType == ERR_FATAL) break; if (err.errType == ERR_FATAL) break;
} }
uint32_t engineIdMask = 0; // Skip check of engine Id mask for self copies
err = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &engineIdMask); if (srcAgent.handle != dstAgent.handle) {
if (err.errType != ERR_NONE) { uint32_t engineIdMask = 0;
errors.push_back(err); err = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &engineIdMask);
if (err.errType == ERR_FATAL) break; if (err.errType != ERR_NONE) {
} errors.push_back(err);
hsa_amd_sdma_engine_id_t sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << t.exeSubIndex); if (err.errType == ERR_FATAL) break;
if (!(sdmaEngineId & engineIdMask)) { }
errors.push_back({ERR_FATAL, hsa_amd_sdma_engine_id_t sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << t.exeSubIndex);
"Transfer %d: DMA %d.%d does not exist or cannot copy between src/dst", if (!(sdmaEngineId & engineIdMask)) {
i, t.exeDevice.exeIndex, t.exeSubIndex}); errors.push_back({ERR_FATAL,
"Transfer %d: DMA %d.%d does not exist or cannot copy between src/dst",
i, t.exeDevice.exeIndex, t.exeSubIndex});
}
} }
#endif #endif
} }
...@@ -2624,7 +2629,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -2624,7 +2629,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
childThreads.clear(); childThreads.clear();
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0; double deltaMsec = (std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0) / cfg.general.numSubIterations;
if (iteration >= 0) { if (iteration >= 0) {
rss.totalDurationMsec += deltaMsec; rss.totalDurationMsec += deltaMsec;
...@@ -2654,7 +2659,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -2654,7 +2659,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
asyncTransfer.join(); asyncTransfer.join();
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0; double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) if (iteration >= 0)
exeInfo.totalDurationMsec += deltaMsec; exeInfo.totalDurationMsec += deltaMsec;
return ERR_NONE; return ERR_NONE;
...@@ -2692,20 +2698,24 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -2692,20 +2698,24 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
if (numaNode != -1) if (numaNode != -1)
numa_run_on_node(numaNode); numa_run_on_node(numaNode);
} }
auto transferCount = exeInfo.resources.size();
std::vector<double> totalTimeMsec(transferCount, 0.0);
int subIterations = 0; int subIterations = 0;
auto cpuStart = std::chrono::high_resolution_clock::now();
std::vector<std::chrono::high_resolution_clock::time_point> transferTimers(transferCount);
do { do {
auto cpuStart = std::chrono::high_resolution_clock::now(); std::vector<uint8_t> receivedQPs(transferCount, 0);
size_t completedTransfers = 0;
auto transferCount = exeInfo.resources.size();
std::vector<uint8_t> receivedQPs(transferCount);
std::vector<std::chrono::high_resolution_clock::time_point> transferTimers(transferCount);
// post the sends // post the sends
for (auto i = 0; i < transferCount; i++) { for (auto i = 0; i < transferCount; i++) {
transferTimers[i] = std::chrono::high_resolution_clock::now(); transferTimers[i] = std::chrono::high_resolution_clock::now();
ERR_CHECK(ExecuteNicTransfer(iteration, cfg, exeIndex, exeInfo.resources[i])); ERR_CHECK(ExecuteNicTransfer(iteration, cfg, exeIndex, exeInfo.resources[i]));
} }
// poll for completions // poll for completions
do { size_t completedTransfers = 0;
while (completedTransfers < transferCount) {
for (auto i = 0; i < transferCount; i++) { for (auto i = 0; i < transferCount; i++) {
if(receivedQPs[i] < exeInfo.resources[i].qpCount) { if(receivedQPs[i] < exeInfo.resources[i].qpCount) {
auto& rss = exeInfo.resources[i]; auto& rss = exeInfo.resources[i];
...@@ -2725,20 +2735,28 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -2725,20 +2735,28 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
auto cpuDelta = std::chrono::high_resolution_clock::now() - transferTimers[i]; auto cpuDelta = std::chrono::high_resolution_clock::now() - transferTimers[i];
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0; double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0) { if (iteration >= 0) {
rss.totalDurationMsec += deltaMsec; totalTimeMsec[i] += deltaMsec;
if (cfg.general.recordPerIteration)
rss.perIterMsec.push_back(deltaMsec);
} }
completedTransfers++; completedTransfers++;
} }
} }
} }
} while(completedTransfers < transferCount); }
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0)
exeInfo.totalDurationMsec += deltaMsec;
} while(++subIterations < cfg.general.numSubIterations); } while(++subIterations < cfg.general.numSubIterations);
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) {
exeInfo.totalDurationMsec += deltaMsec;
for (int i = 0; i < transferCount; i++) {
auto& rss = exeInfo.resources[i];
double transferTimeMsec = totalTimeMsec[i] / cfg.general.numSubIterations;
rss.totalDurationMsec += transferTimeMsec;
if (cfg.general.recordPerIteration)
rss.perIterMsec.push_back(transferTimeMsec);
}
}
return ERR_NONE; return ERR_NONE;
} }
#endif #endif
...@@ -3077,14 +3095,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3077,14 +3095,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
ERR_CHECK(hipStreamSynchronize(stream)); ERR_CHECK(hipStreamSynchronize(stream));
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0; double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) { if (iteration >= 0) {
double deltaMsec = cpuDeltaMsec; double deltaMsec = cpuDeltaMsec;
if (startEvent != NULL) { if (startEvent != NULL) {
float gpuDeltaMsec; float gpuDeltaMsec;
ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent)); ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
deltaMsec = gpuDeltaMsec; deltaMsec = gpuDeltaMsec / cfg.general.numSubIterations;
} }
rss.totalDurationMsec += deltaMsec; rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) { if (cfg.general.recordPerIteration) {
...@@ -3154,12 +3172,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3154,12 +3172,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
ERR_CHECK(hipStreamSynchronize(stream)); ERR_CHECK(hipStreamSynchronize(stream));
} }
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0; double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0
/ cfg.general.numSubIterations;
if (iteration >= 0) { if (iteration >= 0) {
if (cfg.gfx.useHipEvents && !cfg.gfx.useMultiStream) { if (cfg.gfx.useHipEvents && !cfg.gfx.useMultiStream) {
float gpuDeltaMsec; float gpuDeltaMsec;
ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, exeInfo.startEvents[0], exeInfo.stopEvents[0])); ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, exeInfo.startEvents[0], exeInfo.stopEvents[0]));
gpuDeltaMsec /= cfg.general.numSubIterations;
exeInfo.totalDurationMsec += gpuDeltaMsec; exeInfo.totalDurationMsec += gpuDeltaMsec;
} else { } else {
exeInfo.totalDurationMsec += cpuDeltaMsec; exeInfo.totalDurationMsec += cpuDeltaMsec;
...@@ -3182,7 +3202,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3182,7 +3202,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
} }
} }
double deltaMsec = (maxStopCycle - minStartCycle) / (double)(exeInfo.wallClockRate); double deltaMsec = (maxStopCycle - minStartCycle) / (double)(exeInfo.wallClockRate);
deltaMsec /= cfg.general.numSubIterations;
rss.totalDurationMsec += deltaMsec; rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) { if (cfg.general.recordPerIteration) {
rss.perIterMsec.push_back(deltaMsec); rss.perIterMsec.push_back(deltaMsec);
...@@ -3249,14 +3269,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3249,14 +3269,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
#endif #endif
} }
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0; double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) { if (iteration >= 0) {
double deltaMsec = cpuDeltaMsec; double deltaMsec = cpuDeltaMsec;
if (!useSubIndices && !cfg.dma.useHsaCopy && cfg.dma.useHipEvents) { if (!useSubIndices && !cfg.dma.useHsaCopy && cfg.dma.useHipEvents) {
float gpuDeltaMsec; float gpuDeltaMsec;
ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent)); ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
deltaMsec = gpuDeltaMsec; deltaMsec = gpuDeltaMsec / cfg.general.numSubIterations;
} }
resources.totalDurationMsec += deltaMsec; resources.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) if (cfg.general.recordPerIteration)
...@@ -3291,7 +3311,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3291,7 +3311,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
ERR_CHECK(asyncTransfer.get()); ERR_CHECK(asyncTransfer.get());
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0; double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) if (iteration >= 0)
exeInfo.totalDurationMsec += deltaMsec; exeInfo.totalDurationMsec += deltaMsec;
return ERR_NONE; return ERR_NONE;
...@@ -3493,7 +3513,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3493,7 +3513,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Stop CPU timing for this iteration // Stop CPU timing for this iteration
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count(); double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() / cfg.general.numSubIterations;
if (cfg.data.alwaysValidate) { if (cfg.data.alwaysValidate) {
ERR_APPEND(ValidateAllTransfers(cfg, transfers, transferResources, dstReference, outputBuffer), ERR_APPEND(ValidateAllTransfers(cfg, transfers, transferResources, dstReference, outputBuffer),
...@@ -3528,7 +3548,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3528,7 +3548,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
results.tfrResults.resize(transfers.size()); results.tfrResults.resize(transfers.size());
results.numTimedIterations = numTimedIterations; results.numTimedIterations = numTimedIterations;
results.totalBytesTransferred = 0; results.totalBytesTransferred = 0;
results.avgTotalDurationMsec = (totalCpuTimeSec * 1000.0) / (numTimedIterations * cfg.general.numSubIterations); results.avgTotalDurationMsec = (totalCpuTimeSec * 1000.0) / numTimedIterations;
results.overheadMsec = results.avgTotalDurationMsec; results.overheadMsec = results.avgTotalDurationMsec;
for (auto& exeInfoPair : executorMap) { for (auto& exeInfoPair : executorMap) {
ExeDevice const& exeDevice = exeInfoPair.first; ExeDevice const& exeDevice = exeInfoPair.first;
...@@ -3537,7 +3557,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3537,7 +3557,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Copy over executor results // Copy over executor results
ExeResult& exeResult = results.exeResults[exeDevice]; ExeResult& exeResult = results.exeResults[exeDevice];
exeResult.numBytes = exeInfo.totalBytes; exeResult.numBytes = exeInfo.totalBytes;
exeResult.avgDurationMsec = exeInfo.totalDurationMsec / (numTimedIterations * cfg.general.numSubIterations); exeResult.avgDurationMsec = exeInfo.totalDurationMsec / numTimedIterations;
exeResult.avgBandwidthGbPerSec = (exeResult.numBytes / 1.0e6) / exeResult.avgDurationMsec; exeResult.avgBandwidthGbPerSec = (exeResult.numBytes / 1.0e6) / exeResult.avgDurationMsec;
exeResult.sumBandwidthGbPerSec = 0.0; exeResult.sumBandwidthGbPerSec = 0.0;
exeResult.transferIdx.clear(); exeResult.transferIdx.clear();
......
# Locate the ROCm installation. An explicit ROCM_PATH in the environment takes
# precedence; otherwise fall back to the standard /opt/rocm location.
# rocm_bin records the matching bin/ directory for compiler discovery below.
if (NOT DEFINED ENV{ROCM_PATH})
  set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to the ROCm installation.")
  set(rocm_bin "/opt/rocm/bin")
else()
  set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE PATH "Path to the ROCm installation.")
  set(rocm_bin "$ENV{ROCM_PATH}/bin")
endif()
# Select the C++ compiler. A user-supplied CXX environment variable always
# wins; otherwise probe the ROCm tree, preferring amdclang++ in ${rocm_bin},
# then amdclang++ and finally clang++ under ${ROCM_PATH}/llvm/bin.
# rocm_bin is updated to point at the directory actually chosen.
if (DEFINED ENV{CXX})
  set(CMAKE_CXX_COMPILER "$ENV{CXX}" CACHE PATH "Path to the C++ compiler")
elseif(EXISTS "${rocm_bin}/amdclang++")
  set(CMAKE_CXX_COMPILER "${rocm_bin}/amdclang++" CACHE PATH "Path to the C++ compiler")
elseif(EXISTS "${ROCM_PATH}/llvm/bin/amdclang++")
  set(rocm_bin "${ROCM_PATH}/llvm/bin")
  set(CMAKE_CXX_COMPILER "${rocm_bin}/amdclang++" CACHE PATH "Path to the C++ compiler")
elseif(EXISTS "${ROCM_PATH}/llvm/bin/clang++")
  set(rocm_bin "${ROCM_PATH}/llvm/bin")
  set(CMAKE_CXX_COMPILER "${rocm_bin}/clang++" CACHE PATH "Path to the C++ compiler")
endif()
# Default per-config optimization flags, applied only when the user has not
# provided their own CXXFLAGS in the environment.
if (NOT DEFINED ENV{CXXFLAGS})
  set(CMAKE_CXX_FLAGS_RELEASE "-O3")
  set(CMAKE_CXX_FLAGS_DEBUG "-g -O1")
endif()
# Default to a Release build when the user did not choose a build type.
# Skipped under multi-config generators (Visual Studio, Xcode, Ninja
# Multi-Config), where CMAKE_BUILD_TYPE is unused and the configuration is
# selected at build time instead.
get_property(_tb_is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
if(NOT _tb_is_multi_config AND NOT CMAKE_BUILD_TYPE)
  message(STATUS "Setting build type to 'Release' as none was specified.")
  # FORCE is safe here: we only reach this when the value is empty/unset.
  set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE)
  # Present the standard choices in cmake-gui / ccmake.
  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS Debug Release RelWithDebInfo MinSizeRel)
endif()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment