Unverified Commit 023ce416 authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

TransferBench v1.63 (#193)



* Fixing issue with P memory type and use of DMA subexecutor
* CMake builds require explicit opt-in for the NIC executor by setting ENABLE_NIC_EXEC=1
* Removing self-GPU check for DMA engine copies
* [BUILD] Add new GPU targets and switch to amdclang++ (#187)
* [BUILD] Add gfx950, gfx1150, and gfx1151 targets
* [BUILD] Modify CMake to use amdclang++
* [BUILD] Modify Makefile to use amdclang++
* [GIT] Updated CHANGELOG and .gitignore
* Adding HBM testing to healthcheck preset
* Tweaking HBM tests to occur first, and provide more info during VERBOSE=1
* Fixing timing reporting issues with NUM_SUBITERATIONS
* [BUILD] Simplify Makefile (#190)
* Combines steps for compilation and linking
* Does not rebuild if no change to source code

* Updating CHANGELOG

---------
Co-authored-by: default avatarNilesh M Negi <Nilesh.Negi@amd.com>
parent a4709f4b
...@@ -7,3 +7,4 @@ _templates/ ...@@ -7,3 +7,4 @@ _templates/
_toc.yml _toc.yml
docBin/ docBin/
TransferBench TransferBench
*.o
...@@ -3,6 +3,19 @@ ...@@ -3,6 +3,19 @@
Documentation for TransferBench is available at Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench). [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.63.00
### Added
- Added `gfx950`, `gfx1150`, and `gfx1151` to default GPU targets list in CMake builds
### Modified
- Removing self-GPU check for DMA engine copies
- Switched to amdclang++ as primary compiler
- healthcheck preset adds HBM testing and support for more MI3XX variants
### Fixed
- Fixed issue when using "P" memory type and specific DMA subengines
- Fixed issue with subiteration timing reports
## v1.62.00 ## v1.62.00
### Added ### Added
- Adding GFX_TEMPORAL to allow for use of non-temporal loads/stores - Adding GFX_TEMPORAL to allow for use of non-temporal loads/stores
......
# Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. # Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
if (DEFINED ENV{ROCM_PATH}) cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE STRING "ROCm install directory")
else() # CMake Toolchain file to define compilers and path to ROCm
set(ROCM_PATH "/opt/rocm" CACHE STRING "ROCm install directory") #==================================================================================================
if (NOT CMAKE_TOOLCHAIN_FILE)
set(CMAKE_TOOLCHAIN_FILE "${CMAKE_CURRENT_SOURCE_DIR}/toolchain-linux.cmake")
message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}")
endif() endif()
cmake_minimum_required(VERSION 3.5)
project(TransferBench VERSION 1.62.00 LANGUAGES CXX) set(VERSION_STRING "1.63.00")
project(TransferBench VERSION ${VERSION_STRING} LANGUAGES CXX)
## Load CMake modules
#==================================================================================================
include(CheckIncludeFiles)
include(CheckSymbolExists)
include(cmake/Dependencies.cmake) # rocm-cmake, rocm_local_targets
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
# Build options
#==================================================================================================
option(BUILD_LOCAL_GPU_TARGET_ONLY "Build only for GPUs detected on this machine" OFF)
option(ENABLE_NIC_EXEC "Enable RDMA NIC Executor in TransferBench" OFF)
# Default GPU architectures to build # Default GPU architectures to build
#================================================================================================== #==================================================================================================
...@@ -16,15 +32,18 @@ set(DEFAULT_GPUS ...@@ -16,15 +32,18 @@ set(DEFAULT_GPUS
gfx908 gfx908
gfx90a gfx90a
gfx942 gfx942
gfx950
gfx1030 gfx1030
gfx1100 gfx1100
gfx1101 gfx1101
gfx1102 gfx1102
gfx1150
gfx1151
gfx1200 gfx1200
gfx1201) gfx1201)
# Build only for local GPU architecture ## Build only for local GPU architecture
if (BUILD_LOCAL_GPU_TARGET_ONLY) if(BUILD_LOCAL_GPU_TARGET_ONLY)
message(STATUS "Building only for local GPU target") message(STATUS "Building only for local GPU target")
if (COMMAND rocm_local_targets) if (COMMAND rocm_local_targets)
rocm_local_targets(DEFAULT_GPUS) rocm_local_targets(DEFAULT_GPUS)
...@@ -33,10 +52,10 @@ if (BUILD_LOCAL_GPU_TARGET_ONLY) ...@@ -33,10 +52,10 @@ if (BUILD_LOCAL_GPU_TARGET_ONLY)
endif() endif()
endif() endif()
# Determine which GPU architectures to build for ## Determine which GPU architectures to build for
set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.") set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.")
# Check if clang compiler can offload to GPU_TARGETS ## Check if clang compiler can offload to GPU_TARGETS
if (COMMAND rocm_check_target_ids) if (COMMAND rocm_check_target_ids)
message(STATUS "Checking for ROCm support for GPU targets: " "${GPU_TARGETS}") message(STATUS "Checking for ROCm support for GPU targets: " "${GPU_TARGETS}")
rocm_check_target_ids(SUPPORTED_GPUS TARGETS ${GPU_TARGETS}) rocm_check_target_ids(SUPPORTED_GPUS TARGETS ${GPU_TARGETS})
...@@ -45,53 +64,127 @@ else() ...@@ -45,53 +64,127 @@ else()
set(SUPPORTED_GPUS ${DEFAULT_GPUS}) set(SUPPORTED_GPUS ${DEFAULT_GPUS})
endif() endif()
set(COMPILING_TARGETS "${SUPPORTED_GPUS}" CACHE STRING "GPU targets to compile for.") set(GPU_TARGETS "${SUPPORTED_GPUS}")
message(STATUS "Compiling for ${COMPILING_TARGETS}") message(STATUS "Compiling for ${GPU_TARGETS}")
foreach(target ${COMPILING_TARGETS}) ## NOTE: Reload rocm-cmake in order to update GPU_TARGETS
list(APPEND static_link_flags --offload-arch=${target}) include(cmake/Dependencies.cmake) # Reloading to use desired GPU_TARGETS instead of defaults
endforeach()
list(JOIN static_link_flags " " flags_str)
set( CMAKE_CXX_FLAGS "${flags_str} ${CMAKE_CXX_FLAGS}")
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib") # Check for required dependencies
include_directories(${ROCM_PATH}/include) #==================================================================================================
find_library(IBVERBS_LIBRARY ibverbs) ## Try to establish ROCM_PATH (for find_package)
find_path(IBVERBS_INCLUDE_DIR infiniband/verbs.h) if(NOT DEFINED ROCM_PATH)
if (DEFINED ENV{DISABLE_NIC_EXEC}) # Guess default location
message(STATUS "Disabling NIC Executor support") set(ROCM_PATH "/opt/rocm")
elseif(IBVERBS_LIBRARY AND IBVERBS_INCLUDE_DIR) message(WARNING "Unable to find ROCM_PATH: Falling back to ${ROCM_PATH}")
message(STATUS "Found ibverbs: ${IBVERBS_LIBRARY}. Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable")
add_definitions(-DNIC_EXEC_ENABLED)
link_libraries(ibverbs)
else() else()
if (NOT IBVERBS_LIBRARY) message(STATUS "ROCM_PATH found: ${ROCM_PATH}")
message(WARNING "IBVerbs library not found") endif()
elseif (NOT IBVERBS_INCLUDE_DIR) set(ENV{ROCM_PATH} ${ROCM_PATH})
message(WARNING "infiniband/verbs.h not found")
## Set CMAKE flags
if (NOT DEFINED CMAKE_CXX_STANDARD)
set(CMAKE_CXX_STANDARD 17)
endif()
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
list(APPEND CMAKE_PREFIX_PATH # Add ROCM_PATH to CMake search paths for finding HIP / HSA
${ROCM_PATH}
${ROCM_PATH}/llvm
${ROCM_PATH}/hip
/opt/rocm
/opt/rocm/llvm
/opt/rocm/hip)
## Check for HIP
find_package(hip REQUIRED CONFIG PATHS ${CMAKE_PREFIX_PATH})
message(STATUS "HIP compiler: ${HIP_COMPILER}")
## Ensuring that CXX compiler meets expectations
if(NOT (("${CMAKE_CXX_COMPILER}" MATCHES ".*hipcc") OR ("${CMAKE_CXX_COMPILER}" MATCHES ".*clang\\+\\+")))
message(FATAL_ERROR "On ROCm platform 'hipcc' or HIP-aware Clang must be used as C++ compiler.")
endif()
## Check for Threads
# THREADS_PREFER_PTHREAD_FLAG is an input to the FindThreads module, so it has to be set
# before find_package(Threads) runs; setting it afterwards has no effect on Threads::Threads.
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
## Check for numa support
# Locates libnuma and numa.h and wraps them in an imported target named "numa".
# NUMA_LIBRARY / NUMA_INCLUDE_DIR are consumed unconditionally by the TransferBench target,
# so a missing library is reported here instead of surfacing later as a generic
# "variable set to NOTFOUND" generation error.
find_library(NUMA_LIBRARY numa)
find_path(NUMA_INCLUDE_DIR numa.h)
if(NUMA_LIBRARY AND NUMA_INCLUDE_DIR)
  add_library(numa SHARED IMPORTED)
  set_target_properties(numa PROPERTIES
    INTERFACE_INCLUDE_DIRECTORIES        "${NUMA_INCLUDE_DIR}"
    IMPORTED_LOCATION                    "${NUMA_LIBRARY}"
    INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${NUMA_INCLUDE_DIR}")
else()
  message(FATAL_ERROR
    "NUMA support not found (library: ${NUMA_LIBRARY}, include dir: ${NUMA_INCLUDE_DIR}). "
    "Install the libnuma development package (e.g. libnuma-dev or numactl-devel).")
endif()
## Check for hsa support
# Locates the HSA runtime library and hsa.h (searching under ROCM_PATH in addition to the
# default locations) and wraps them in an imported target named "hsa-runtime64".
# HSA_LIBRARY / HSA_INCLUDE_DIR are consumed unconditionally by the TransferBench target,
# so a missing runtime is reported here with an actionable message.
find_library(HSA_LIBRARY hsa-runtime64 PATHS ${ROCM_PATH} ${ROCM_PATH}/lib)
find_path(HSA_INCLUDE_DIR hsa.h PATHS ${ROCM_PATH}/include ${ROCM_PATH}/include/hsa)
if(HSA_LIBRARY AND HSA_INCLUDE_DIR)
  add_library(hsa-runtime64 SHARED IMPORTED)
  set_target_properties(hsa-runtime64 PROPERTIES
    INTERFACE_INCLUDE_DIRECTORIES        "${HSA_INCLUDE_DIR}"
    IMPORTED_LOCATION                    "${HSA_LIBRARY}"
    INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HSA_INCLUDE_DIR}")
else()
  message(FATAL_ERROR
    "HSA runtime not found (library: ${HSA_LIBRARY}, include dir: ${HSA_INCLUDE_DIR}). "
    "Check that ROCM_PATH (${ROCM_PATH}) points to a ROCm installation.")
endif()
## Check for infiniband verbs support
# The NIC executor is opt-in for CMake builds: it is only probed for when the ENABLE_NIC_EXEC
# CMake option is ON or the ENABLE_NIC_EXEC environment variable holds a true value.
# NOTE: testing `DEFINED ENABLE_NIC_EXEC` is not sufficient, because option(ENABLE_NIC_EXEC ...)
#       always defines the variable (default OFF), which would make the check always succeed.
# On success IBVERBS_FOUND is set to 1 and an imported target named "ibverbs" is created.
set(_nic_exec_env "$ENV{ENABLE_NIC_EXEC}")
if(ENABLE_NIC_EXEC OR _nic_exec_env)
  message(STATUS "For CMake builds, NIC executor also requires explicit opt-in by setting CMake flag -DENABLE_NIC_EXEC=1 or environment flag ENABLE_NIC_EXEC=1")
  find_library(IBVERBS_LIBRARY ibverbs)
  find_path(IBVERBS_INCLUDE_DIR infiniband/verbs.h)
  if(IBVERBS_LIBRARY AND IBVERBS_INCLUDE_DIR)
    add_library(ibverbs SHARED IMPORTED)
    set_target_properties(ibverbs PROPERTIES
      INTERFACE_INCLUDE_DIRECTORIES        "${IBVERBS_INCLUDE_DIR}"
      IMPORTED_LOCATION                    "${IBVERBS_LIBRARY}"
      INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${IBVERBS_INCLUDE_DIR}")
    set(IBVERBS_FOUND 1)
    # This build only honours ENABLE_NIC_EXEC, so the hint names that switch
    message(STATUS "Building with NIC executor support. Leave ENABLE_NIC_EXEC unset (or OFF) to disable")
  else()
    if(NOT IBVERBS_LIBRARY)
      message(WARNING "IBVerbs library not found")
    elseif(NOT IBVERBS_INCLUDE_DIR)
      message(WARNING "infiniband/verbs.h not found")
    endif()
    message(WARNING "Building without NIC executor support. To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed")
  endif()
else()
  message(STATUS "Disabling NIC Executor support")
  message(WARNING "For CMake builds, NIC executor requires explicit opt-in by setting ENABLE_NIC_EXEC=1")
endif()
unset(_nic_exec_env)
link_libraries(numa hsa-runtime64 pthread) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY .)
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY .)
add_executable(TransferBench src/client/Client.cpp) add_executable(TransferBench src/client/Client.cpp)
target_include_directories(TransferBench PRIVATE src/header src/client src/client/Presets)
find_package(ROCM 0.8 REQUIRED PATHS ${ROCM_PATH}) target_include_directories(TransferBench PRIVATE src/header)
include(ROCMInstallTargets) target_include_directories(TransferBench PRIVATE src/client)
include(ROCMCreatePackage) target_include_directories(TransferBench PRIVATE src/client/Presets)
set(ROCMCHECKS_WARN_TOOLCHAIN_VAR OFF) target_include_directories(TransferBench PRIVATE ${NUMA_INCLUDE_DIR})
target_include_directories(TransferBench PRIVATE ${HSA_INCLUDE_DIR})
if(IBVERBS_FOUND)
target_include_directories(TransferBench PRIVATE ${IBVERBS_INCLUDE_DIR})
target_link_libraries(TransferBench PRIVATE ${IBVERBS_LIBRARY})
target_compile_definitions(TransferBench PRIVATE NIC_EXEC_ENABLED)
endif()
set(PACKAGE_NAME TB) target_link_libraries(TransferBench PRIVATE -fgpu-rdc) # Required when linking relocatable device code
set(LIBRARY_NAME TransferBench) target_link_libraries(TransferBench PRIVATE Threads::Threads)
# Link the HIP runtime and system libraries into the executable.
# hip::host is linked PRIVATE: INTERFACE requirements only apply to consumers of a target,
# and nothing links against the TransferBench executable.
# CMAKE_DL_LIBS is CMake's portable name for the dlopen/dlclose library.
target_link_libraries(TransferBench PRIVATE
  hip::host
  hip::device
  ${CMAKE_DL_LIBS}
  "${NUMA_LIBRARY}"
  "${HSA_LIBRARY}")
rocm_install(TARGETS TransferBench COMPONENT devel) rocm_install(TARGETS TransferBench COMPONENT devel)
rocm_setup_version(VERSION ${VERSION_STRING})
rocm_package_add_dependencies(DEPENDS numactl hsa-rocr) # Package specific CPACK vars
rocm_package_add_dependencies(DEPENDS "numactl" "hsa-rocr")
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md")
set(CPACK_RPM_PACKAGE_LICENSE "MIT")
set(PACKAGE_NAME TB)
set(LIBRARY_NAME TransferBench)
rocm_create_package( rocm_create_package(
NAME ${LIBRARY_NAME} NAME ${LIBRARY_NAME}
DESCRIPTION "TransferBench package" DESCRIPTION "TransferBench package"
MAINTAINER "RCCL Team <gilbert.lee@amd.com>" MAINTAINER "RCCL Team <gilbert.lee@amd.com>"
) )
...@@ -6,57 +6,81 @@ ...@@ -6,57 +6,81 @@
ROCM_PATH ?= /opt/rocm ROCM_PATH ?= /opt/rocm
CUDA_PATH ?= /usr/local/cuda CUDA_PATH ?= /usr/local/cuda
HIPCC=$(ROCM_PATH)/bin/hipcc HIPCC ?= $(ROCM_PATH)/bin/amdclang++
NVCC=$(CUDA_PATH)/bin/nvcc NVCC ?= $(CUDA_PATH)/bin/nvcc
# Compile TransferBenchCuda if nvcc detected # This can be a space separated string of multiple GPU targets
ifeq ("$(shell test -e $(NVCC) && echo found)", "found") # Default is the native GPU target
EXE=TransferBenchCuda GPU_TARGETS ?= native
CXX=$(NVCC)
else DEBUG ?= 0
EXE=TransferBench
CXX=$(HIPCC) ifeq ($(filter clean,$(MAKECMDGOALS)),)
endif # Compile TransferBenchCuda if nvcc detected
ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
EXE=TransferBenchCuda
CXX=$(NVCC)
else
EXE=TransferBench
ifeq ("$(shell test -e $(HIPCC) && echo found)", "found")
CXX=$(HIPCC)
else ifeq ("$(shell test -e $(ROCM_PATH)/bin/hipcc && echo found)", "found")
CXX=$(ROCM_PATH)/bin/hipcc
$(warning "Could not find $(HIPCC). Using fallback to $(CXX)")
else
$(error "Could not find $(HIPCC) or $(ROCM_PATH)/bin/hipcc. Check if the path is correct if you want to build $(EXE)")
endif
GPU_TARGETS_FLAGS = $(foreach target,$(GPU_TARGETS),"--offload-arch=$(target)")
endif
CXXFLAGS = -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64 CXXFLAGS = -I$(ROCM_PATH)/include -I$(ROCM_PATH)/include/hip -I$(ROCM_PATH)/include/hsa
NVFLAGS = -x cu -lnuma -arch=native HIPLDFLAGS= -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64 -lamdhip64
COMMON_FLAGS = -O3 -I./src/header -I./src/client -I./src/client/Presets HIPFLAGS = -x hip -D__HIP_PLATFORM_AMD__ -D__HIPCC__ $(GPU_TARGETS_FLAGS)
LDFLAGS += -lpthread NVFLAGS = -x cu -lnuma -arch=native
# Compile RDMA executor if ifeq ($(DEBUG), 0)
# 1) DISABLE_NIC_EXEC is not set to 1 COMMON_FLAGS += -O3
# 2) IBVerbs is found in the Dynamic Linker cache
# 3) infiniband/verbs.h is found in the default include path
NIC_ENABLED = 0
ifneq ($(DISABLE_NIC_EXEC),1)
ifeq ("$(shell ldconfig -p | grep -c ibverbs)", "0")
$(info lib IBVerbs not found)
else ifeq ("$(shell echo '#include <infiniband/verbs.h>' | $(CXX) -E - 2>/dev/null | grep -c 'infiniband/verbs.h')", "0")
$(info infiniband/verbs.h not found)
else else
LDFLAGS += -libverbs -DNIC_EXEC_ENABLED COMMON_FLAGS += -O0 -g -ggdb3
NVFLAGS += -libverbs -DNIC_EXEC_ENABLED
NIC_ENABLED = 1
endif endif
ifeq ($(NIC_ENABLED), 0) COMMON_FLAGS += -I./src/header -I./src/client -I./src/client/Presets
$(info To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed)
LDFLAGS += -lpthread
# Compile RDMA executor if
# 1) DISABLE_NIC_EXEC is not set to 1
# 2) IBVerbs is found in the Dynamic Linker cache
# 3) infiniband/verbs.h is found in the default include path
DISABLE_NIC_EXEC ?= 0
ifneq ($(DISABLE_NIC_EXEC),1)
ifeq ("$(shell ldconfig -p | grep -c ibverbs)", "0")
$(info lib IBVerbs not found)
else ifeq ("$(shell echo '#include <infiniband/verbs.h>' | $(CXX) -E - 2>/dev/null | grep -c 'infiniband/verbs.h')", "0")
$(info infiniband/verbs.h not found)
else
CXXFLAGS += -DNIC_EXEC_ENABLED
LDFLAGS += -libverbs
NIC_ENABLED = 1
endif
ifeq ($(NIC_ENABLED), 0)
$(info Building without NIC executor support)
$(info To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed)
else
$(info Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable)
endif
endif endif
endif endif
.PHONY : all clean
all: $(EXE) all: $(EXE)
TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp") NicStatus TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
$(HIPCC) $(CXXFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS) $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) $(COMMON_FLAGS) $< -o $@ $(HIPLDFLAGS) $(LDFLAGS)
TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp") NicStatus TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
$(NVCC) $(NVFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS) $(NVCC) $(NVFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)
clean: clean:
rm -f *.o ./TransferBench ./TransferBenchCuda rm -f ./TransferBench ./TransferBenchCuda
NicStatus:
ifeq ($(NIC_ENABLED), 1)
$(info Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable)
else
$(info Building without NIC executor support)
endif
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# Test dependencies
include(FetchContent)
set(ROCM_WARN_TOOLCHAIN_VAR OFF CACHE BOOL "")
# Locate the rocm-cmake helper modules, in order of preference:
#   1. an installed ROCmCMakeBuildTools package,
#   2. the deprecated ROCM package,
#   3. a copy fetched from GitHub at tag ${rocm_cmake_tag}.
# Setting INSTALL_DEPENDENCIES skips the installed packages and forces the download.
find_package(ROCmCMakeBuildTools 0.11.0 CONFIG QUIET PATHS "${ROCM_PATH}")
if(ROCmCMakeBuildTools_FOUND AND NOT INSTALL_DEPENDENCIES)
  message(STATUS "Found ROCmCmakeBuildTools version: ${ROCmCMakeBuildTools_VERSION}")
else()
  message(STATUS "ROCmCMakeBuildTools not found. Checking for ROCM (deprecated)")
  # Deprecated fallback package name
  find_package(ROCM 0.7.3 CONFIG QUIET PATHS "${ROCM_PATH}")
  if(INSTALL_DEPENDENCIES OR NOT ROCM_FOUND)
    message(STATUS "ROCM (deprecated) not found. Downloading and building ROCmCMakeBuildTools")
    set(PROJECT_EXTERN_DIR "${CMAKE_CURRENT_BINARY_DIR}/extern")
    set(rocm_cmake_tag "rocm-6.4.0" CACHE STRING "rocm-cmake tag to download")

    # SOURCE_SUBDIR names a directory that does not exist, so the sources are downloaded
    # but not added to this build via add_subdirectory()
    FetchContent_Declare(
      rocm-cmake
      GIT_REPOSITORY https://github.com/ROCm/rocm-cmake.git
      GIT_TAG        ${rocm_cmake_tag}
      SOURCE_SUBDIR  "DISABLE ADDING TO BUILD"
    )
    FetchContent_MakeAvailable(rocm-cmake)
    message(STATUS "rocm-cmake_SOURCE_DIR: ${rocm-cmake_SOURCE_DIR}")

    # Use the freshly downloaded tree only
    find_package(ROCmCMakeBuildTools CONFIG REQUIRED NO_DEFAULT_PATH PATHS "${rocm-cmake_SOURCE_DIR}")
    message(STATUS "Found ROCmCmakeBuildTools version: ${ROCmCMakeBuildTools_VERSION}")
  endif()
endif()
# Find available local ROCM targets
# NOTE: This will eventually be part of ROCm-CMake and should be removed at that time
#
# Usage: rocm_local_targets(<out-var>)
#   Runs rocm_agent_enumerator and stores, in the caller's scope, the de-duplicated list of
#   agent names it prints (excluding the "gfx000" entry).
#   <out-var> is set to "NOTFOUND" when the tool cannot be located, exits with a non-zero
#   status, or reports no other agents.
function(rocm_local_targets VARIABLE)
  set(${VARIABLE} "NOTFOUND" PARENT_SCOPE)

  # HINTS directories are searched as given, so PATH_SUFFIXES is required to reach
  # <ROCM_PATH>/bin. The user's ROCM_PATH is listed first so it wins over /opt/rocm.
  find_program(_rocm_agent_enumerator rocm_agent_enumerator
    HINTS ENV ROCM_PATH /opt/rocm
    PATH_SUFFIXES bin)
  if(NOT _rocm_agent_enumerator)
    return()
  endif()

  execute_process(
    COMMAND "${_rocm_agent_enumerator}"
    RESULT_VARIABLE _found_agents
    OUTPUT_VARIABLE _rocm_agents
    ERROR_QUIET
    OUTPUT_STRIP_TRAILING_WHITESPACE
  )
  if(NOT _found_agents EQUAL 0)
    return()
  endif()

  # One agent per output line
  string(REPLACE "\n" ";" _rocm_agents "${_rocm_agents}")
  set(result "")
  foreach(agent IN LISTS _rocm_agents)
    string(STRIP "${agent}" agent)
    # Skip blank lines and the gfx000 entry
    if(NOT agent STREQUAL "" AND NOT agent STREQUAL "gfx000")
      list(APPEND result "${agent}")
    endif()
  endforeach()

  if(NOT "${result}" STREQUAL "")
    list(REMOVE_DUPLICATES result)
    set(${VARIABLE} "${result}" PARENT_SCOPE)
  endif()
endfunction()
include(ROCMSetupVersion)
include(ROCMCreatePackage)
include(ROCMInstallTargets)
include(ROCMPackageConfigHelpers)
include(ROCMInstallSymlinks)
include(ROCMCheckTargetIds)
include(ROCMClients)
include(ROCMHeaderWrapper)
...@@ -131,9 +131,8 @@ public: ...@@ -131,9 +131,8 @@ public:
int defaultGfxUnroll = 4; int defaultGfxUnroll = 4;
if (archName == "gfx906") defaultGfxUnroll = 8; if (archName == "gfx906") defaultGfxUnroll = 8;
else if (archName == "gfx90a") defaultGfxUnroll = 8; else if (archName == "gfx90a") defaultGfxUnroll = 8;
else if (archName == "gfx940") defaultGfxUnroll = 6;
else if (archName == "gfx941") defaultGfxUnroll = 6;
else if (archName == "gfx942") defaultGfxUnroll = 4; else if (archName == "gfx942") defaultGfxUnroll = 4;
else if (archName == "gfx950") defaultGfxUnroll = 4;
alwaysValidate = GetEnvVar("ALWAYS_VALIDATE" , 0); alwaysValidate = GetEnvVar("ALWAYS_VALIDATE" , 0);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256); blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
......
This diff is collapsed.
...@@ -66,7 +66,7 @@ namespace TransferBench ...@@ -66,7 +66,7 @@ namespace TransferBench
using std::set; using std::set;
using std::vector; using std::vector;
constexpr char VERSION[] = "1.62"; constexpr char VERSION[] = "1.63";
/** /**
* Enumeration of supported Executor types * Enumeration of supported Executor types
...@@ -516,7 +516,7 @@ namespace TransferBench ...@@ -516,7 +516,7 @@ namespace TransferBench
//========================================================================================== //==========================================================================================
// Macro for collecting CU/SM GFX kernel is running on // Macro for collecting CU/SM GFX kernel is running on
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1200__) || defined(__gfx1201__) #if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1150__) || defined(__gfx1151__) || defined(__gfx1200__) || defined(__gfx1201__)
#define GetHwId(hwId) hwId = 0 #define GetHwId(hwId) hwId = 0
#elif defined(__NVCC__) #elif defined(__NVCC__)
#define GetHwId(hwId) asm("mov.u32 %0, %smid;" : "=r"(hwId)) #define GetHwId(hwId) asm("mov.u32 %0, %smid;" : "=r"(hwId))
...@@ -525,7 +525,7 @@ namespace TransferBench ...@@ -525,7 +525,7 @@ namespace TransferBench
#endif #endif
// Macro for collecting XCC GFX kernel is running on // Macro for collecting XCC GFX kernel is running on
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__) #if defined(__gfx942__) || defined(__gfx950__)
#define GetXccId(val) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val)); #define GetXccId(val) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val));
#else #else
#define GetXccId(val) val = 0 #define GetXccId(val) val = 0
...@@ -755,7 +755,7 @@ namespace { ...@@ -755,7 +755,7 @@ namespace {
#if defined (__NVCC__) #if defined (__NVCC__)
return {ERR_FATAL, "Fine-grained CPU memory not supported on NVIDIA platform"}; return {ERR_FATAL, "Fine-grained CPU memory not supported on NVIDIA platform"};
#else #else
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser)); ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocCoherent));
#endif #endif
} else if (memType == MEM_CPU || memType == MEM_CPU_CLOSEST) { } else if (memType == MEM_CPU || memType == MEM_CPU_CLOSEST) {
#if defined (__NVCC__) #if defined (__NVCC__)
...@@ -895,6 +895,8 @@ namespace { ...@@ -895,6 +895,8 @@ namespace {
// Get the hsa_agent_t associated with a MemDevice // Get the hsa_agent_t associated with a MemDevice
static ErrResult GetHsaAgent(MemDevice const& memDevice, hsa_agent_t& agent) static ErrResult GetHsaAgent(MemDevice const& memDevice, hsa_agent_t& agent)
{ {
if (memDevice.memType == MEM_CPU_CLOSEST)
return GetHsaAgent({EXE_CPU, GetClosestCpuNumaToGpu(memDevice.memIndex)}, agent);
if (IsCpuMemType(memDevice.memType)) return GetHsaAgent({EXE_CPU, memDevice.memIndex}, agent); if (IsCpuMemType(memDevice.memType)) return GetHsaAgent({EXE_CPU, memDevice.memIndex}, agent);
if (IsGpuMemType(memDevice.memType)) return GetHsaAgent({EXE_GPU_GFX, memDevice.memIndex}, agent); if (IsGpuMemType(memDevice.memType)) return GetHsaAgent({EXE_GPU_GFX, memDevice.memIndex}, agent);
return {ERR_FATAL, return {ERR_FATAL,
...@@ -1191,17 +1193,20 @@ namespace { ...@@ -1191,17 +1193,20 @@ namespace {
if (err.errType == ERR_FATAL) break; if (err.errType == ERR_FATAL) break;
} }
uint32_t engineIdMask = 0; // Skip check of engine Id mask for self copies
err = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &engineIdMask); if (srcAgent.handle != dstAgent.handle) {
if (err.errType != ERR_NONE) { uint32_t engineIdMask = 0;
errors.push_back(err); err = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &engineIdMask);
if (err.errType == ERR_FATAL) break; if (err.errType != ERR_NONE) {
} errors.push_back(err);
hsa_amd_sdma_engine_id_t sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << t.exeSubIndex); if (err.errType == ERR_FATAL) break;
if (!(sdmaEngineId & engineIdMask)) { }
errors.push_back({ERR_FATAL, hsa_amd_sdma_engine_id_t sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << t.exeSubIndex);
"Transfer %d: DMA %d.%d does not exist or cannot copy between src/dst", if (!(sdmaEngineId & engineIdMask)) {
i, t.exeDevice.exeIndex, t.exeSubIndex}); errors.push_back({ERR_FATAL,
"Transfer %d: DMA %d.%d does not exist or cannot copy between src/dst",
i, t.exeDevice.exeIndex, t.exeSubIndex});
}
} }
#endif #endif
} }
...@@ -2624,7 +2629,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -2624,7 +2629,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
childThreads.clear(); childThreads.clear();
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0; double deltaMsec = (std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0) / cfg.general.numSubIterations;
if (iteration >= 0) { if (iteration >= 0) {
rss.totalDurationMsec += deltaMsec; rss.totalDurationMsec += deltaMsec;
...@@ -2654,7 +2659,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -2654,7 +2659,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
asyncTransfer.join(); asyncTransfer.join();
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0; double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) if (iteration >= 0)
exeInfo.totalDurationMsec += deltaMsec; exeInfo.totalDurationMsec += deltaMsec;
return ERR_NONE; return ERR_NONE;
...@@ -2692,20 +2698,24 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -2692,20 +2698,24 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
if (numaNode != -1) if (numaNode != -1)
numa_run_on_node(numaNode); numa_run_on_node(numaNode);
} }
auto transferCount = exeInfo.resources.size();
std::vector<double> totalTimeMsec(transferCount, 0.0);
int subIterations = 0; int subIterations = 0;
auto cpuStart = std::chrono::high_resolution_clock::now();
std::vector<std::chrono::high_resolution_clock::time_point> transferTimers(transferCount);
do { do {
auto cpuStart = std::chrono::high_resolution_clock::now(); std::vector<uint8_t> receivedQPs(transferCount, 0);
size_t completedTransfers = 0;
auto transferCount = exeInfo.resources.size();
std::vector<uint8_t> receivedQPs(transferCount);
std::vector<std::chrono::high_resolution_clock::time_point> transferTimers(transferCount);
// post the sends // post the sends
for (auto i = 0; i < transferCount; i++) { for (auto i = 0; i < transferCount; i++) {
transferTimers[i] = std::chrono::high_resolution_clock::now(); transferTimers[i] = std::chrono::high_resolution_clock::now();
ERR_CHECK(ExecuteNicTransfer(iteration, cfg, exeIndex, exeInfo.resources[i])); ERR_CHECK(ExecuteNicTransfer(iteration, cfg, exeIndex, exeInfo.resources[i]));
} }
// poll for completions // poll for completions
do { size_t completedTransfers = 0;
while (completedTransfers < transferCount) {
for (auto i = 0; i < transferCount; i++) { for (auto i = 0; i < transferCount; i++) {
if(receivedQPs[i] < exeInfo.resources[i].qpCount) { if(receivedQPs[i] < exeInfo.resources[i].qpCount) {
auto& rss = exeInfo.resources[i]; auto& rss = exeInfo.resources[i];
...@@ -2725,20 +2735,28 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -2725,20 +2735,28 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
auto cpuDelta = std::chrono::high_resolution_clock::now() - transferTimers[i]; auto cpuDelta = std::chrono::high_resolution_clock::now() - transferTimers[i];
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0; double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0) { if (iteration >= 0) {
rss.totalDurationMsec += deltaMsec; totalTimeMsec[i] += deltaMsec;
if (cfg.general.recordPerIteration)
rss.perIterMsec.push_back(deltaMsec);
} }
completedTransfers++; completedTransfers++;
} }
} }
} }
} while(completedTransfers < transferCount); }
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0)
exeInfo.totalDurationMsec += deltaMsec;
} while(++subIterations < cfg.general.numSubIterations); } while(++subIterations < cfg.general.numSubIterations);
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) {
exeInfo.totalDurationMsec += deltaMsec;
for (int i = 0; i < transferCount; i++) {
auto& rss = exeInfo.resources[i];
double transferTimeMsec = totalTimeMsec[i] / cfg.general.numSubIterations;
rss.totalDurationMsec += transferTimeMsec;
if (cfg.general.recordPerIteration)
rss.perIterMsec.push_back(transferTimeMsec);
}
}
return ERR_NONE; return ERR_NONE;
} }
#endif #endif
...@@ -3077,14 +3095,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3077,14 +3095,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
ERR_CHECK(hipStreamSynchronize(stream)); ERR_CHECK(hipStreamSynchronize(stream));
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0; double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) { if (iteration >= 0) {
double deltaMsec = cpuDeltaMsec; double deltaMsec = cpuDeltaMsec;
if (startEvent != NULL) { if (startEvent != NULL) {
float gpuDeltaMsec; float gpuDeltaMsec;
ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent)); ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
deltaMsec = gpuDeltaMsec; deltaMsec = gpuDeltaMsec / cfg.general.numSubIterations;
} }
rss.totalDurationMsec += deltaMsec; rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) { if (cfg.general.recordPerIteration) {
...@@ -3154,12 +3172,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3154,12 +3172,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
ERR_CHECK(hipStreamSynchronize(stream)); ERR_CHECK(hipStreamSynchronize(stream));
} }
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0; double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0
/ cfg.general.numSubIterations;
if (iteration >= 0) { if (iteration >= 0) {
if (cfg.gfx.useHipEvents && !cfg.gfx.useMultiStream) { if (cfg.gfx.useHipEvents && !cfg.gfx.useMultiStream) {
float gpuDeltaMsec; float gpuDeltaMsec;
ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, exeInfo.startEvents[0], exeInfo.stopEvents[0])); ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, exeInfo.startEvents[0], exeInfo.stopEvents[0]));
gpuDeltaMsec /= cfg.general.numSubIterations;
exeInfo.totalDurationMsec += gpuDeltaMsec; exeInfo.totalDurationMsec += gpuDeltaMsec;
} else { } else {
exeInfo.totalDurationMsec += cpuDeltaMsec; exeInfo.totalDurationMsec += cpuDeltaMsec;
...@@ -3182,7 +3202,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3182,7 +3202,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
} }
} }
double deltaMsec = (maxStopCycle - minStartCycle) / (double)(exeInfo.wallClockRate); double deltaMsec = (maxStopCycle - minStartCycle) / (double)(exeInfo.wallClockRate);
deltaMsec /= cfg.general.numSubIterations;
rss.totalDurationMsec += deltaMsec; rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) { if (cfg.general.recordPerIteration) {
rss.perIterMsec.push_back(deltaMsec); rss.perIterMsec.push_back(deltaMsec);
...@@ -3249,14 +3269,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3249,14 +3269,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
#endif #endif
} }
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0; double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) { if (iteration >= 0) {
double deltaMsec = cpuDeltaMsec; double deltaMsec = cpuDeltaMsec;
if (!useSubIndices && !cfg.dma.useHsaCopy && cfg.dma.useHipEvents) { if (!useSubIndices && !cfg.dma.useHsaCopy && cfg.dma.useHipEvents) {
float gpuDeltaMsec; float gpuDeltaMsec;
ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent)); ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
deltaMsec = gpuDeltaMsec; deltaMsec = gpuDeltaMsec / cfg.general.numSubIterations;
} }
resources.totalDurationMsec += deltaMsec; resources.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) if (cfg.general.recordPerIteration)
...@@ -3291,7 +3311,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3291,7 +3311,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
ERR_CHECK(asyncTransfer.get()); ERR_CHECK(asyncTransfer.get());
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0; double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) if (iteration >= 0)
exeInfo.totalDurationMsec += deltaMsec; exeInfo.totalDurationMsec += deltaMsec;
return ERR_NONE; return ERR_NONE;
...@@ -3493,7 +3513,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3493,7 +3513,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Stop CPU timing for this iteration // Stop CPU timing for this iteration
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count(); double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() / cfg.general.numSubIterations;
if (cfg.data.alwaysValidate) { if (cfg.data.alwaysValidate) {
ERR_APPEND(ValidateAllTransfers(cfg, transfers, transferResources, dstReference, outputBuffer), ERR_APPEND(ValidateAllTransfers(cfg, transfers, transferResources, dstReference, outputBuffer),
...@@ -3528,7 +3548,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3528,7 +3548,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
results.tfrResults.resize(transfers.size()); results.tfrResults.resize(transfers.size());
results.numTimedIterations = numTimedIterations; results.numTimedIterations = numTimedIterations;
results.totalBytesTransferred = 0; results.totalBytesTransferred = 0;
results.avgTotalDurationMsec = (totalCpuTimeSec * 1000.0) / (numTimedIterations * cfg.general.numSubIterations); results.avgTotalDurationMsec = (totalCpuTimeSec * 1000.0) / numTimedIterations;
results.overheadMsec = results.avgTotalDurationMsec; results.overheadMsec = results.avgTotalDurationMsec;
for (auto& exeInfoPair : executorMap) { for (auto& exeInfoPair : executorMap) {
ExeDevice const& exeDevice = exeInfoPair.first; ExeDevice const& exeDevice = exeInfoPair.first;
...@@ -3537,7 +3557,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3537,7 +3557,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Copy over executor results // Copy over executor results
ExeResult& exeResult = results.exeResults[exeDevice]; ExeResult& exeResult = results.exeResults[exeDevice];
exeResult.numBytes = exeInfo.totalBytes; exeResult.numBytes = exeInfo.totalBytes;
exeResult.avgDurationMsec = exeInfo.totalDurationMsec / (numTimedIterations * cfg.general.numSubIterations); exeResult.avgDurationMsec = exeInfo.totalDurationMsec / numTimedIterations;
exeResult.avgBandwidthGbPerSec = (exeResult.numBytes / 1.0e6) / exeResult.avgDurationMsec; exeResult.avgBandwidthGbPerSec = (exeResult.numBytes / 1.0e6) / exeResult.avgDurationMsec;
exeResult.sumBandwidthGbPerSec = 0.0; exeResult.sumBandwidthGbPerSec = 0.0;
exeResult.transferIdx.clear(); exeResult.transferIdx.clear();
......
# Locate the ROCm installation.
# ROCM_PATH is a cache entry: a value already present in the cache (for example
# from -DROCM_PATH=<dir> on the command line) is kept as-is. Otherwise the
# ROCM_PATH environment variable is used, falling back to /opt/rocm.
if(DEFINED ENV{ROCM_PATH})
  set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE PATH "Path to the ROCm installation.")
else()
  set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to the ROCm installation.")
endif()

# Derive the tool directory from the resolved ROCM_PATH rather than from the
# environment or a hard-coded default, so that rocm_bin always agrees with the
# ROCM_PATH value used by the rest of the build.
set(rocm_bin "${ROCM_PATH}/bin")
# Choose the C++ compiler.
# An explicit CXX environment variable always wins. Otherwise the ROCm tree is
# probed in order of preference:
#   1. ${rocm_bin}/amdclang++
#   2. ${ROCM_PATH}/llvm/bin/amdclang++
#   3. ${ROCM_PATH}/llvm/bin/clang++
# When the compiler is found under llvm/bin, rocm_bin is re-pointed there.
# The cache entry is a FILEPATH because it names a file, not a directory.
if(DEFINED ENV{CXX})
  set(CMAKE_CXX_COMPILER "$ENV{CXX}" CACHE FILEPATH "Path to the C++ compiler")
elseif(EXISTS "${rocm_bin}/amdclang++")
  set(CMAKE_CXX_COMPILER "${rocm_bin}/amdclang++" CACHE FILEPATH "Path to the C++ compiler")
elseif(EXISTS "${ROCM_PATH}/llvm/bin/amdclang++")
  set(rocm_bin "${ROCM_PATH}/llvm/bin")
  set(CMAKE_CXX_COMPILER "${rocm_bin}/amdclang++" CACHE FILEPATH "Path to the C++ compiler")
elseif(EXISTS "${ROCM_PATH}/llvm/bin/clang++")
  set(rocm_bin "${ROCM_PATH}/llvm/bin")
  set(CMAKE_CXX_COMPILER "${rocm_bin}/clang++" CACHE FILEPATH "Path to the C++ compiler")
else()
  # Nothing was found; make the fall-through visible instead of silent.
  message(STATUS "No amdclang++ or clang++ found under '${ROCM_PATH}'; leaving CMAKE_CXX_COMPILER unchanged.")
endif()
# Per-configuration compiler flag defaults. They are applied only when the
# CXXFLAGS environment variable is not defined at configure time.
if(NOT DEFINED ENV{CXXFLAGS})
  set(CMAKE_CXX_FLAGS_RELEASE "-O3")
  set(CMAKE_CXX_FLAGS_DEBUG "-g -O1")
endif()
# Default to a Release build when no build type was requested.
# CMAKE_BUILD_TYPE only has meaning for single-config generators, so the
# default is skipped when the generator reports itself as multi-config.
# (On CMake versions without this property the query yields an empty value and
# the default is applied as before.)
get_property(is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
if(NOT is_multi_config AND NOT CMAKE_BUILD_TYPE)
  message(STATUS "Setting build type to 'Release' as none was specified.")
  # FORCE is needed here to replace an empty cache entry left by a previous run.
  set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE)
  # Offer the standard choices in cmake-gui / ccmake.
  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment