Commit 09f4f11b authored by Oliveira, Daniel's avatar Oliveira, Daniel
Browse files

impr: Library/Client build organization



Change code organization and build options

Code changes related to the following:
  * Build files
    * Options to build client, shared, and static libraries
  * Source code directories
  * Modern C++20 changes
  * Based on TB 1.6.4
  * Formatting
Signed-off-by: default avatarOliveira, Daniel <daniel.oliveira@amd.com>
parent 2d0ecaae
---
## Refer to the following link for the explanation of each parameter:
# See options here:
# https://clang.llvm.org/docs/ClangFormatStyleOptions.html
# Last update: Tue Jun 28 06:26:41 PM CDT 2022 (daniel.oliveira@amd.com / dmitrii.galantsev@amd.com)
#
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: 4
IndentAccessModifiers: true
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: true
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: true
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
# This is deprecated
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: Yes
BinPackArguments: false
BinPackParameters: false
InsertBraces: true
#BreakBeforeBraces: Attach
BreakBeforeBraces: Custom
BraceWrapping:
AfterClass: true
AfterControlStatement: false
AfterEnum: true
AfterFunction: true
AfterNamespace: true
AfterObjCDeclaration: false
AfterStruct: true
AfterUnion: true
AfterExternBlock: true
BeforeCatch: false
BeforeElse: false
IndentBraces: false
# disabling the below splits, else, they'll just add to the vertical length of source files!
SplitEmptyFunction: false
SplitEmptyRecord: false
SplitEmptyNamespace: false
BreakBeforeBinaryOperators: None
BreakBeforeInheritanceComma: false
BreakInheritanceList: BeforeColon
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 100
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: true
# Kept the below 4 to be the same as `IndentWidth` to keep everything uniform
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
PointerAlignment: Left
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
- foreach
- Q_FOREACH
- BOOST_FOREACH
IncludeBlocks: Preserve
IncludeCategories:
- Regex: '^<ext/.*\.h>'
Priority: 2
- Regex: '^<.*\.h>'
Priority: 1
- Regex: '^<.*'
Priority: 2
- Regex: '.*'
Priority: 3
IncludeIsMainRegex: '([-_](test|unittest))?$'
IndentCaseLabels: true
IndentPPDirectives: None
IndentWidth: 4
UseTab: Never
# Be consistent with indent-width, even for people who use tab for indentation!
TabWidth: 4
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 4
NamespaceIndentation: None
ObjCBinPackProtocolList: Never
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PenaltyBreakAssignment: 2100
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
RawStringFormats:
- Language: Cpp
Delimiters:
- cc
- CC
- cpp
- Cpp
- CPP
- 'c++'
- 'C++'
CanonicalDelimiter: ''
- Language: TextProto
Delimiters:
- pb
- PB
- proto
- PROTO
EnclosingFunctions:
- EqualsProto
- EquivToProto
- PARSE_PARTIAL_TEXT_PROTO
- PARSE_TEST_PROTO
- PARSE_TEXT_PROTO
- ParseTextOrDie
- ParseTextProtoOrDie
CanonicalDelimiter: ''
BasedOnStyle: google
# NOTE(review): this warning says comment reflow mangles doxygen comment formatting,
# yet ReflowComments is enabled on the next line — confirm which is intended.
ReflowComments: true
SortIncludes: true
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 4
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
StatementMacros:
- Q_UNUSED
- QT_REQUIRE_VERSION
##
# Good defaults
# Checks:
# -*, bugprone-*,cert-dcl21-cpp,cert-dcl50-cpp,cert-env33-c,cert-err34-c,cert-err52-cpp,cert-err60-cpp,cert-flp30-c,cert-msc50-cpp,cert-msc51-cpp,cppcoreguidelines-*,-cppcoreguidelines-macro-usage,-cppcoreguidelines-pro-type-reinterpret-cast,-cppcoreguidelines-pro-type-union-access,-cppcoreguidelines-pro-bounds-pointer-arithmetic,-cppcoreguidelines-pro-type-vararg,google-build-using-namespace,google-explicit-constructor,google-global-names-in-headers,google-readability-casting,google-runtime-int,google-runtime-operator,hicpp-*,-hicpp-vararg,misc-*,modernize-*,performance-*,readability-*,-readability-named-parameter
Checks:
-*,
bugprone-*,
clang-analyzer*,
google-*,
hicpp-*,
misc-*,
modernize-*,
performance-*,
readability-*
...@@ -3,6 +3,40 @@ ...@@ -3,6 +3,40 @@
Documentation for TransferBench is available at Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench). [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.64.00
### Added
- Added BLOCKSIZES to a2asweep preset to also allow sweeping over threadblock sizes
- Added FILL_COMPRESS to allow more control over input data pattern
- FILL_COMPRESS takes in a comma-separated list of integer percentages (that must add up to 100)
that sets the percentages of 64B lines to be filled by random/1B0/2B0/4B0/32B0 data patterns
- Bins:
- 0 - random
- 1 - 1B0 upper 1 byte of each aligned 2 bytes is 0
- 2 - 2B0 upper 2 bytes of each aligned 4 bytes is 0
- 3 - 4B0 upper 4 bytes of each aligned 8 bytes is 0
- 4 - 32B0 upper 32 bytes of each aligned 64-byte line are 0
- FILL_PATTERN will be ignored if FILL_COMPRESS is specified
- Additional details about data patterns generated will be printed if the debug env var DUMP_LINES is
set to a non-zero value, which also corresponds to how many 64 byte lines will be printed
### Modified
- Increased GFX_BLOCKSIZE limit from 512 to 1024 (still requires multiple of 64)
### Fixed
- Fixed bug when using BYTE_OFFSET
## v1.63.00
### Added
- Added `gfx950`, `gfx1150`, and `gfx1151` to default GPU targets list in CMake builds
### Modified
- Removing self-GPU check for DMA engine copies
- Switched to amdclang++ as primary compiler
- healthcheck preset adds HBM testing and support for more MI3XX variants
### Fixed
- Fixed issue when using "P" memory type and specific DMA subengines
- Fixed issue with subiteration timing reports
## v1.62.00 ## v1.62.00
### Added ### Added
- Adding GFX_TEMPORAL to allow for use for use of non-temporal loads/stores - Adding GFX_TEMPORAL to allow for use for use of non-temporal loads/stores
......
# Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. # MIT License
#
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
#
if (DEFINED ENV{ROCM_PATH}) #
set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE STRING "ROCm install directory") cmake_minimum_required(VERSION 3.25)
else()
set(ROCM_PATH "/opt/rocm" CACHE STRING "ROCm install directory") #
endif() # Find git and set the GIT_EXECUTABLE variable (good practice, used by submodules)
cmake_minimum_required(VERSION 3.5) find_package(Git QUIET)
if(NOT GIT_FOUND)
project(TransferBench VERSION 1.62.00 LANGUAGES CXX) message(FATAL_ERROR ">> Git is required to build this project. 'git' not found! ")
# Default GPU architectures to build
#==================================================================================================
set(DEFAULT_GPUS
gfx906
gfx908
gfx90a
gfx942
gfx1030
gfx1100
gfx1101
gfx1102
gfx1200
gfx1201)
# Build only for local GPU architecture
if (BUILD_LOCAL_GPU_TARGET_ONLY)
message(STATUS "Building only for local GPU target")
if (COMMAND rocm_local_targets)
rocm_local_targets(DEFAULT_GPUS)
else()
message(WARNING "Unable to determine local GPU targets. Falling back to default GPUs.")
endif()
endif() endif()
set(CMAKE_GIT_EXECUTABLE ${GIT_EXECUTABLE} CACHE FILEPATH "Path to the Git executable.")
#
# Flag to enable / disable verbose output.
set(CMAKE_VERBOSE_MAKEFILE ON)
#
# Export compile commands for linters and auto-completers
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
#
# C++ standard settings
set(CMAKE_CXX_STANDARD 20 CACHE STRING "C++ Standard in use")
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_INCLUDE_DIRECTORIES_BEFORE ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
#
# Project specific directories
set(AMD_PROJECT_BASE_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} CACHE STRING "Base directory for this project" FORCE)
set(AMD_PROJECT_CMAKE_DIRECTORY ${AMD_PROJECT_BASE_DIRECTORY}/cmake)
set(AMD_PROJECT_CMAKE_MODULES_DIRECTORY ${AMD_PROJECT_BASE_DIRECTORY}/cmake/modules)
list(APPEND CMAKE_MODULE_PATH "${AMD_PROJECT_CMAKE_MODULES_DIRECTORY}")
#
# Build options
option(TRANSFERBENCH_ENGINE_HEADER_ONLY "Make TB engine header-only available (interface target)" OFF)
option(TRANSFERBENCH_ENGINE_STATIC "Build TB static library" ON)
option(TRANSFERBENCH_ENGINE_SHARED "Build TB shared library" OFF)
option(TRANSFERBENCH_CLIENT "Build TransferBench client" ON)
option(TRANSFERBENCH_TREAT_WARNINGS_AS_ERRORS "Treat default warnings as errors" ON)
option(TRANSFERBENCH_COMPRESS_DEBUG_INFO "Compressed debug information" ON)
option(TRANSFERBENCH_LOCAL_GPU_TARGET_ONLY "Build only for GPUs detected on this machine" OFF)
option(TRANSFERBENCH_ENABLE_NIC_EXEC "Enable RDMA NIC Executor in TransferBench" OFF)
option(TRANSFERBENCH_HARDENING_ENABLED "Build the project with hardening flags" ON)
option(TRANSFERBENCH_LINKER_TRY_MOLD "Give preference to 'Mold' linker (faster) if available" ON)
option(TRANSFERBENCH_ENABLE_CPPCHECK_WARNINGS "CppCheck static analysis warnings (for Developers)" ON)
# Determine which GPU architectures to build for
set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.")
# Check if clang compiler can offload to GPU_TARGETS #
if (COMMAND rocm_check_target_ids) # Setup build utils
message(STATUS "Checking for ROCm support for GPU targets: " "${GPU_TARGETS}") include(${AMD_PROJECT_CMAKE_DIRECTORY}/build_utils.cmake)
rocm_check_target_ids(SUPPORTED_GPUS TARGETS ${GPU_TARGETS})
#
# CMake project info
set(AMD_PROJECT_NAME "TransferBench")
set(AMD_PROJECT_PACKAGE_NAME "transferbench")
set(AMD_PROJECT_LIBRARY_NAME "amd_tb")
set(AMD_PROJECT_DESCRIPTION "TransferBench utility")
set(AMD_PROJECT_AUTHOR_ORGANIZATION "AMD, Inc.")
set(AMD_PROJECT_GITHUB_REPO "https://github.com/ROCm/TransferBench")
set(AMD_PROJECT_AUTHOR_DOMAIN "https://www.amd.com")
set(AMD_PROJECT_VERSION_MAJOR "")
set(AMD_PROJECT_VERSION_MINOR "")
set(AMD_PROJECT_VERSION_PATCH "")
#
# Specify name of project to build, and validate requirements
setup_build_version(PROJECT_TARGET_VERSION PROJECT_TARGET_VERSION_TEXT)
set_variable_in_parent(PROJECT_TARGET_BINARY_VERSION ${PROJECT_TARGET_VERSION})
setup_cmake(AMD_PROJECT_NAME PROJECT_TARGET_BINARY_VERSION)
if(PROJECT_TARGET_VERSION AND ((AMD_PROJECT_VERSION_MAJOR STREQUAL "") OR (AMD_PROJECT_VERSION_MINOR STREQUAL "") OR (AMD_PROJECT_VERSION_PATCH STREQUAL "")))
string(REGEX REPLACE "([0-9]+)\\.([0-9]+)\\.([0-9]+)" "\\1" AMD_PROJECT_VERSION_MAJOR ${PROJECT_TARGET_VERSION})
string(REGEX REPLACE "([0-9]+)\\.([0-9]+)\\.([0-9]+)" "\\2" AMD_PROJECT_VERSION_MINOR ${PROJECT_TARGET_VERSION})
string(REGEX REPLACE "([0-9]+)\\.([0-9]+)\\.([0-9]+)" "\\3" AMD_PROJECT_VERSION_PATCH ${PROJECT_TARGET_VERSION})
else() else()
message(WARNING "Unable to check for supported GPU targets. Falling back to default GPUs.") message(FATAL_ERROR ">> 'PROJECT_TARGET_VERSION' was not properly set!")
set(SUPPORTED_GPUS ${DEFAULT_GPUS})
endif() endif()
set(AMD_PROJECT_VERSION_FULL "${AMD_PROJECT_VERSION_MAJOR}.${AMD_PROJECT_VERSION_MINOR}.${AMD_PROJECT_VERSION_PATCH}")
#
# Project information
project(${AMD_PROJECT_NAME}
VERSION ${PROJECT_TARGET_VERSION_TEXT}
LANGUAGES CXX
DESCRIPTION ${AMD_PROJECT_DESCRIPTION}
HOMEPAGE_URL ${AMD_PROJECT_GITHUB_REPO}
)
set(COMPILING_TARGETS "${SUPPORTED_GPUS}" CACHE STRING "GPU targets to compile for.") if(CMAKE_SOURCE_DIR STREQUAL PROJECT_SOURCE_DIR)
message(STATUS "Compiling for ${COMPILING_TARGETS}") set(TRANSFERBENCH_IS_TOP_LEVEL TRUE)
foreach(target ${COMPILING_TARGETS})
list(APPEND static_link_flags --offload-arch=${target})
endforeach()
list(JOIN static_link_flags " " flags_str)
set( CMAKE_CXX_FLAGS "${flags_str} ${CMAKE_CXX_FLAGS}")
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib")
include_directories(${ROCM_PATH}/include)
find_library(IBVERBS_LIBRARY ibverbs)
find_path(IBVERBS_INCLUDE_DIR infiniband/verbs.h)
if (DEFINED ENV{DISABLE_NIC_EXEC})
message(STATUS "Disabling NIC Executor support")
elseif(IBVERBS_LIBRARY AND IBVERBS_INCLUDE_DIR)
message(STATUS "Found ibverbs: ${IBVERBS_LIBRARY}. Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable")
add_definitions(-DNIC_EXEC_ENABLED)
link_libraries(ibverbs)
else() else()
if (NOT IBVERBS_LIBRARY) set(TRANSFERBENCH_IS_TOP_LEVEL FALSE)
message(WARNING "IBVerbs library not found")
elseif (NOT IBVERBS_INCLUDE_DIR)
message(WARNING "infiniband/verbs.h not found")
endif()
message(WARNING "Building without NIC executor support. To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed")
endif() endif()
link_libraries(numa hsa-runtime64 pthread)
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY .)
add_executable(TransferBench src/client/Client.cpp)
target_include_directories(TransferBench PRIVATE src/header src/client src/client/Presets)
find_package(ROCM 0.8 REQUIRED PATHS ${ROCM_PATH}) #
include(ROCMInstallTargets) # Set the ROCm base path
include(ROCMCreatePackage) setup_rocm_requirements()
set(ROCMCHECKS_WARN_TOOLCHAIN_VAR OFF) add_build_definitions()
#
# Build info
message(STATUS "[[ Building Project: " ${AMD_PROJECT_NAME} " v." ${PROJECT_TARGET_VERSION_TEXT} " { Build: " ${CMAKE_BUILD_TYPE} "} ]] ...")
set(PACKAGE_NAME TB)
set(LIBRARY_NAME TransferBench)
rocm_install(TARGETS TransferBench COMPONENT devel) #
# --- Auto Dependent Build options ---
include(CMakeDependentOption)
cmake_dependent_option(
AUTO_BUILD_STATIC_FOR_TB_CLIENT
"TRANSFERBENCH_ENGINE_STATIC build auto-enabled for TB client"
ON
"TRANSFERBENCH_CLIENT AND NOT (TRANSFERBENCH_ENGINE_HEADER_ONLY OR TRANSFERBENCH_ENGINE_SHARED OR TRANSFERBENCH_ENGINE_STATIC)"
OFF
)
rocm_package_add_dependencies(DEPENDS numactl hsa-rocr) if(AUTO_BUILD_STATIC_FOR_TB_CLIENT)
set(TRANSFERBENCH_ENGINE_STATIC ON CACHE BOOL "Build TB static library" FORCE)
endif()
rocm_create_package(
NAME ${LIBRARY_NAME} cmake_dependent_option(
DESCRIPTION "TransferBench package" AUTO_BUILD_TB_LIBRARY
MAINTAINER "RCCL Team <gilbert.lee@amd.com>" "Auto-enabled if any TB library option is set"
ON
"TRANSFERBENCH_ENGINE_HEADER_ONLY OR TRANSFERBENCH_ENGINE_SHARED OR TRANSFERBENCH_ENGINE_STATIC"
OFF
) )
if(AUTO_BUILD_TB_LIBRARY)
set(TRANSFERBENCH_ENGINE_REQUIRED ON CACHE BOOL "TB engine is required" FORCE)
endif()
# Validate conflicting build options
if(TRANSFERBENCH_CLIENT AND TRANSFERBENCH_ENGINE_HEADER_ONLY AND (TRANSFERBENCH_ENGINE_SHARED OR TRANSFERBENCH_ENGINE_STATIC))
message(FATAL_ERROR ">> Conflicting build options: CLIENT cannot be built with: HEADER_ONLY and STATIC or SHARED! ")
endif()
# ---
if(TRANSFERBENCH_ENGINE_REQUIRED)
add_subdirectory(deps/tbengine)
endif()
if(TRANSFERBENCH_CLIENT)
add_subdirectory(client)
endif()
#
# Packaging
if(TRANSFERBENCH_ENGINE_REQUIRED OR TRANSFERBENCH_CLIENT)
##rocm_install(TARGETS ${AMD_PROJECT_NAME} COMPONENT devel)
rocm_package_add_dependencies(DEPENDS "numactl" "hsa-rocr")
rocm_setup_version(VERSION ${VERSION_STRING})
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md")
set(CPACK_RPM_PACKAGE_LICENSE "MIT")
rocm_create_package(
NAME ${AMD_PROJECT_PACKAGE_NAME}
DESCRIPTION "TransferBench package"
MAINTAINER "RCCL Team <gilbert.lee@amd.com>"
)
endif()
## End of CMakeLists.txt
...@@ -6,57 +6,81 @@ ...@@ -6,57 +6,81 @@
ROCM_PATH ?= /opt/rocm ROCM_PATH ?= /opt/rocm
CUDA_PATH ?= /usr/local/cuda CUDA_PATH ?= /usr/local/cuda
HIPCC=$(ROCM_PATH)/bin/hipcc HIPCC ?= $(ROCM_PATH)/bin/amdclang++
NVCC=$(CUDA_PATH)/bin/nvcc NVCC ?= $(CUDA_PATH)/bin/nvcc
# Compile TransferBenchCuda if nvcc detected # This can be a space separated string of multiple GPU targets
ifeq ("$(shell test -e $(NVCC) && echo found)", "found") # Default is the native GPU target
EXE=TransferBenchCuda GPU_TARGETS ?= native
CXX=$(NVCC)
else
EXE=TransferBench
CXX=$(HIPCC)
endif
CXXFLAGS = -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64 DEBUG ?= 0
NVFLAGS = -x cu -lnuma -arch=native
COMMON_FLAGS = -O3 -I./src/header -I./src/client -I./src/client/Presets ifeq ($(filter clean,$(MAKECMDGOALS)),)
LDFLAGS += -lpthread # Compile TransferBenchCuda if nvidia-smi returns successfully and nvcc detected
ifeq ("$(shell nvidia-smi > /dev/null 2>&1 && test -e $(NVCC) && echo found)", "found")
# Compile RDMA executor if EXE=TransferBenchCuda
# 1) DISABLE_NIC_EXEC is not set to 1 CXX=$(NVCC)
# 2) IBVerbs is found in the Dynamic Linker cache
# 3) infiniband/verbs.h is found in the default include path
NIC_ENABLED = 0
ifneq ($(DISABLE_NIC_EXEC),1)
ifeq ("$(shell ldconfig -p | grep -c ibverbs)", "0")
$(info lib IBVerbs not found)
else ifeq ("$(shell echo '#include <infiniband/verbs.h>' | $(CXX) -E - 2>/dev/null | grep -c 'infiniband/verbs.h')", "0")
$(info infiniband/verbs.h not found)
else else
LDFLAGS += -libverbs -DNIC_EXEC_ENABLED EXE=TransferBench
NVFLAGS += -libverbs -DNIC_EXEC_ENABLED ifeq ("$(shell test -e $(HIPCC) && echo found)", "found")
NIC_ENABLED = 1 CXX=$(HIPCC)
else ifeq ("$(shell test -e $(ROCM_PATH)/bin/hipcc && echo found)", "found")
CXX=$(ROCM_PATH)/bin/hipcc
$(warning "Could not find $(HIPCC). Using fallback to $(CXX)")
else
$(error "Could not find $(HIPCC) or $(ROCM_PATH)/bin/hipcc. Check if the path is correct if you want to build $(EXE)")
endif
GPU_TARGETS_FLAGS = $(foreach target,$(GPU_TARGETS),"--offload-arch=$(target)")
endif endif
ifeq ($(NIC_ENABLED), 0)
$(info To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed) CXXFLAGS = -I$(ROCM_PATH)/include -I$(ROCM_PATH)/include/hip -I$(ROCM_PATH)/include/hsa
HIPLDFLAGS= -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64 -lamdhip64
HIPFLAGS = -x hip -D__HIP_PLATFORM_AMD__ -D__HIPCC__ $(GPU_TARGETS_FLAGS)
NVFLAGS = -x cu -lnuma -arch=native
ifeq ($(DEBUG), 0)
COMMON_FLAGS += -O3
else
COMMON_FLAGS += -O0 -g -ggdb3
endif
COMMON_FLAGS += -I./src/header -I./src/client -I./src/client/Presets
LDFLAGS += -lpthread
NIC_ENABLED = 0
# Compile RDMA executor if
# 1) DISABLE_NIC_EXEC is not set to 1
# 2) IBVerbs is found in the Dynamic Linker cache
# 3) infiniband/verbs.h is found in the default include path
DISABLE_NIC_EXEC ?= 0
ifneq ($(DISABLE_NIC_EXEC),1)
ifeq ("$(shell ldconfig -p | grep -c ibverbs)", "0")
$(info lib IBVerbs not found)
else ifeq ("$(shell echo '#include <infiniband/verbs.h>' | $(CXX) -E - 2>/dev/null | grep -c 'infiniband/verbs.h')", "0")
$(info infiniband/verbs.h not found)
else
CXXFLAGS += -DNIC_EXEC_ENABLED
LDFLAGS += -libverbs
NIC_ENABLED = 1
endif
ifeq ($(NIC_ENABLED), 0)
$(info Building without NIC executor support)
$(info To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed)
else
$(info Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable)
endif
endif endif
endif endif
.PHONY : all clean
all: $(EXE) all: $(EXE)
TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp") NicStatus TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
$(HIPCC) $(CXXFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS) $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) $(COMMON_FLAGS) $< -o $@ $(HIPLDFLAGS) $(LDFLAGS)
TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp") NicStatus TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
$(NVCC) $(NVFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS) $(NVCC) $(NVFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)
clean: clean:
rm -f *.o ./TransferBench ./TransferBenchCuda rm -f ./TransferBench ./TransferBenchCuda
NicStatus:
ifeq ($(NIC_ENABLED), 1)
$(info Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable)
else
$(info Building without NIC executor support)
endif
1.64.0
# MIT License
#
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
#
#
cmake_minimum_required(VERSION 3.25)
# ---------------------------------------------------------------------------
# TransferBench 'client' sub-project: builds the TransferBench executable and
# links it against one flavor of the TB engine library (static / shared /
# header-only).
# NOTE(review): this script reads variables (AMD_PROJECT_BASE_DIRECTORY,
# AMD_PROJECT_NAME, AMD_PROJECT_LIBRARY_NAME, AMD_PROJECT_PACKAGE_NAME,
# TRANSFERBENCH_ENGINE_*) that are presumably defined by the parent
# CMakeLists — confirm it is only entered via add_subdirectory() from there.
# ---------------------------------------------------------------------------
# Load CMake modules
#==================================================================================================
# Locations of the shared cmake helpers; expose them to include()/find_package().
set(AMD_PROJECT_CMAKE_DIRECTORY ${AMD_PROJECT_BASE_DIRECTORY}/cmake)
set(AMD_PROJECT_CMAKE_MODULES_DIRECTORY ${AMD_PROJECT_BASE_DIRECTORY}/cmake/modules)
list(APPEND CMAKE_MODULE_PATH "${AMD_PROJECT_CMAKE_MODULES_DIRECTORY}")
# CMake Toolchain file to define compilers and path to ROCm
#==================================================================================================
# NOTE(review): CMAKE_TOOLCHAIN_FILE is normally only honored before the
# top-level project() call — setting it inside a sub-project is likely a
# no-op; confirm this is intended.
if (NOT CMAKE_TOOLCHAIN_FILE)
set(CMAKE_TOOLCHAIN_FILE "${AMD_PROJECT_CMAKE_DIRECTORY}/rocm_clang_toolchain.cmake")
message(STATUS ">> CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}")
endif()
#
# Build options
# NOTE(review): the three TRANSFERBENCH_CLIENT_USE_* options below are declared
# but never referenced in this file — the library-selection logic at the bottom
# tests the parent-scope TRANSFERBENCH_ENGINE_* options instead. Confirm which
# set of options is authoritative.
option(TRANSFERBENCH_CLIENT_USE_ENGINE_STATIC "TransferBench client links with 'static' library (default)" ON)
option(TRANSFERBENCH_CLIENT_USE_ENGINE_SHARED "TransferBench client links with 'shared' library" OFF)
option(TRANSFERBENCH_CLIENT_USE_HEADER_ONLY "TransferBench uses 'header-only' interface" OFF)
#
include(${AMD_PROJECT_CMAKE_DIRECTORY}/build_utils.cmake) # setup_default_compiler_flags
include(${AMD_PROJECT_CMAKE_MODULES_DIRECTORY}/Dependencies.cmake) # rocm-cmake, rocm_local_targets
#
# Directory layout of the engine dependency and of this client sub-project.
set (TRANSFERBENCH_TBENGINE_DIRECTORY ${AMD_PROJECT_BASE_DIRECTORY}/deps/tbengine)
set (TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY ${TRANSFERBENCH_TBENGINE_DIRECTORY}/include)
set (TRANSFERBENCH_CLIENT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
set (TRANSFERBENCH_CLIENT_INCLUDE_DIRECTORY ${TRANSFERBENCH_CLIENT_DIRECTORY}/include)
set (TRANSFERBENCH_CLIENT_PRESETS_INCLUDE_DIRECTORY ${TRANSFERBENCH_CLIENT_INCLUDE_DIRECTORY}/Presets)
set (TRANSFERBENCH_CLIENT_SRC_DIRECTORY ${TRANSFERBENCH_CLIENT_DIRECTORY}/src)
#
# Library names
# Target names for each flavor of the TB engine library, derived from the
# parent project's library/package names.
set(AMD_PROJECT_HEADER_ONLY_LIBRARY_NAME "${AMD_PROJECT_LIBRARY_NAME}_header")
set(TRANSFERBENCH_INTERFACE_TARGET_NAME "${AMD_PROJECT_PACKAGE_NAME}_engine")
set(TRANSFERBENCH_INTERFACE_TARGET_NAME_ALIAS "${AMD_PROJECT_PACKAGE_NAME}::engine")
set(AMD_PROJECT_STATIC_LIBRARY_NAME "${AMD_PROJECT_LIBRARY_NAME}_static")
set(AMD_PROJECT_SHARED_LIBRARY_NAME "${AMD_PROJECT_LIBRARY_NAME}_shared")
set(AMD_PROJECT_OBJECT_LIBRARY_NAME "${AMD_PROJECT_LIBRARY_NAME}_object_library")
set(AMD_PROJECT_HIP_OBJECT_LIBRARY_NAME "${AMD_PROJECT_LIBRARY_NAME}_hip_object_library")
set(AMD_PROJECT_CLIENT_NAME "${AMD_PROJECT_NAME}")
#
# Subproject build information
# Derive the client version and generate tbclient_version.hpp into the build
# tree so sources can include it.
setup_build_version(TRANSFERBENCH_CLIENT_TARGET_VERSION TRANSFERBENCH_CLIENT_TARGET_VERSION_TEXT)
set_variable_in_parent(TRANSFERBENCH_CLIENT_TARGET_BINARY_VERSION ${TRANSFERBENCH_CLIENT_TARGET_VERSION})
file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/include/)
configure_file(
${AMD_PROJECT_CMAKE_DIRECTORY}/tbclient_version.hpp.in
${CMAKE_BINARY_DIR}/include/tbclient_version.hpp
@ONLY
)
#
# Header/Source files
set(TRANSFERBENCH_CLIENT_SOURCES
${TRANSFERBENCH_CLIENT_SRC_DIRECTORY}/Client.cpp
)
set(TRANSFERBENCH_COMMON_INCLUDES
${TRANSFERBENCH_CLIENT_INCLUDE_DIRECTORY}
${TRANSFERBENCH_CLIENT_PRESETS_INCLUDE_DIRECTORY}
)
#
message(STATUS ">> Building TransferBench 'client' ...")
developer_status_message("DEVEL" " >> TRANSFERBENCH_CLIENT_TARGET_VERSION: '${TRANSFERBENCH_CLIENT_TARGET_VERSION}' ")
# The client executable itself.
add_executable(${AMD_PROJECT_CLIENT_NAME} ${TRANSFERBENCH_CLIENT_SOURCES})
set_target_properties(${AMD_PROJECT_CLIENT_NAME}
PROPERTIES
LINKER_LANGUAGE CXX
)
# Always link the engine interface target (carries the engine's usage requirements).
target_link_libraries(${AMD_PROJECT_CLIENT_NAME}
PRIVATE
${TRANSFERBENCH_INTERFACE_TARGET_NAME_ALIAS}
)
target_include_directories(${AMD_PROJECT_CLIENT_NAME}
PRIVATE
${TRANSFERBENCH_COMMON_INCLUDES}
${CMAKE_BINARY_DIR}/include/
${TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY}
)
setup_default_compiler_flags(${AMD_PROJECT_CLIENT_NAME})
#
# Linking the appropriate TransferBench library
# Priority: Static > Shared > Header-Only
# NOTE(review): WAS_TB_LINKED is set in the static/shared branches but not in
# the header-only branch, and is never read afterwards — dead or incomplete
# bookkeeping; confirm whether a post-link check was intended.
set(WAS_TB_LINKED OFF)
if(TRANSFERBENCH_ENGINE_STATIC)
message(STATUS " >> Client build with 'static' library: ${AMD_PROJECT_STATIC_LIBRARY_NAME} ...")
target_link_libraries(${AMD_PROJECT_CLIENT_NAME}
PRIVATE
${AMD_PROJECT_STATIC_LIBRARY_NAME}
)
set(WAS_TB_LINKED ON)
elseif(TRANSFERBENCH_ENGINE_SHARED)
message(STATUS " >> Client build with 'shared' library: ${AMD_PROJECT_SHARED_LIBRARY_NAME} ...")
target_link_libraries(${AMD_PROJECT_CLIENT_NAME}
PRIVATE
${AMD_PROJECT_SHARED_LIBRARY_NAME}
)
set(WAS_TB_LINKED ON)
elseif(TRANSFERBENCH_ENGINE_HEADER_ONLY)
message(STATUS " >> Client build with 'header-only': ${AMD_PROJECT_HEADER_ONLY_LIBRARY_NAME} ...")
target_link_libraries(${AMD_PROJECT_CLIENT_NAME}
PRIVATE
${AMD_PROJECT_HEADER_ONLY_LIBRARY_NAME}
)
# Tell Client sources to pull in the header-only implementation details.
target_compile_definitions(${AMD_PROJECT_CLIENT_NAME}
PRIVATE
TRANSFERBENCH_HEADER_IMPLEMENTATION_DETAILS
)
endif()
# Install client executable
install(
TARGETS ${AMD_PROJECT_CLIENT_NAME}
RUNTIME DESTINATION bin
)
...@@ -22,13 +22,21 @@ THE SOFTWARE. ...@@ -22,13 +22,21 @@ THE SOFTWARE.
#pragma once #pragma once
// TransferBench client version
#define CLIENT_VERSION "00"
#include "TransferBench.hpp" #include "TransferBench.hpp"
#include "tbclient_version.hpp"
// Helper function to print client version
auto GetClientVersion() -> const std::string;
/*
* TODO: We need to look into this circular dependency (envVars->Client->envVars)
*/
#include "EnvVars.hpp" #include "EnvVars.hpp"
size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<28);
size_t const DEFAULT_BYTES_PER_TRANSFER = (1 << 28);
char const ExeTypeName[5][4] = {"CPU", "GPU", "DMA", "NIC", "NIC"}; char const ExeTypeName[5][4] = {"CPU", "GPU", "DMA", "NIC", "NIC"};
...@@ -39,7 +47,8 @@ void DisplayTopology(bool outputToCsv); ...@@ -39,7 +47,8 @@ void DisplayTopology(bool outputToCsv);
void DisplayUsage(char const* cmdName); void DisplayUsage(char const* cmdName);
// Print TransferBench test results // Print TransferBench test results
void PrintResults(EnvVars const& ev, int const testNum, void PrintResults(EnvVars const& ev,
int const testNum,
std::vector<Transfer> const& transfers, std::vector<Transfer> const& transfers,
TransferBench::TestResults const& results); TransferBench::TestResults const& results);
......
This diff is collapsed.
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef ALL_TO_ALL_PRESET_HPP
#define ALL_TO_ALL_PRESET_HPP
#include "EnvVars.hpp"
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <map>
#include <string>
#include <utility>
#include <vector>
/// Preset: GPU all-to-all bandwidth benchmark.
/// Builds one Transfer per participating (src,dst) GPU pair (optionally only
/// directly-linked pairs), runs them all simultaneously on GFX or DMA
/// executors (optionally adding a NIC ring), then prints a SRC x DST
/// bandwidth matrix with row/column totals and per-row "actual" bandwidth
/// (slowest Transfer in the row times the number of Transfers in the row).
/// @param ev                  Environment-variable configuration. Modified:
///                            forces single-stream mode and defaults GFX unroll to 2
/// @param numBytesPerTransfer Number of bytes each Transfer copies
/// @param presetName          Unused; present to match the common preset signature
void AllToAllPreset(EnvVars& ev,
                    size_t const numBytesPerTransfer,
                    [[maybe_unused]] std::string const presetName)
{
    // Supported all-to-all access patterns
    enum
    {
        A2A_COPY       = 0,
        A2A_READ_ONLY  = 1,
        A2A_WRITE_ONLY = 2,
        A2A_CUSTOM     = 3,
    };
    char const* a2aModeStr[4] = {"Copy", "Read-Only", "Write-Only", "Custom"};

    // Force single-stream mode for all-to-all benchmark
    ev.useSingleStream = 1;

    // Force to gfx unroll 2 unless explicitly set
    ev.gfxUnroll = EnvVars::GetEnvVar("GFX_UNROLL", 2);

    int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);

    // Collect env vars for this preset
    int a2aDirect     = EnvVars::GetEnvVar("A2A_DIRECT", 1);
    int a2aLocal      = EnvVars::GetEnvVar("A2A_LOCAL", 0);
    int numGpus       = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
    int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 0);
    int numSubExecs   = EnvVars::GetEnvVar("NUM_SUB_EXEC", 8);
    int useDmaExec    = EnvVars::GetEnvVar("USE_DMA_EXEC", 0);
    int useFineGrain  = EnvVars::GetEnvVar("USE_FINE_GRAIN", 1);
    int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);

    // A2A_MODE may be 0,1,2 or else custom numSrcs:numDsts
    int numSrcs, numDsts;
    int a2aMode = 0;
    if (getenv("A2A_MODE") && sscanf(getenv("A2A_MODE"), "%d:%d", &numSrcs, &numDsts) == 2) {
        a2aMode = A2A_CUSTOM;
    } else {
        a2aMode = EnvVars::GetEnvVar("A2A_MODE", 0);
        if (a2aMode < 0 || a2aMode > 2) {
            printf("[ERROR] a2aMode must be between 0 and 2, or else numSrcs:numDsts\n");
            exit(1);
        }
        // Copy reads one src / writes one dst; read-only has no dst; write-only no src
        numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1);
        numDsts = (a2aMode == A2A_READ_ONLY ? 0 : 1);
    }

    // Print off environment variables
    ev.DisplayEnvVars();
    if (!ev.hideEnv) {
        if (!ev.outputToCsv) { printf("[AllToAll Related]\n"); }
        ev.Print(
            "A2A_DIRECT", a2aDirect, a2aDirect ? "Only using direct links" : "Full all-to-all");
        ev.Print("A2A_LOCAL", a2aLocal, "%s local transfers", a2aLocal ? "Include" : "Exclude");
        // NOTE: the .c_str() below points into a temporary that lives until the
        // end of the full expression (the Print call), so this is safe
        ev.Print("A2A_MODE",
                 (a2aMode == A2A_CUSTOM) ? std::to_string(numSrcs) + ":" + std::to_string(numDsts)
                                         : std::to_string(a2aMode),
                 (a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " +
                                            std::to_string(numDsts) + " write(s)")
                                               .c_str()
                                         : a2aModeStr[a2aMode]);
        ev.Print("NUM_GPU_DEVICES", numGpus, "Using %d GPUs", numGpus);
        ev.Print("NUM_QUEUE_PAIRS",
                 numQueuePairs,
                 "Using %d queue pairs for NIC transfers",
                 numQueuePairs);
        ev.Print(
            "NUM_SUB_EXEC", numSubExecs, "Using %d subexecutors/CUs per Transfer", numSubExecs);
        ev.Print("USE_DMA_EXEC", useDmaExec, "Using %s executor", useDmaExec ? "DMA" : "GFX");
        ev.Print("USE_FINE_GRAIN",
                 useFineGrain,
                 "Using %s-grained memory",
                 useFineGrain ? "fine" : "coarse");
        ev.Print("USE_REMOTE_READ",
                 useRemoteRead,
                 "Using %s as executor",
                 useRemoteRead ? "DST" : "SRC");
        printf("\n");
    }

    // Validate env vars
    if (numGpus < 0 || numGpus > numDetectedGpus) {
        printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
        exit(1);
    }
    if (useDmaExec && (numSrcs != 1 || numDsts != 1)) {
        printf("[ERROR] DMA execution can only be used for copies (A2A_MODE=0)\n");
        exit(1);
    }

    // Collect the number of GPU devices to use
    MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
    ExeType exeType = useDmaExec ? EXE_GPU_DMA : EXE_GPU_GFX;

    // reIndex maps a (src,dst) GPU pair to its index in 'transfers' so the
    // result matrix can be printed even when some pairs are skipped
    std::map<std::pair<int, int>, int> reIndex;
    std::vector<Transfer> transfers;
    for (int i = 0; i < numGpus; i++) {
        for (int j = 0; j < numGpus; j++) {
            // Check whether or not to execute this pair
            if (i == j) {
                if (!a2aLocal) { continue; }
            } else if (a2aDirect) {
#if !defined(__NVCC__)
                // Skip pairs that are not a single hop apart
                uint32_t linkType, hopCount;
                HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
                if (hopCount != 1) { continue; }
#endif
            }
            // Build Transfer and add it to list
            TransferBench::Transfer transfer;
            transfer.numBytes = numBytesPerTransfer;
            for (int x = 0; x < numSrcs; x++) { transfer.srcs.push_back({memType, i}); }
            // When using multiple destinations, the additional destinations are "local"
            if (numDsts) { transfer.dsts.push_back({memType, j}); }
            for (int x = 1; x < numDsts; x++) { transfer.dsts.push_back({memType, i}); }
            transfer.exeDevice = {exeType, (useRemoteRead ? j : i)};
            transfer.exeSubIndex = -1;
            transfer.numSubExecs = numSubExecs;
            reIndex[std::make_pair(i, j)] = transfers.size();
            transfers.push_back(transfer);
        }
    }

    // Create a ring using NICs (one Transfer per GPU to its ring neighbor)
    std::vector<int> nicTransferIdx(numGpus);
    if (numQueuePairs > 0) {
        for (int i = 0; i < numGpus; i++) {
            TransferBench::Transfer transfer;
            transfer.numBytes = numBytesPerTransfer;
            transfer.srcs.push_back({memType, i});
            transfer.dsts.push_back({memType, (i + 1) % numGpus});
            transfer.exeDevice = {TransferBench::EXE_NIC_NEAREST, i};
            transfer.exeSubIndex = (i + 1) % numGpus;
            transfer.numSubExecs = numQueuePairs;
            nicTransferIdx[i] = transfers.size();
            transfers.push_back(transfer);
        }
    }

    printf("GPU-GFX All-To-All benchmark:\n");
    printf("==========================\n");
    printf("- Copying %zu bytes between %s pairs of GPUs using %d CUs (%zu Transfers)\n",
           numBytesPerTransfer,
           a2aDirect ? "directly connected" : "all",
           numSubExecs,
           transfers.size());
    if (transfers.empty()) { return; }

    // Execute Transfers
    TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
    TransferBench::TestResults results;
    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
        // NOTE(review): failure exits with status 0 — confirm scripts don't rely on this
        for (auto const& err : results.errResults) { printf("%s\n", err.errMsg.c_str()); }
        exit(0);
    } else {
        PrintResults(ev, 1, transfers, results);
    }

    // Print results matrix: one row per SRC GPU, one column per DST GPU,
    // plus an optional NIC column and STotal/Actual summary columns
    char separator = (ev.outputToCsv ? ',' : ' ');
    printf("\nSummary: [%zu bytes per Transfer] [%s:%d] [%d Read(s) %d Write(s)]\n",
           numBytesPerTransfer,
           useDmaExec ? "DMA" : "GFX",
           numSubExecs,
           numSrcs,
           numDsts);
    printf("===========================================================================\n");
    printf("SRC\\DST ");
    for (int dst = 0; dst < numGpus; dst++) { printf("%cGPU %02d ", separator, dst); }
    if (numQueuePairs > 0) { printf("%cNIC(%02d QP)", separator, numQueuePairs); }
    printf(" %cSTotal %cActual\n", separator, separator);

    double totalBandwidthGpu = 0.0;
    double minActualBandwidth = std::numeric_limits<double>::max();
    double maxActualBandwidth = 0.0;
    // Columns: [0..numGpus-1] per-DST totals, [numGpus] NIC, [numGpus+1] grand total
    std::vector<double> colTotalBandwidth(numGpus + 2, 0.0);
    for (int src = 0; src < numGpus; src++) {
        double rowTotalBandwidth = 0;
        int transferCount = 0;
        double minBandwidth = std::numeric_limits<double>::max();
        printf("GPU %02d", src);
        for (int dst = 0; dst < numGpus; dst++) {
            if (reIndex.count(std::make_pair(src, dst))) {
                int const transferIdx = reIndex[std::make_pair(src, dst)];
                TransferBench::TransferResult const& r = results.tfrResults[transferIdx];
                colTotalBandwidth[dst] += r.avgBandwidthGbPerSec;
                rowTotalBandwidth += r.avgBandwidthGbPerSec;
                totalBandwidthGpu += r.avgBandwidthGbPerSec;
                minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
                transferCount++;
                printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
            } else {
                printf("%c%8s ", separator, "N/A");
            }
        }
        if (numQueuePairs > 0) {
            TransferBench::TransferResult const& r = results.tfrResults[nicTransferIdx[src]];
            colTotalBandwidth[numGpus] += r.avgBandwidthGbPerSec;
            rowTotalBandwidth += r.avgBandwidthGbPerSec;
            totalBandwidthGpu += r.avgBandwidthGbPerSec;
            minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
            transferCount++;
            printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
        }
        // "Actual" models this row as if every Transfer ran at the slowest rate
        double actualBandwidth = minBandwidth * transferCount;
        printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator, actualBandwidth);
        minActualBandwidth = std::min(minActualBandwidth, actualBandwidth);
        maxActualBandwidth = std::max(maxActualBandwidth, actualBandwidth);
        colTotalBandwidth[numGpus + 1] += rowTotalBandwidth;
    }
    printf("\nRTotal");
    for (int dst = 0; dst < numGpus; dst++) {
        printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
    }
    if (numQueuePairs > 0) { printf("%c%8.3f ", separator, colTotalBandwidth[numGpus]); }
    printf(" %c%8.3f %c%8.3f %c%8.3f\n",
           separator,
           colTotalBandwidth[numGpus + 1],
           separator,
           minActualBandwidth,
           separator,
           maxActualBandwidth);
    printf("\n");
    printf("Average bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu / transfers.size());
    printf("Aggregate bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu);
    printf("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
    PrintErrors(results.errResults);
}
#endif // ALL_TO_ALL_PRESET_HPP
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef ALL_TO_ALL_N_PRESET_HPP
#define ALL_TO_ALL_N_PRESET_HPP
#include "EnvVars.hpp"
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <limits>
#include <map>
#include <string>
#include <utility>
#include <vector>
/// Preset: GPU all-to-all RDMA (NIC) bandwidth benchmark.
/// Builds one NIC-executed Transfer per ordered GPU pair (including self
/// pairs), runs them all simultaneously, then prints a SRC x DST bandwidth
/// matrix with row/column totals and per-row "actual" bandwidth (slowest
/// Transfer in the row times the number of Transfers in the row).
/// @param ev                  Environment-variable configuration
/// @param numBytesPerTransfer Number of bytes each Transfer copies
/// @param presetName          Unused; present to match the common preset signature
void AllToAllRdmaPreset(EnvVars& ev,
                        size_t const numBytesPerTransfer,
                        [[maybe_unused]] std::string const presetName)
{
    int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);

    // Collect env vars for this preset
    int numGpus       = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
    int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 1);
    int useFineGrain  = EnvVars::GetEnvVar("USE_FINE_GRAIN", 1);

    // Print off environment variables
    ev.DisplayEnvVars();
    if (!ev.hideEnv) {
        if (!ev.outputToCsv) { printf("[AllToAll Network Related]\n"); }
        ev.Print("NUM_GPU_DEVICES", numGpus, "Using %d GPUs", numGpus);
        ev.Print("NUM_QUEUE_PAIRS",
                 numQueuePairs,
                 "Using %d queue pairs for NIC transfers",
                 numQueuePairs);
        ev.Print("USE_FINE_GRAIN",
                 useFineGrain,
                 "Using %s-grained memory",
                 useFineGrain ? "fine" : "coarse");
        printf("\n");
    }

    // Validate env vars
    if (numGpus < 0 || numGpus > numDetectedGpus) {
        printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
        exit(1);
    }

    MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;

    // reIndex maps a (src,dst) GPU pair to its index in 'transfers'
    std::map<std::pair<int, int>, int> reIndex;
    std::vector<Transfer> transfers;
    for (int i = 0; i < numGpus; i++) {
        for (int j = 0; j < numGpus; j++) {
            // Build Transfer and add it to list; the NIC nearest to GPU i
            // executes the transfer towards GPU j (exeSubIndex)
            TransferBench::Transfer transfer;
            transfer.numBytes = numBytesPerTransfer;
            transfer.srcs.push_back({memType, i});
            transfer.dsts.push_back({memType, j});
            transfer.exeDevice = {EXE_NIC_NEAREST, i};
            transfer.exeSubIndex = j;
            transfer.numSubExecs = numQueuePairs;
            reIndex[std::make_pair(i, j)] = transfers.size();
            transfers.push_back(transfer);
        }
    }

    printf("GPU-RDMA All-To-All benchmark:\n");
    printf("==========================\n");
    printf(
        "- Copying %zu bytes between all pairs of GPUs using %d QPs per Transfer (%zu Transfers)\n",
        numBytesPerTransfer,
        numQueuePairs,
        transfers.size());
    if (transfers.empty()) { return; }

    // Execute Transfers
    TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
    TransferBench::TestResults results;
    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
        // NOTE(review): failure exits with status 0 — confirm scripts don't rely on this
        for (auto const& err : results.errResults) { printf("%s\n", err.errMsg.c_str()); }
        exit(0);
    } else {
        PrintResults(ev, 1, transfers, results);
    }

    // Print results matrix: one row per SRC GPU, one column per DST GPU,
    // plus STotal/Actual summary columns
    char separator = (ev.outputToCsv ? ',' : ' ');
    printf("\nSummary: [%zu bytes per Transfer]\n", numBytesPerTransfer);
    printf("==========================================================\n");
    printf("SRC\\DST ");
    for (int dst = 0; dst < numGpus; dst++) { printf("%cGPU %02d ", separator, dst); }
    printf(" %cSTotal %cActual\n", separator, separator);

    double totalBandwidthGpu = 0.0;
    double minActualBandwidth = std::numeric_limits<double>::max();
    double maxActualBandwidth = 0.0;
    // Columns: [0..numGpus-1] per-DST totals, [numGpus+1] grand total
    std::vector<double> colTotalBandwidth(numGpus + 2, 0.0);
    for (int src = 0; src < numGpus; src++) {
        double rowTotalBandwidth = 0;
        int transferCount = 0;
        double minBandwidth = std::numeric_limits<double>::max();
        printf("GPU %02d", src);
        for (int dst = 0; dst < numGpus; dst++) {
            if (reIndex.count(std::make_pair(src, dst))) {
                int const transferIdx = reIndex[std::make_pair(src, dst)];
                TransferBench::TransferResult const& r = results.tfrResults[transferIdx];
                colTotalBandwidth[dst] += r.avgBandwidthGbPerSec;
                rowTotalBandwidth += r.avgBandwidthGbPerSec;
                totalBandwidthGpu += r.avgBandwidthGbPerSec;
                minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
                transferCount++;
                printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
            } else {
                printf("%c%8s ", separator, "N/A");
            }
        }
        // "Actual" models this row as if every Transfer ran at the slowest rate
        double actualBandwidth = minBandwidth * transferCount;
        printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator, actualBandwidth);
        minActualBandwidth = std::min(minActualBandwidth, actualBandwidth);
        maxActualBandwidth = std::max(maxActualBandwidth, actualBandwidth);
        colTotalBandwidth[numGpus + 1] += rowTotalBandwidth;
    }
    printf("\nRTotal");
    for (int dst = 0; dst < numGpus; dst++) {
        printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
    }
    printf(" %c%8.3f %c%8.3f %c%8.3f\n",
           separator,
           colTotalBandwidth[numGpus + 1],
           separator,
           minActualBandwidth,
           separator,
           maxActualBandwidth);
    printf("\n");
    printf("Average bandwidth (Tx Thread Timed): %8.3f GB/s\n",
           totalBandwidthGpu / transfers.size());
    printf("Aggregate bandwidth (Tx Thread Timed): %8.3f GB/s\n", totalBandwidthGpu);
    printf("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n",
           results.avgTotalBandwidthGbPerSec);
    PrintErrors(results.errResults);
}
#endif // ALL_TO_ALL_N_PRESET_HPP
/*
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef ALL_TO_ALL_SWEEP_PRESET_HPP
#define ALL_TO_ALL_SWEEP_PRESET_HPP
#include "EnvVars.hpp"
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <map>
#include <string>
#include <utility>
#include <vector>
/// Preset: GPU all-to-all kernel-parameter sweep.
/// Builds a fixed all-to-all Transfer set once, then sweeps GFX kernel
/// parameters (block size x unroll factor x CU count) over it, printing the
/// slowest (and optionally fastest) per-executor bandwidth for each combo.
/// @param ev                  Environment-variable configuration. Modified:
///                            forces single-stream mode; gfxBlockSize/gfxUnroll
///                            are overwritten while sweeping
/// @param numBytesPerTransfer Number of bytes each Transfer copies
/// @param presetName          Unused; present to match the common preset signature
void AllToAllSweepPreset(EnvVars& ev,
                         size_t const numBytesPerTransfer,
                         [[maybe_unused]] std::string const presetName)
{
    // Supported all-to-all access patterns
    enum
    {
        A2A_COPY       = 0,
        A2A_READ_ONLY  = 1,
        A2A_WRITE_ONLY = 2,
        A2A_CUSTOM     = 3,
    };
    char const* a2aModeStr[4] = {"Copy", "Read-Only", "Write-Only", "Custom"};

    // Force single-stream mode for all-to-all benchmark
    ev.useSingleStream = 1;

    int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);

    // Collect env vars for this preset
    int a2aDirect     = EnvVars::GetEnvVar("A2A_DIRECT", 1);
    int a2aLocal      = EnvVars::GetEnvVar("A2A_LOCAL", 0);
    int numGpus       = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
    int showMinOnly   = EnvVars::GetEnvVar("SHOW_MIN_ONLY", 1);
    int useFineGrain  = EnvVars::GetEnvVar("USE_FINE_GRAIN", 1);
    int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
    int useSpray      = EnvVars::GetEnvVar("USE_SPRAY", 0);
    int verbose       = EnvVars::GetEnvVar("VERBOSE", 0);

    // Parameter values to sweep over
    std::vector<int> blockList  = EnvVars::GetEnvVarArray("BLOCKSIZES", {256});
    std::vector<int> unrollList = EnvVars::GetEnvVarArray("UNROLLS", {1, 2, 3, 4, 6, 8});
    std::vector<int> numCusList = EnvVars::GetEnvVarArray("NUM_CUS", {4, 8, 12, 16, 24, 32});

    // A2A_MODE may be 0,1,2 or else custom numSrcs:numDsts
    int numSrcs, numDsts;
    int a2aMode = 0;
    if (getenv("A2A_MODE") && sscanf(getenv("A2A_MODE"), "%d:%d", &numSrcs, &numDsts) == 2) {
        a2aMode = A2A_CUSTOM;
    } else {
        a2aMode = EnvVars::GetEnvVar("A2A_MODE", 0);
        if (a2aMode < 0 || a2aMode > 2) {
            printf("[ERROR] a2aMode must be between 0 and 2, or else numSrcs:numDsts\n");
            exit(1);
        }
        // Copy reads one src / writes one dst; read-only has no dst; write-only no src
        numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1);
        numDsts = (a2aMode == A2A_READ_ONLY ? 0 : 1);
    }

    // Print off environment variables
    ev.DisplayEnvVars();
    if (!ev.hideEnv) {
        if (!ev.outputToCsv) { printf("[AllToAll Related]\n"); }
        ev.Print(
            "A2A_DIRECT", a2aDirect, a2aDirect ? "Only using direct links" : "Full all-to-all");
        ev.Print("A2A_LOCAL", a2aLocal, "%s local transfers", a2aLocal ? "Include" : "Exclude");
        // NOTE: the .c_str() below points into a temporary that lives until the
        // end of the full expression (the Print call), so this is safe
        ev.Print("A2A_MODE",
                 (a2aMode == A2A_CUSTOM) ? std::to_string(numSrcs) + ":" + std::to_string(numDsts)
                                         : std::to_string(a2aMode),
                 (a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " +
                                            std::to_string(numDsts) + " write(s)")
                                               .c_str()
                                         : a2aModeStr[a2aMode]);
        ev.Print("BLOCKSIZES", blockList.size(), EnvVars::ToStr(blockList).c_str());
        ev.Print("SHOW_MIN_ONLY",
                 showMinOnly,
                 showMinOnly ? "Showing only slowest GPU results"
                             : "Showing slowest and fastest GPU results");
        ev.Print("NUM_CUS", numCusList.size(), EnvVars::ToStr(numCusList).c_str());
        ev.Print("NUM_GPU_DEVICES", numGpus, "Using %d GPUs", numGpus);
        ev.Print("UNROLLS", unrollList.size(), EnvVars::ToStr(unrollList).c_str());
        ev.Print("USE_FINE_GRAIN",
                 useFineGrain,
                 "Using %s-grained memory",
                 useFineGrain ? "fine" : "coarse");
        ev.Print("USE_REMOTE_READ",
                 useRemoteRead,
                 "Using %s as executor",
                 useRemoteRead ? "DST" : "SRC");
        ev.Print("USE_SPRAY", useSpray, "%s per CU", useSpray ? "All targets" : "One target");
        ev.Print("VERBOSE", verbose, verbose ? "Display test results" : "Display summary only");
        printf("\n");
    }

    // Validate env vars
    if (numGpus < 0 || numGpus > numDetectedGpus) {
        printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
        exit(1);
    }
    if (useSpray && numDsts > 1) {
        printf("[ERROR] Cannot use USE_SPRAY with multiple destination buffers\n");
        exit(1);
    }

    // Collect the number of GPU devices to use
    MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
    ExeType exeType = EXE_GPU_GFX;
    std::vector<Transfer> transfers;
    // NOTE(review): after these loops targetCount holds the value from the
    // LAST GPU iterated; scaling below assumes a symmetric topology — confirm
    int targetCount = 0;
    if (!useSpray) {
        // Each CU will work on just one target
        for (int i = 0; i < numGpus; i++) {
            targetCount = 0;
            for (int j = 0; j < numGpus; j++) {
                // Check whether or not to execute this pair
                if (i == j) {
                    if (!a2aLocal) { continue; }
                } else if (a2aDirect) {
#if !defined(__NVCC__)
                    // Skip pairs that are not a single hop apart
                    uint32_t linkType, hopCount;
                    HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
                    if (hopCount != 1) { continue; }
#endif
                }
                // Build Transfer and add it to list
                TransferBench::Transfer transfer;
                targetCount++;
                transfer.numBytes = numBytesPerTransfer;
                for (int x = 0; x < numSrcs; x++) { transfer.srcs.push_back({memType, i}); }
                // When using multiple destinations, the additional destinations are "local"
                if (numDsts) { transfer.dsts.push_back({memType, j}); }
                for (int x = 1; x < numDsts; x++) { transfer.dsts.push_back({memType, i}); }
                transfer.exeDevice = {exeType, (useRemoteRead ? j : i)};
                transfer.exeSubIndex = -1;
                transfers.push_back(transfer);
            }
        }
    } else {
        // Each CU will work on all targets (one Transfer per executing GPU)
        for (int i = 0; i < numGpus; i++) {
            TransferBench::Transfer transfer;
            transfer.numBytes = numBytesPerTransfer;
            transfer.exeDevice = {exeType, i};
            transfer.exeSubIndex = -1;
            targetCount = 0;
            for (int j = 0; j < numGpus; j++) {
                // Check whether or not to transfer to this GPU
                if (i == j) {
                    if (!a2aLocal) { continue; }
                } else if (a2aDirect) {
#if !defined(__NVCC__)
                    // Skip pairs that are not a single hop apart
                    uint32_t linkType, hopCount;
                    HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
                    if (hopCount != 1) { continue; }
#endif
                }
                targetCount++;
                for (int x = 0; x < numSrcs; x++) {
                    transfer.srcs.push_back({memType, useRemoteRead ? j : i});
                }
                if (numDsts) { transfer.dsts.push_back({memType, j}); }
                for (int x = 1; x < numDsts; x++) { transfer.dsts.push_back({memType, i}); }
            }
            transfers.push_back(transfer);
        }
    }

    printf("GPU-GFX All-To-All Sweep benchmark:\n");
    printf("==========================\n");
    printf("- Copying %zu bytes between %s pairs of GPUs\n",
           numBytesPerTransfer,
           a2aDirect ? "directly connected" : "all");
    if (transfers.empty()) {
        // Fixed: message previously read "[WARN}" (mismatched bracket)
        printf("[WARN] No transfers requested. Try adjusting A2A_DIRECT or A2A_LOCAL\n");
        return;
    }

    // Execute Transfers
    TransferBench::ConfigOptions cfg = ev.ToConfigOptions();

    // Run tests, keyed by (numCUs, unroll) for the optional verbose dump
    std::map<std::pair<int, int>, TransferBench::TestResults> results;

    // Display summary: one table per block size
    for (int blockSize : blockList) {
        printf("Blocksize: %d\n", blockSize);
        ev.gfxBlockSize = cfg.gfx.blockSize = blockSize;
        printf("#CUs\\Unroll");
        for (int u : unrollList) {
            printf(" %d(Min) ", u);
            if (!showMinOnly) { printf(" %d(Max) ", u); }
        }
        printf("\n");
        for (int c : numCusList) {
            printf(" %5d ", c);
            fflush(stdout);
            for (int u : unrollList) {
                ev.gfxUnroll = cfg.gfx.unrollFactor = u;
                for (auto& transfer : transfers) {
                    // In spray mode each Transfer covers targetCount targets,
                    // so scale the subexecutor count accordingly
                    transfer.numSubExecs = useSpray ? (c * targetCount) : c;
                }
                double minBandwidth = std::numeric_limits<double>::max();
                // Fixed: was numeric_limits<double>::min() (smallest positive
                // value), which is the wrong seed for a max-accumulator
                double maxBandwidth = std::numeric_limits<double>::lowest();
                TransferBench::TestResults result;
                if (TransferBench::RunTransfers(cfg, transfers, result)) {
                    for (auto const& exeResult : result.exeResults) {
                        minBandwidth = std::min(minBandwidth,
                                                exeResult.second.avgBandwidthGbPerSec);
                        maxBandwidth = std::max(maxBandwidth,
                                                exeResult.second.avgBandwidthGbPerSec);
                    }
                    if (useSpray) {
                        minBandwidth *= targetCount;
                        maxBandwidth *= targetCount;
                    }
                    results[std::make_pair(c, u)] = result;
                } else {
                    minBandwidth = 0.0;
                }
                printf(" %7.2f ", minBandwidth);
                if (!showMinOnly) { printf(" %7.2f ", maxBandwidth); }
                fflush(stdout);
            }
            printf("\n");
            fflush(stdout);
        }
        if (verbose) {
            // Dump the full per-Transfer results for this block size
            int testNum = 0;
            for (int c : numCusList) {
                for (int u : unrollList) {
                    printf("CUs: %d Unroll %d\n", c, u);
                    PrintResults(ev, ++testNum, transfers, results[std::make_pair(c, u)]);
                }
            }
        }
    }
}
#endif // ALL_TO_ALL_SWEEP_PRESET_HPP
/*
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HEALTH_CHECK_PRESET_HPP
#define HEALTH_CHECK_PRESET_HPP
#include "EnvVars.hpp"
#include <cstddef>
#include <cstdint>
#include <map>
#include <string>
#include <vector>
// Indices of the HBM stress-test patterns exercised by TestHbmPerformance.
// NOTE(review): 'HbmTests' declares a global VARIABLE of this anonymous enum
// type, not a type name; it appears unused and, as a non-inline definition in
// a header, risks ODR violations if included from multiple TUs — confirm and
// consider dropping the identifier.
enum
{
    HBM_READ = 0,
    HBM_WRITE = 1,
    HBM_COPY = 2,
    HBM_ADD = 3,
    NUM_HBM_TESTS = 4
} HbmTests;
// Describes one HBM test pattern: its display name and how many source /
// destination buffers each Transfer uses.
struct HbmTestConfig
{
    std::string name;  // Label printed in the test banner (e.g. "READ")
    int numInputs;     // Number of source buffers per Transfer
    int numOutputs;    // Number of destination buffers per Transfer
};
// Buffer counts for each HBM test, indexed by the HBM_* enumerators above.
// NOTE(review): non-inline global definition in a header — fine while this
// header is included from a single TU; consider 'inline' (C++17) otherwise.
HbmTestConfig HbmTestConfigs[NUM_HBM_TESTS] = {
    {"READ", 1, 0}, {"WRITE", 0, 1}, {"COPY", 1, 1}, {"ADD", 2, 1}};
// Per-model pass/fail criteria and kernel tuning parameters for the
// healthcheck preset. All bandwidth limits are in GB/s and are scaled by
// SFACTOR before comparison. (Modernized from 'typedef struct' — in C++ the
// plain struct declaration already introduces the type name.)
struct TestConfig
{
    double unidirHostToDeviceCopyLimit;  // Min H2D DMA bandwidth per GPU
    double unidirDeviceToHostCopyLimit;  // Min D2H DMA bandwidth per GPU
    double bidirDmaCopyLimit;            // Min combined H2D+D2H DMA bandwidth
    int a2aUnrollFactor;                 // GFX unroll used for the all-to-all test
    int a2aNumSubExecs;                  // CUs per Transfer for the all-to-all test
    double a2aCopyLimit;                 // Min per-pair all-to-all bandwidth
    int hbmBlockSize[NUM_HBM_TESTS];     // GFX block size per HBM test
    int hbmUnrollFactor[NUM_HBM_TESTS];  // GFX unroll factor per HBM test
    int hbmTemporalMode[NUM_HBM_TESTS];  // GFX temporal mode per HBM test
    double hbmLimit[NUM_HBM_TESTS];      // Min HBM bandwidth per HBM test
};
// Hardware models recognized by DetectModel(); values index TestConfigs.
// Kept as an unscoped enum so the enumerators remain unqualified at call
// sites. (Modernized from 'typedef enum' — unnecessary in C++.)
enum ModelEnum
{
    MODEL_08_GFX0942_304 = 0,  // 8 x gfx942, 304 CUs each
    MODEL_08_GFX0942_064 = 1,  // 8 x gfx942, 64 CUs each
    NUM_SUPPORTED_MODELS = 2
};
// All limits are scaled by this factor before pass/fail comparison.
// (Modernized from '#define' — an inline constexpr is typed, scoped, and
// header-safe in C++17 and later.)
inline constexpr double SFACTOR = 0.97;
// Criteria for an 8 x gfx942 system with 304 CUs per GPU.
// HBM arrays are indexed by HBM_READ/WRITE/COPY/ADD; limits in GB/s
// (scaled by SFACTOR at comparison time).
TestConfig Config_08_GFX0942_304 = {
    .unidirHostToDeviceCopyLimit = 50,
    .unidirDeviceToHostCopyLimit = 50,
    .bidirDmaCopyLimit = 90,
    .a2aUnrollFactor = 2,
    .a2aNumSubExecs = 8,
    .a2aCopyLimit = 45,
    .hbmBlockSize = {384, 256, 320, 256},
    .hbmUnrollFactor = {7, 4, 8, 7},
    .hbmTemporalMode = {3, 3, 3, 3},
    .hbmLimit = {4980, 4850, 2045, 1405},
};
// Criteria for an 8 x gfx942 system with 64 CUs per GPU.
// HBM arrays are indexed by HBM_READ/WRITE/COPY/ADD; limits in GB/s
// (scaled by SFACTOR at comparison time).
TestConfig Config_08_GFX0942_064 = {
    .unidirHostToDeviceCopyLimit = 50,
    .unidirDeviceToHostCopyLimit = 50,
    .bidirDmaCopyLimit = 90,
    .a2aUnrollFactor = 2,
    .a2aNumSubExecs = 8,
    .a2aCopyLimit = 45,
    .hbmBlockSize = {448, 448, 448, 384},
    .hbmUnrollFactor = {8, 3, 8, 7},
    .hbmTemporalMode = {3, 3, 3, 3},
    .hbmLimit = {4180, 2800, 1400, 1055},
};
// Lookup table of per-model criteria, indexed by ModelEnum.
// Order must match the ModelEnum enumerator values.
TestConfig TestConfigs[NUM_SUPPORTED_MODELS] = {
    Config_08_GFX0942_304,
    Config_08_GFX0942_064,
};
int DetectModel()
{
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
std::string archName = "";
int numSubExecutors = 0;
// Loop over all GPUs and determine if they are identical
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
// Check that arch name is identical
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, gpuId));
std::string fullName = prop.gcnArchName;
std::string currArchName = fullName.substr(0, fullName.find(':'));
if (archName != "" && archName != currArchName) {
printf(
"[WARN] healthcheck preset is currently only supported when all GPUs are "
"identical\n");
printf(" Detected both %s and %s\n", archName.c_str(), currArchName.c_str());
exit(1);
}
archName = currArchName;
// Check number of subexecutors
int currNumSubExecutors = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, gpuId});
if (numSubExecutors != 0 && numSubExecutors != currNumSubExecutors) {
printf(
"[WARN] healthcheck preset is currently only supported when all GPUs are "
"identical\n");
printf(" Detected different subexecutor counts: %d and %d\n",
numSubExecutors,
currNumSubExecutors);
exit(1);
}
numSubExecutors = currNumSubExecutors;
}
// Classify based on detected configuration
if (numGpuDevices == 8) {
if (archName == "gfx942") {
switch (numSubExecutors) {
case 304: return MODEL_08_GFX0942_304;
case 64: return MODEL_08_GFX0942_064;
}
}
}
printf("[WARN] healthcheck preset is currently not supported on this hardware\n");
printf(" Detected %d x [%s] with [%d] subexecutors per GPU\n",
numGpuDevices,
archName.c_str(),
numSubExecutors);
exit(1);
}
/// Verifies unidirectional DMA copy bandwidth between each GPU and its
/// closest CPU NUMA node, in both directions, against per-model limits.
/// The two direction passes were duplicated code; they now share one helper.
/// @param modelId Index into TestConfigs (see ModelEnum)
/// @param verbose When true, print per-GPU measurements instead of dots
/// @return 1 if any GPU failed its bandwidth criteria, else 0
int TestUnidir(int modelId, bool verbose)
{
    TestConfig const& testConfig = TestConfigs[modelId];
    TransferBench::ConfigOptions cfg;
    TransferBench::TestResults results;
    int hasFail = 0;
    int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
    cfg.dma.useHsaCopy = 1;

    // Runs one direction (host->device or device->host) across all GPUs and
    // prints PASS/FAIL against the SFACTOR-scaled limit
    auto runDirection = [&](char const* header, double rawLimit, bool hostToDevice) {
        printf("%s%c", header, verbose ? '\n' : ' ');
        double limit = rawLimit * SFACTOR;
        std::vector<std::pair<int, double>> fails;
        for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
            if (!verbose) { printf("."); }
            fflush(stdout);
            // Pin the host side of the copy to the NUMA node closest to this GPU
            int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
            if (memIndex == -1) {
                printf("[ERROR] Unable to detect closest CPU NUMA node to GPU %d\n", gpuId);
                exit(1);
            }
            std::vector<Transfer> transfers(1);
            Transfer& t = transfers[0];
            t.exeDevice = {EXE_GPU_DMA, gpuId};
            t.numBytes = 256 * 1024 * 1024;
            if (hostToDevice) {
                t.srcs = {{MEM_CPU, memIndex}};
                t.dsts = {{MEM_GPU, gpuId}};
            } else {
                t.srcs = {{MEM_GPU, gpuId}};
                t.dsts = {{MEM_CPU, memIndex}};
            }
            t.numSubExecs = 1;
            if (TransferBench::RunTransfers(cfg, transfers, results)) {
                double measuredBw = results.tfrResults[0].avgBandwidthGbPerSec;
                if (measuredBw < limit) { fails.push_back(std::make_pair(gpuId, measuredBw)); }
                if (verbose) {
                    printf("   GPU %02d: Measured %6.2f Limit %6.2f\n", gpuId, measuredBw, limit);
                }
            } else {
                PrintErrors(results.errResults);
            }
        }
        if (fails.empty()) {
            printf("PASS\n");
        } else {
            hasFail = 1;
            printf("FAIL (%zu test(s))\n", fails.size());
            for (auto const& p : fails) {
                printf("   GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n",
                       p.first,
                       p.second,
                       limit);
            }
        }
    };

    // Run unidirectional host to device copy
    runDirection("Testing unidirectional host to device copy",
                 testConfig.unidirHostToDeviceCopyLimit,
                 true);
    // Run unidirectional device to host copy
    runDirection("Testing unidirectional device to host copy",
                 testConfig.unidirDeviceToHostCopyLimit,
                 false);
    return hasFail;
}
/// Verifies bidirectional (simultaneous H2D + D2H) DMA copy bandwidth between
/// each GPU and its closest CPU NUMA node against the per-model limit.
/// @param modelId Index into TestConfigs (see ModelEnum)
/// @param verbose When true, print per-GPU measurements instead of dots
/// @return 1 if any GPU failed its bandwidth criteria, else 0
int TestBidir(int modelId, bool verbose)
{
    TestConfig const& testConfig = TestConfigs[modelId];
    TransferBench::ConfigOptions cfg;
    int hasFail = 0;
    int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
    printf("Testing bidirectional host<->device copies%c", verbose ? '\n' : ' ');
    {
        double limit = testConfig.bidirDmaCopyLimit * SFACTOR;
        std::vector<std::pair<int, double>> fails;
        for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
            if (!verbose) { printf("."); }
            fflush(stdout);
            // Pin the host side of the copies to the NUMA node closest to this GPU
            int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
            if (memIndex == -1) {
                printf("[ERROR] Unable to detect closest CPU NUMA node to GPU %d\n", gpuId);
                exit(1);
            }
            // Two simultaneous DMA transfers: t0 is D2H, t1 is H2D
            std::vector<Transfer> transfers(2);
            Transfer& t0 = transfers[0];
            Transfer& t1 = transfers[1];
            t0.exeDevice = {EXE_GPU_DMA, gpuId};
            t0.numBytes = 256 * 1024 * 1024;
            t0.srcs = {{MEM_GPU, gpuId}};
            t0.dsts = {{MEM_CPU, memIndex}};
            t0.numSubExecs = 1;
            t1.exeDevice = {EXE_GPU_DMA, gpuId};
            t1.numBytes = 256 * 1024 * 1024;
            t1.srcs = {{MEM_CPU, memIndex}};
            t1.dsts = {{MEM_GPU, gpuId}};
            t1.numSubExecs = 1;
            TransferBench::TestResults results;
            if (TransferBench::RunTransfers(cfg, transfers, results)) {
                // Pass/fail is judged on the combined bandwidth of both directions
                double measuredBw = (results.tfrResults[0].avgBandwidthGbPerSec +
                                     results.tfrResults[1].avgBandwidthGbPerSec);
                if (measuredBw < limit) { fails.push_back(std::make_pair(gpuId, measuredBw)); }
                if (verbose) {
                    printf("   GPU %02d: Measured %6.2f Limit %6.2f\n", gpuId, measuredBw, limit);
                }
            } else {
                PrintErrors(results.errResults);
            }
        }
        if (fails.empty()) {
            printf("PASS\n");
        } else {
            hasFail = 1;
            printf("FAIL (%zu test(s))\n", fails.size());
            for (auto const& p : fails) {
                printf("   GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n",
                       p.first,
                       p.second,
                       limit);
            }
        }
    }
    return hasFail;
}
/// Verifies all-to-all GFX copy bandwidth over XGMI between every ordered
/// pair of distinct GPUs against the per-model limit. All pair copies run
/// simultaneously.
/// @param modelId Index into TestConfigs (see ModelEnum)
/// @param verbose When true, print per-pair measurements instead of dots
/// @return 1 if any GPU pair failed its bandwidth criteria, else 0
int TestAllToAll(int modelId, bool verbose)
{
    TestConfig const& testConfig = TestConfigs[modelId];
    TransferBench::ConfigOptions cfg;
    cfg.gfx.unrollFactor = testConfig.a2aUnrollFactor;
    int numSubExecs = testConfig.a2aNumSubExecs;
    int hasFail = 0;
    int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
    printf("Testing all-to-all XGMI copies %c", verbose ? '\n' : ' ');
    fflush(stdout);
    {
        double limit = testConfig.a2aCopyLimit * SFACTOR;
        // One fine-grained copy per ordered pair of distinct GPUs, executed
        // by the source GPU
        std::vector<Transfer> transfers;
        for (int i = 0; i < numGpuDevices; i++) {
            for (int j = 0; j < numGpuDevices; j++) {
                if (i == j) { continue; }
                Transfer t;
                t.numBytes = 256 * 1024 * 1024;
                t.numSubExecs = numSubExecs;
                t.exeDevice = {EXE_GPU_GFX, i};
                t.srcs = {{MEM_GPU_FINE, i}};
                t.dsts = {{MEM_GPU_FINE, j}};
                transfers.push_back(t);
            }
        }
        std::vector<std::pair<std::pair<int, int>, double>> fails;
        TransferBench::TestResults results;
        if (TransferBench::RunTransfers(cfg, transfers, results)) {
            // Results appear in the same order the transfers were built
            int transferIdx = 0;
            for (int i = 0; i < numGpuDevices; i++) {
                if (!verbose) { printf("."); }
                fflush(stdout);
                for (int j = 0; j < numGpuDevices; j++) {
                    if (i == j) { continue; }
                    double bw = results.tfrResults[transferIdx].avgBandwidthGbPerSec;
                    if (bw < limit) { fails.push_back(std::make_pair(std::make_pair(i, j), bw)); }
                    if (verbose) {
                        // Fixed: message previously contained a duplicated colon (": :")
                        printf("   GPU %02d to GPU %02d: Measured %6.2f Limit %6.2f\n",
                               i,
                               j,
                               bw,
                               limit);
                    }
                    transferIdx++;
                }
            }
        }
        if (fails.empty()) {
            printf("PASS\n");
        } else {
            hasFail = 1;
            printf("FAIL (%zu test(s))\n", fails.size());
            for (auto const& p : fails) {
                printf("   GPU %02d to GPU %02d: %6.2f GB/s Criteria: %6.2f GB/s\n",
                       p.first.first,
                       p.first.second,
                       p.second,
                       limit);
            }
        }
    }
    return hasFail;
}
int TestHbmPerformance(int modelId, bool verbose)
{
    // Measures local HBM bandwidth on every GPU for each of the
    // NUM_HBM_TESTS read/write/copy configurations and checks the results
    // against the model's per-test limits.
    // modelId : index into TestConfigs for the detected hardware model
    // verbose : when true, prints per-GPU measurements instead of dots
    // Returns 1 if any GPU fell below its limit in any sub-test, else 0.
    TestConfig const& testConfig = TestConfigs[modelId];
    int hasFail       = 0;
    int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
    char testname[50];
    for (int testId = 0; testId < NUM_HBM_TESTS; testId++) {
        TransferBench::ConfigOptions cfg;
        cfg.general.numIterations = 1000;
        cfg.general.numWarmups    = 50;
        cfg.gfx.blockSize         = testConfig.hbmBlockSize[testId];
        cfg.gfx.unrollFactor      = testConfig.hbmUnrollFactor[testId];
        cfg.gfx.temporalMode      = testConfig.hbmTemporalMode[testId];
        // snprintf (not sprintf): a long HbmTestConfigs name must not be able
        // to overflow the fixed-size testname buffer
        snprintf(testname,
                 sizeof(testname),
                 "Testing HBM performance [%s]",
                 HbmTestConfigs[testId].name.c_str());
        if (verbose) {
            printf("[Blocksize: %d Unroll: %d TemporalMode: %d]\n",
                   cfg.gfx.blockSize,
                   cfg.gfx.unrollFactor,
                   cfg.gfx.temporalMode);
        }
        printf("%-42s%c", testname, verbose ? '\n' : ' ');
        fflush(stdout);
        int numInputs  = HbmTestConfigs[testId].numInputs;
        int numOutputs = HbmTestConfigs[testId].numOutputs;
        double limit   = testConfig.hbmLimit[testId] * SFACTOR;
        std::vector<std::pair<int, double>> fails;
        TransferBench::TestResults results;
        // One local (GPU -> same GPU) transfer per device, sized per CU so
        // every subexecutor moves 16 MiB
        std::vector<Transfer> transfers;
        for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
            Transfer t;
            t.numSubExecs = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, gpuId});
            t.numBytes    = 16777216ULL * t.numSubExecs;
            t.exeDevice   = {EXE_GPU_GFX, gpuId};
            for (int i = 0; i < numInputs; i++) { t.srcs.push_back({MEM_GPU, gpuId}); }
            for (int i = 0; i < numOutputs; i++) { t.dsts.push_back({MEM_GPU, gpuId}); }
            transfers.push_back(t);
        }
        if (TransferBench::RunTransfers(cfg, transfers, results)) {
            for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
                if (!verbose) { printf("."); }
                fflush(stdout);
                double measuredBw = results.tfrResults[gpuId].avgBandwidthGbPerSec;
                if (measuredBw < limit) { fails.emplace_back(gpuId, measuredBw); }
                if (verbose) {
                    printf(" GPU %02d: Measured %6.2f Limit %6.2f\n", gpuId, measuredBw, limit);
                }
            }
        } else {
            PrintErrors(results.errResults);
        }
        if (fails.empty()) {
            printf("PASS\n");
        } else {
            hasFail = 1;
            printf("FAIL (%lu test(s))\n", fails.size());
            for (auto const& p : fails) {
                printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n",
                       p.first,
                       p.second,
                       limit);
            }
        }
    }
    return hasFail;
}
// Runs a condensed set of bandwidth sanity tests (HBM, unidirectional,
// bidirectional, all-to-all) for the detected hardware model, then exits the
// process with status 1 if any suite reported a failure, 0 otherwise.
// All parameters are unused; they exist to match the PresetFunc signature.
void HealthCheckPreset([[maybe_unused]] EnvVars& ev,
                       [[maybe_unused]] size_t const numBytesPerTransfer,
                       [[maybe_unused]] std::string const presetName)
{
    // Check for supported platforms
#if defined(__NVCC__)
    printf("[WARN] healthcheck preset not supported on NVIDIA hardware\n");
    return;
#endif
    // Disclaimer banner (adjacent string literals concatenate, so the bytes
    // written to stdout are identical to separate printf calls)
    printf("Disclaimer:\n"
           "==================================================================\n"
           "NOTE: This is an experimental feature and may be subject to change\n"
           "      Failures do not necessarily indicate hardware issues, as other factors\n"
           "      such as simultaneous workloads may influence results\n"
           "\n");
    // Custom env var for this preset
    int const verboseMode = EnvVars::GetEnvVar("VERBOSE", 0);
    // Determine if this is a supported model
    int const detectedModel = DetectModel();
    // Each Test* call prints its own report and returns 1 on failure;
    // keep the calls as separate statements so suites run (and print) in order
    int failCount = 0;
    failCount += TestHbmPerformance(detectedModel, verboseMode);
    failCount += TestUnidir(detectedModel, verboseMode);
    failCount += TestBidir(detectedModel, verboseMode);
    failCount += TestAllToAll(detectedModel, verboseMode);
    exit(failCount ? 1 : 0);
}
#endif // HEALTH_CHECK_PRESET_HPP
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef ONE_TO_ALL_PRESET_HPP
#define ONE_TO_ALL_PRESET_HPP
#include "EnvVars.hpp"
#include <cstddef>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>
// Benchmarks every subset of simultaneous transfers between one "executor"
// GPU (EXE_INDEX) and the other GPUs, sweeping the number of participating
// peers from SWEEP_MIN to SWEEP_MAX and sweeping SRC/EXE/DST memory-type
// characters. One result row is printed per peer subset.
// ev                  - environment-variable wrapper / display helper
// numBytesPerTransfer - bytes moved by each individual Transfer
// presetName          - unused; present to match the PresetFunc signature
void OneToAllPreset(EnvVars& ev,
                    size_t const numBytesPerTransfer,
                    [[maybe_unused]] std::string const presetName)
{
    int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
    if (numDetectedGpus < 2) {
        printf("[ERROR] One-to-all benchmark requires machine with at least 2 GPUs\n");
        exit(1);
    }
    // Collect env vars for this preset
    // sweepDir 0 appears to mean "executor-side memory is the fixed endpoint"
    // and 1 the reverse -- TODO confirm against TransferBench docs
    int numGpuDevices    = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
    int numSubExecs      = EnvVars::GetEnvVar("NUM_GPU_SE", 4);
    int exeIndex         = EnvVars::GetEnvVar("EXE_INDEX", 0);
    int sweepDir         = EnvVars::GetEnvVar("SWEEP_DIR", 0);
    std::string sweepDst = EnvVars::GetEnvVar("SWEEP_DST", "G");
    std::string sweepExe = EnvVars::GetEnvVar("SWEEP_EXE", "G");
    std::string sweepSrc = EnvVars::GetEnvVar("SWEEP_SRC", "G");
    int sweepMin         = EnvVars::GetEnvVar("SWEEP_MIN", 1);
    int sweepMax         = EnvVars::GetEnvVar("SWEEP_MAX", numGpuDevices);
    // Display environment variables
    ev.DisplayEnvVars();
    if (!ev.hideEnv) {
        if (!ev.outputToCsv) { printf("[One-To-All Related]\n"); }
        ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices);
        ev.Print("NUM_GPU_SE", numSubExecs, "Using %d subExecutors/CUs per Transfer", numSubExecs);
        ev.Print("EXE_INDEX", exeIndex, "Executing on GPU %d", exeIndex);
        ev.Print("SWEEP_DIR", sweepDir, "Direction of transfer");
        ev.Print("SWEEP_DST", sweepDst.c_str(), "DST memory types to sweep");
        ev.Print("SWEEP_EXE", sweepExe.c_str(), "Executor type to use");
        ev.Print("SWEEP_MAX", sweepMax, "Maximum number of peers");
        ev.Print("SWEEP_MIN", sweepMin, "Minimum number of peers");
        ev.Print("SWEEP_SRC", sweepSrc.c_str(), "SRC memory types to sweep");
        printf("\n");
    }
    // Perform validation
    // Only GFX ('G') and DMA ('D') executors are meaningful for this preset
    for (auto ch : sweepExe) {
        if (ch != 'G' && ch != 'D') {
            printf("[ERROR] Unrecognized executor type '%c' specified\n", ch);
            exit(1);
        }
    }
    TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
    TransferBench::TestResults results;
    char const sep = (ev.outputToCsv ? ',' : ' ');
    for (char src : sweepSrc) {
        for (char exe : sweepExe) {
            for (char dst : sweepDst) {
                // Skip invalid configurations
                // ('N' = null/no memory; DMA needs both endpoints, and at
                // least one endpoint must exist)
                if ((exe == 'D' && (src == 'N' || dst == 'N')) || (src == 'N' && dst == 'N')) {
                    continue;
                }
                // NOTE: the std::to_string/std::string temporaries below stay
                // alive until the end of this full printf expression, so the
                // c_str() pointers are valid when printf reads them
                printf("Executing (%c%s -> %c%d -> %c%s)\n",
                       src,
                       src == 'N' ? ""
                                  : (sweepDir == 0 ? std::to_string(exeIndex).c_str()
                                                   : std::string("*").c_str()),
                       exe,
                       exeIndex,
                       dst,
                       dst == 'N' ? ""
                       : sweepDir == 0 ? std::string("*").c_str()
                                       : std::to_string(exeIndex).c_str());
                // Column header: one column per possible peer GPU
                for (int i = 0; i < numGpuDevices; i++) {
                    if (i == exeIndex) { continue; }
                    printf(" GPU %-3d %c", i, sep);
                }
                printf("\n");
                if (!ev.outputToCsv) {
                    for (int i = 0; i < numGpuDevices - 1; i++) { printf("-------------"); }
                    printf("\n");
                }
                // Enumerate all subsets of peers with exactly p members;
                // bit i of bitmask selects GPU i as a peer (the executor
                // itself is never a peer)
                for (int p = sweepMin; p <= sweepMax; p++) {
                    for (int bitmask = 0; bitmask < (1 << numGpuDevices); bitmask++) {
                        if (bitmask & (1 << exeIndex) || __builtin_popcount(bitmask) != p) {
                            continue;
                        }
                        // Build one Transfer per selected peer, all executed
                        // by the same executor device
                        std::vector<Transfer> transfers;
                        for (int i = 0; i < numGpuDevices; i++) {
                            if (bitmask & (1 << i)) {
                                Transfer t;
                                CheckForError(
                                    TransferBench::CharToExeType(exe, t.exeDevice.exeType));
                                t.exeDevice.exeIndex = exeIndex;
                                t.exeSubIndex        = -1;
                                t.numSubExecs        = numSubExecs;
                                t.numBytes           = numBytesPerTransfer;
                                if (src == 'N') {
                                    t.srcs.clear();
                                } else {
                                    t.srcs.resize(1);
                                    CheckForError(
                                        TransferBench::CharToMemType(src, t.srcs[0].memType));
                                    t.srcs[0].memIndex = sweepDir == 0 ? exeIndex : i;
                                }
                                if (dst == 'N') {
                                    t.dsts.clear();
                                } else {
                                    t.dsts.resize(1);
                                    CheckForError(
                                        TransferBench::CharToMemType(dst, t.dsts[0].memType));
                                    t.dsts[0].memIndex = sweepDir == 0 ? i : exeIndex;
                                }
                                transfers.push_back(t);
                            }
                        }
                        if (!TransferBench::RunTransfers(cfg, transfers, results)) {
                            PrintErrors(results.errResults);
                            exit(1);
                        }
                        // Print bandwidth under each participating peer's
                        // column; non-participating peers get a blank cell.
                        // tfrResults is indexed in the same ascending-GPU
                        // order used to build transfers above.
                        int counter = 0;
                        for (int i = 0; i < numGpuDevices; i++) {
                            if (bitmask & (1 << i)) {
                                printf(" %8.3f %c",
                                       results.tfrResults[counter++].avgBandwidthGbPerSec,
                                       sep);
                            } else if (i != exeIndex) {
                                printf(" %c", sep);
                            }
                        }
                        printf(" %d %d", p, numSubExecs);
                        // Trailing summary of each Transfer in (SRC EXE DST) form
                        for (auto i = std::size_t(0); i < transfers.size(); i++) {
                            printf(" (%s %c%d %s)",
                                   MemDevicesToStr(transfers[i].srcs).c_str(),
                                   ExeTypeStr[transfers[i].exeDevice.exeType],
                                   transfers[i].exeDevice.exeIndex,
                                   MemDevicesToStr(transfers[i].dsts).c_str());
                        }
                        printf("\n");
                    }
                }
            }
        }
    }
}
#endif // ONE_TO_ALL_PRESET_HPP
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef PEER_TO_PEER_PRESET_HPP
#define PEER_TO_PEER_PRESET_HPP
#include "EnvVars.hpp"
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>
// Measures peer-to-peer memory bandwidth between every (CPU|GPU) src/dst
// device pair, printing one bandwidth matrix per direction mode
// (unidirectional and/or bidirectional, selected by P2P_MODE), optional
// per-iteration min/max/stddev rows, and per-category (CPU/GPU) averages.
// ev                  - environment-variable wrapper / display helper
// numBytesPerTransfer - bytes moved per direction of each Transfer
// presetName          - unused; present to match the PresetFunc signature
void PeerToPeerPreset(EnvVars& ev,
                      size_t const numBytesPerTransfer,
                      [[maybe_unused]] std::string const presetName)
{
    int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
    int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
    // Collect env vars for this preset
    // useDmaCopy:    1 = execute GPU transfers on DMA engines instead of GFX
    // p2pMode:       0 = run both passes, 1 = unidirectional only, 2 = bidirectional only
    // useRemoteRead: 1 = the executor sits on the DST side (remote read)
    int useDmaCopy    = EnvVars::GetEnvVar("USE_GPU_DMA", 0);
    int numCpuDevices = EnvVars::GetEnvVar("NUM_CPU_DEVICES", numDetectedCpus);
    int numCpuSubExecs = EnvVars::GetEnvVar("NUM_CPU_SE", 4);
    int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
    // DMA engines only need one subexecutor; GFX defaults to all CUs of GPU 0
    int numGpuSubExecs = EnvVars::GetEnvVar(
        "NUM_GPU_SE", useDmaCopy ? 1 : TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0}));
    int p2pMode       = EnvVars::GetEnvVar("P2P_MODE", 0);
    int useFineGrain  = EnvVars::GetEnvVar("USE_FINE_GRAIN", 0);
    int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
    // Display environment variables
    ev.DisplayEnvVars();
    if (!ev.hideEnv) {
        int outputToCsv = ev.outputToCsv;
        if (!outputToCsv) { printf("[P2P Related]\n"); }
        ev.Print("NUM_CPU_DEVICES", numCpuDevices, "Using %d CPUs", numCpuDevices);
        ev.Print("NUM_CPU_SE", numCpuSubExecs, "Using %d CPU threads per Transfer", numCpuSubExecs);
        ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices);
        ev.Print("NUM_GPU_SE",
                 numGpuSubExecs,
                 "Using %d GPU subexecutors/CUs per Transfer",
                 numGpuSubExecs);
        ev.Print("P2P_MODE",
                 p2pMode,
                 "Running %s transfers",
                 p2pMode == 0 ? "Uni + Bi"
                 : p2pMode == 1 ? "Unidirectional"
                                : "Bidirectional");
        ev.Print("USE_FINE_GRAIN",
                 useFineGrain,
                 "Using %s-grained memory",
                 useFineGrain ? "fine" : "coarse");
        ev.Print(
            "USE_GPU_DMA", useDmaCopy, "Using GPU-%s as GPU executor", useDmaCopy ? "DMA" : "GFX");
        ev.Print("USE_REMOTE_READ",
                 useRemoteRead,
                 "Using %s as executor",
                 useRemoteRead ? "DST" : "SRC");
        printf("\n");
    }
    char const separator = ev.outputToCsv ? ',' : ' ';
    printf("Bytes Per Direction%c%lu\n", separator, numBytesPerTransfer);
    TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
    TransferBench::TestResults results;
    // Collect the number of available CPUs/GPUs on this machine
    // Devices are indexed CPUs first [0, numCpuDevices), then GPUs
    int const numDevices = numCpuDevices + numGpuDevices;
    // Perform unidirectional / bidirectional
    for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++) {
        if (((p2pMode == 1) && (isBidirectional == 1)) ||
            ((p2pMode == 2) && (isBidirectional == 0))) {
            continue;
        }
        printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n",
               isBidirectional ? "Bi" : "Uni",
               useRemoteRead ? "Remote" : "Local",
               useRemoteRead ? "Local" : "Remote",
               useDmaCopy ? "DMA" : "GFX");
        // Print header
        if (isBidirectional) {
            printf("%12s", "SRC\\DST");
        } else {
            if (useRemoteRead) {
                printf("%12s", "SRC\\EXE+DST");
            } else {
                printf("%12s", "SRC+EXE\\DST");
            }
        }
        if (ev.outputToCsv) { printf(","); }
        for (int i = 0; i < numCpuDevices; i++) {
            printf("%7s %02d", "CPU", i);
            if (ev.outputToCsv) { printf(","); }
        }
        if (numCpuDevices > 0) { printf(" "); }
        for (int i = 0; i < numGpuDevices; i++) {
            printf("%7s %02d", "GPU", i);
            if (ev.outputToCsv) { printf(","); }
        }
        printf("\n");
        // Per-category bandwidth accumulators for the final averages table.
        // NOTE(review): indexed directly by MemType values -- assumes
        // MEM_CPU and MEM_GPU are 0 and 1; confirm against the enum
        double avgBwSum[2][2] = {};
        int avgCount[2][2] = {};
        ExeType const gpuExeType = useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
        // Loop over all possible src/dst pairs
        for (int src = 0; src < numDevices; src++) {
            MemType const srcType = (src < numCpuDevices ? MEM_CPU : MEM_GPU);
            int const srcIndex = (srcType == MEM_CPU ? src : src - numCpuDevices);
            // Swap in the fine-grained variant of the memory type if requested
            MemType const srcTypeActual = ((useFineGrain && srcType == MEM_CPU) ? MEM_CPU_FINE
                                           : (useFineGrain && srcType == MEM_GPU) ? MEM_GPU_FINE
                                                                                  : srcType);
            // Per-direction result rows for this src (index 0: SRC->DST,
            // index 1: DST->SRC when bidirectional)
            std::vector<std::vector<double>> avgBandwidth(isBidirectional + 1);
            std::vector<std::vector<double>> minBandwidth(isBidirectional + 1);
            std::vector<std::vector<double>> maxBandwidth(isBidirectional + 1);
            std::vector<std::vector<double>> stdDev(isBidirectional + 1);
            // Blank line between the CPU-source and GPU-source row groups
            if (src == numCpuDevices && src != 0) { printf("\n"); }
            for (int dst = 0; dst < numDevices; dst++) {
                MemType const dstType = (dst < numCpuDevices ? MEM_CPU : MEM_GPU);
                int const dstIndex = (dstType == MEM_CPU ? dst : dst - numCpuDevices);
                MemType const dstTypeActual = ((useFineGrain && dstType == MEM_CPU) ? MEM_CPU_FINE
                                               : (useFineGrain && dstType == MEM_GPU) ? MEM_GPU_FINE
                                                                                      : dstType);
                // Prepare Transfers
                std::vector<Transfer> transfers(isBidirectional + 1);
                // SRC -> DST
                transfers[0].numBytes = numBytesPerTransfer;
                transfers[0].srcs.push_back({srcTypeActual, srcIndex});
                transfers[0].dsts.push_back({dstTypeActual, dstIndex});
                // Executor lives on DST side for remote read, SRC side otherwise
                transfers[0].exeDevice = {
                    IsGpuMemType(useRemoteRead ? dstType : srcType) ? gpuExeType : EXE_CPU,
                    (useRemoteRead ? dstIndex : srcIndex)};
                transfers[0].exeSubIndex = -1;
                transfers[0].numSubExecs = (transfers[0].exeDevice.exeType == gpuExeType)
                                               ? numGpuSubExecs
                                               : numCpuSubExecs;
                // DST -> SRC
                if (isBidirectional) {
                    transfers[1].numBytes = numBytesPerTransfer;
                    transfers[1].srcs.push_back({dstTypeActual, dstIndex});
                    transfers[1].dsts.push_back({srcTypeActual, srcIndex});
                    transfers[1].exeDevice = {
                        IsGpuMemType(useRemoteRead ? srcType : dstType) ? gpuExeType : EXE_CPU,
                        (useRemoteRead ? srcIndex : dstIndex)};
                    transfers[1].exeSubIndex = -1;
                    transfers[1].numSubExecs = (transfers[1].exeDevice.exeType == gpuExeType)
                                                   ? numGpuSubExecs
                                                   : numCpuSubExecs;
                }
                bool skipTest = false;
                // Abort if executing on NUMA node with no CPUs
                for (int i = 0; i <= isBidirectional; i++) {
                    if (transfers[i].exeDevice.exeType == EXE_CPU &&
                        TransferBench::GetNumSubExecutors(transfers[i].exeDevice) == 0) {
                        skipTest = true;
                        break;
                    }
#if defined(__NVCC__)
                    // NVIDIA platform cannot access GPU memory directly from CPU executors
                    if (transfers[i].exeDevice.exeType == EXE_CPU &&
                        (IsGpuMemType(srcType) || IsGpuMemType(dstType))) {
                        skipTest = true;
                        break;
                    }
#endif
                }
                // Bidirectional self-transfer is meaningless; skip it
                if (isBidirectional && srcType == dstType && srcIndex == dstIndex) {
                    skipTest = true;
                }
                if (!skipTest) {
                    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
                        for (auto const& err : results.errResults) {
                            printf("%s\n", err.errMsg.c_str());
                        }
                        exit(1);
                    }
                    for (int dir = 0; dir <= isBidirectional; dir++) {
                        double const avgBw = results.tfrResults[dir].avgBandwidthGbPerSec;
                        avgBandwidth[dir].push_back(avgBw);
                        // Self-transfers are excluded from the category averages
                        if (!(srcType == dstType && srcIndex == dstIndex)) {
                            avgBwSum[srcType][dstType] += avgBw;
                            avgCount[srcType][dstType]++;
                        }
                        if (ev.showIterations) {
                            // Derive min/max bandwidth and stddev from the
                            // per-iteration timings (msec -> GB/s)
                            double minTime = results.tfrResults[dir].perIterMsec[0];
                            double maxTime = minTime;
                            double varSum = 0;
                            for (auto i = std::size_t(0);
                                 i < results.tfrResults[dir].perIterMsec.size();
                                 i++) {
                                minTime = std::min(minTime, results.tfrResults[dir].perIterMsec[i]);
                                maxTime = std::max(maxTime, results.tfrResults[dir].perIterMsec[i]);
                                double const bw = (transfers[dir].numBytes / 1.0E9) /
                                                  results.tfrResults[dir].perIterMsec[i] * 1000.0f;
                                double const delta = (avgBw - bw);
                                varSum += delta * delta;
                            }
                            double const minBw = (transfers[dir].numBytes / 1.0E9) / maxTime *
                                                 1000.0f;
                            double const maxBw = (transfers[dir].numBytes / 1.0E9) / minTime *
                                                 1000.0f;
                            double const stdev = sqrt(varSum /
                                                      results.tfrResults[dir].perIterMsec.size());
                            minBandwidth[dir].push_back(minBw);
                            maxBandwidth[dir].push_back(maxBw);
                            stdDev[dir].push_back(stdev);
                        }
                    }
                } else {
                    // Keep the row vectors aligned with dst by recording
                    // sentinel values for skipped pairs (printed as "N/A")
                    for (int dir = 0; dir <= isBidirectional; dir++) {
                        avgBandwidth[dir].push_back(0);
                        minBandwidth[dir].push_back(0);
                        maxBandwidth[dir].push_back(0);
                        stdDev[dir].push_back(-1.0);
                    }
                }
            }
            // Print one row per direction for this src device
            for (int dir = 0; dir <= isBidirectional; dir++) {
                printf("%5s %02d %3s",
                       (srcType == MEM_CPU) ? "CPU" : "GPU",
                       srcIndex,
                       dir ? "<- " : " ->");
                if (ev.outputToCsv) { printf(","); }
                for (int dst = 0; dst < numDevices; dst++) {
                    if (dst == numCpuDevices && dst != 0) { printf(" "); }
                    double const avgBw = avgBandwidth[dir][dst];
                    if (avgBw == 0.0) {
                        printf("%10s", "N/A");
                    } else {
                        printf("%10.2f", avgBw);
                    }
                    if (ev.outputToCsv) { printf(","); }
                }
                printf("\n");
                if (ev.showIterations) {
                    // minBw
                    printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "min");
                    if (ev.outputToCsv) { printf(","); }
                    for (int i = 0; i < numDevices; i++) {
                        double const minBw = minBandwidth[dir][i];
                        if (i == numCpuDevices && i != 0) { printf(" "); }
                        if (minBw == 0.0) {
                            printf("%10s", "N/A");
                        } else {
                            printf("%10.2f", minBw);
                        }
                        if (ev.outputToCsv) { printf(","); }
                    }
                    printf("\n");
                    // maxBw
                    printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "max");
                    if (ev.outputToCsv) { printf(","); }
                    for (int i = 0; i < numDevices; i++) {
                        double const maxBw = maxBandwidth[dir][i];
                        if (i == numCpuDevices && i != 0) { printf(" "); }
                        if (maxBw == 0.0) {
                            printf("%10s", "N/A");
                        } else {
                            printf("%10.2f", maxBw);
                        }
                        if (ev.outputToCsv) { printf(","); }
                    }
                    printf("\n");
                    // stddev
                    printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, " sd");
                    if (ev.outputToCsv) { printf(","); }
                    for (int i = 0; i < numDevices; i++) {
                        double const sd = stdDev[dir][i];
                        if (i == numCpuDevices && i != 0) { printf(" "); }
                        if (sd == -1.0) {
                            printf("%10s", "N/A");
                        } else {
                            printf("%10.2f", sd);
                        }
                        if (ev.outputToCsv) { printf(","); }
                    }
                    printf("\n");
                }
                fflush(stdout);
            }
            // Combined (sum of both directions) row for bidirectional mode
            if (isBidirectional) {
                printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "<->");
                if (ev.outputToCsv) { printf(","); }
                for (int dst = 0; dst < numDevices; dst++) {
                    double const sumBw = avgBandwidth[0][dst] + avgBandwidth[1][dst];
                    if (dst == numCpuDevices && dst != 0) { printf(" "); }
                    if (sumBw == 0.0) {
                        printf("%10s", "N/A");
                    } else {
                        printf("%10.2f", sumBw);
                    }
                    if (ev.outputToCsv) { printf(","); }
                }
                printf("\n");
                if (src < numDevices - 1) { printf("\n"); }
            }
        }
        // Per-category (CPU->CPU, CPU->GPU, GPU->CPU, GPU->GPU) averages
        if (!ev.outputToCsv) {
            printf("                         ");
            for (int srcType : {MEM_CPU, MEM_GPU}) {
                for (int dstType : {MEM_CPU, MEM_GPU}) {
                    printf("  %cPU->%cPU",
                           srcType == MEM_CPU ? 'C' : 'G',
                           dstType == MEM_CPU ? 'C' : 'G');
                }
            }
            printf("\n");
            printf("Averages (During %s):", isBidirectional ? "  BiDir" : " UniDir");
            for (int srcType : {MEM_CPU, MEM_GPU}) {
                for (int dstType : {MEM_CPU, MEM_GPU}) {
                    if (avgCount[srcType][dstType]) {
                        printf("%10.2f", avgBwSum[srcType][dstType] / avgCount[srcType][dstType]);
                    } else {
                        printf("%10s", "N/A");
                    }
                }
            }
            printf("\n\n");
        }
    }
}
#endif // PEER_TO_PEER_PRESET_HPP
...@@ -32,43 +32,49 @@ THE SOFTWARE. ...@@ -32,43 +32,49 @@ THE SOFTWARE.
#include "Scaling.hpp" #include "Scaling.hpp"
#include "Schmoo.hpp" #include "Schmoo.hpp"
#include "Sweep.hpp" #include "Sweep.hpp"
#include <map> #include <map>
#include <utility>
typedef void (*PresetFunc)(EnvVars& ev, typedef void (*PresetFunc)(EnvVars& ev,
size_t const numBytesPerTransfer, size_t const numBytesPerTransfer,
std::string const presetName); std::string const presetName);
std::map<std::string, std::pair<PresetFunc, std::string>> presetFuncMap = std::map<std::string, std::pair<PresetFunc, std::string>> presetFuncMap = {
{ {"a2a", {AllToAllPreset, "Tests parallel transfers between all pairs of GPU devices"}},
{"a2a", {AllToAllPreset, "Tests parallel transfers between all pairs of GPU devices"}}, {"a2a_n",
{"a2a_n", {AllToAllRdmaPreset, "Tests parallel transfers between all pairs of GPU devices using Nearest NIC RDMA transfers"}}, {AllToAllRdmaPreset,
{"a2asweep", {AllToAllSweepPreset, "Test GFX-based all-to-all transfers swept across different CU and GFX unroll counts"}}, "Tests parallel transfers between all pairs of GPU devices using Nearest NIC RDMA "
{"healthcheck", {HealthCheckPreset, "Simple bandwidth health check (MI300X series only)"}}, "transfers"}},
{"one2all", {OneToAllPreset, "Test all subsets of parallel transfers from one GPU to all others"}}, {"a2asweep",
{"p2p" , {PeerToPeerPreset, " Peer-to-peer device memory bandwidth test"}}, {AllToAllSweepPreset,
{"rsweep", {SweepPreset, "Randomly sweep through sets of Transfers"}}, "Test GFX-based all-to-all transfers swept across different CU and GFX unroll counts"}},
{"scaling", {ScalingPreset, "Run scaling test from one GPU to other devices"}}, {"healthcheck", {HealthCheckPreset, "Simple bandwidth health check (MI300X series only)"}},
{"schmoo", {SchmooPreset, "Scaling tests for local/remote read/write/copy"}}, {"one2all",
{"sweep", {SweepPreset, "Ordered sweep through sets of Transfers"}}, {OneToAllPreset, "Test all subsets of parallel transfers from one GPU to all others"}},
{"p2p", {PeerToPeerPreset, " Peer-to-peer device memory bandwidth test"}},
{"rsweep", {SweepPreset, "Randomly sweep through sets of Transfers"}},
{"scaling", {ScalingPreset, "Run scaling test from one GPU to other devices"}},
{"schmoo", {SchmooPreset, "Scaling tests for local/remote read/write/copy"}},
{"sweep", {SweepPreset, "Ordered sweep through sets of Transfers"}},
}; };
void DisplayPresets() void DisplayPresets()
{ {
printf("\nAvailable Preset Benchmarks:\n"); printf("\nAvailable Preset Benchmarks:\n");
printf("============================\n"); printf("============================\n");
for (auto const& x : presetFuncMap) for (auto const& x : presetFuncMap) {
printf(" %15s - %s\n", x.first.c_str(), x.second.second.c_str()); printf(" %15s - %s\n", x.first.c_str(), x.second.second.c_str());
}
} }
int RunPreset(EnvVars& ev, int RunPreset(EnvVars& ev, size_t const numBytesPerTransfer, int const argc, char** const argv)
size_t const numBytesPerTransfer,
int const argc,
char** const argv)
{ {
std::string preset = (argc > 1 ? argv[1] : ""); std::string preset = (argc > 1 ? argv[1] : "");
if (presetFuncMap.count(preset)) { if (presetFuncMap.count(preset)) {
(presetFuncMap[preset].first)(ev, numBytesPerTransfer, preset); (presetFuncMap[preset].first)(ev, numBytesPerTransfer, preset);
return 1; return 1;
} }
return 0; return 0;
} }
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef SCALING_PRESET_HPP
#define SCALING_PRESET_HPP
#include "EnvVars.hpp"
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>
// Sweeps the number of GPU subexecutors (CUs) used by a single Transfer from
// one "local" GPU to every other device, reporting per-device bandwidth for
// each CU count plus the best (bandwidth, CU count) pair per device.
// ev                  - environment-variable wrapper / display helper
// numBytesPerTransfer - bytes moved by the Transfer
// presetName          - unused; present to match the PresetFunc signature
void ScalingPreset(EnvVars& ev,
                   size_t const numBytesPerTransfer,
                   [[maybe_unused]] std::string const presetName)
{
    int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
    int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
    // Collect env vars for this preset
    int localIdx      = EnvVars::GetEnvVar("LOCAL_IDX", 0);
    int numCpuDevices = EnvVars::GetEnvVar("NUM_CPU_DEVICES", numDetectedCpus);
    int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
    int sweepMax      = EnvVars::GetEnvVar("SWEEP_MAX", 32);
    int sweepMin      = EnvVars::GetEnvVar("SWEEP_MIN", 1);
    // NOTE(review): collected but never applied here -- SchmooPreset uses this
    // to select MEM_GPU_FINE; confirm whether scaling should honor it too
    [[maybe_unused]] int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN", 0);
    // Display environment variables
    ev.DisplayEnvVars();
    if (!ev.hideEnv) {
        int outputToCsv = ev.outputToCsv;
        // Fixed copy-paste label: this section previously printed "[Schmoo Related]"
        if (!outputToCsv) { printf("[Scaling Related]\n"); }
        ev.Print("LOCAL_IDX", localIdx, "Local GPU index");
        ev.Print("SWEEP_MAX", sweepMax, "Max number of subExecutors to use");
        ev.Print("SWEEP_MIN", sweepMin, "Min number of subExecutors to use");
        printf("\n");
    }
    // Validate env vars
    if (localIdx >= numDetectedGpus) {
        printf("[ERROR] Cannot execute scaling test with local GPU device %d\n", localIdx);
        exit(1);
    }
    TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
    TransferBench::TestResults results;
    char separator = (ev.outputToCsv ? ',' : ' ');
    int numDevices = numCpuDevices + numGpuDevices;
    printf("GPU-GFX Scaling benchmark:\n");
    printf("==========================\n");
    printf("- Copying %lu bytes from GPU %d to other devices\n", numBytesPerTransfer, localIdx);
    printf("- All numbers reported as GB/sec\n\n");
    printf("NumCUs");
    // Column header: CPUs are listed first; GPU indices restart after the CPUs
    for (int i = 0; i < numDevices; i++) {
        printf("%c %s%02d ",
               separator,
               i < numCpuDevices ? "CPU" : "GPU",
               i < numCpuDevices ? i : i - numCpuDevices);
    }
    printf("\n");
    // bestResult[i] = (best bandwidth seen for device i, CU count achieving it);
    // value-initialized to (0.0, 0) so any real measurement replaces it
    std::vector<std::pair<double, int>> bestResult(numDevices);
    std::vector<Transfer> transfers(1);
    Transfer& t   = transfers[0];
    t.exeDevice   = {EXE_GPU_GFX, localIdx};
    t.exeSubIndex = -1;
    t.numBytes    = numBytesPerTransfer;
    t.srcs        = {{MEM_GPU, localIdx}};
    for (int numSubExec = sweepMin; numSubExec <= sweepMax; numSubExec++) {
        t.numSubExecs = numSubExec;
        printf("%4d ", numSubExec);
        for (int i = 0; i < numDevices; i++) {
            t.dsts = {
                {i < numCpuDevices ? MEM_CPU : MEM_GPU, i < numCpuDevices ? i : i - numCpuDevices}};
            // Qualified call for consistency with the other presets
            if (!TransferBench::RunTransfers(cfg, transfers, results)) {
                PrintErrors(results.errResults);
                exit(1);
            }
            double bw = results.tfrResults[0].avgBandwidthGbPerSec;
            printf("%c%7.2f ", separator, bw);
            if (bw > bestResult[i].first) {
                bestResult[i].first  = bw;
                bestResult[i].second = numSubExec;
            }
        }
        printf("\n");
    }
    printf(" Best ");
    for (int i = 0; i < numDevices; i++) {
        printf("%c%7.2f(%3d)", separator, bestResult[i].first, bestResult[i].second);
    }
    printf("\n");
}
#endif // SCALING_PRESET_HPP
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef SCHMOO_PRESET_HPP
#define SCHMOO_PRESET_HPP
#include "EnvVars.hpp"
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>
// Sweeps the number of CUs used by a single Transfer across six patterns
// (local/remote x read/write/copy) between a local and a remote GPU,
// printing one bandwidth table row per CU count.
// ev                  - environment-variable wrapper / display helper
// numBytesPerTransfer - bytes moved by the Transfer
// presetName          - unused; present to match the PresetFunc signature
void SchmooPreset(EnvVars& ev,
                  size_t const numBytesPerTransfer,
                  [[maybe_unused]] std::string const presetName)
{
    int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
    if (numDetectedGpus < 2) {
        printf("[ERROR] Schmoo benchmark requires at least 2 GPUs\n");
        exit(1);
    }
    // Collect env vars for this preset
    int localIdx     = EnvVars::GetEnvVar("LOCAL_IDX", 0);
    int remoteIdx    = EnvVars::GetEnvVar("REMOTE_IDX", 1);
    int sweepMax     = EnvVars::GetEnvVar("SWEEP_MAX", 32);
    int sweepMin     = EnvVars::GetEnvVar("SWEEP_MIN", 1);
    int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN", 0);
    // Display environment variables
    ev.DisplayEnvVars();
    if (!ev.hideEnv) {
        int outputToCsv = ev.outputToCsv;
        if (!outputToCsv) { printf("[Schmoo Related]\n"); }
        ev.Print("LOCAL_IDX", localIdx, "Local GPU index");
        ev.Print("REMOTE_IDX", remoteIdx, "Remote GPU index");
        ev.Print("SWEEP_MAX", sweepMax, "Max number of subExecutors to use");
        ev.Print("SWEEP_MIN", sweepMin, "Min number of subExecutors to use");
        ev.Print("USE_FINE_GRAIN",
                 useFineGrain,
                 "Using %s-grained memory",
                 useFineGrain ? "fine" : "coarse");
        printf("\n");
    }
    // Validate env vars
    if (localIdx >= numDetectedGpus || remoteIdx >= numDetectedGpus) {
        printf(
            "[ERROR] Cannot execute schmoo test with local GPU device %d, remote GPU device %d\n",
            localIdx,
            remoteIdx);
        exit(1);
    }
    TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
    TransferBench::TestResults results;
    char memChar = useFineGrain ? 'F' : 'G';
    printf("Bytes to transfer: %lu Local GPU: %d Remote GPU: %d\n",
           numBytesPerTransfer,
           localIdx,
           remoteIdx);
    printf(
        " | Local Read | Local Write | Local Copy | Remote Read | Remote Write| Remote "
        "Copy |\n");
    printf(
        " #CUs "
        "|%c%02d->G%02d->N00|N00->G%02d->%c%02d|%c%02d->G%02d->%c%02d|%c%02d->G%02d->N00|N00->G%"
        "02d->%c%02d|%c%02d->G%02d->%c%02d|\n",
        memChar,
        localIdx,
        localIdx,
        localIdx,
        memChar,
        localIdx,
        memChar,
        localIdx,
        localIdx,
        memChar,
        localIdx,
        memChar,
        remoteIdx,
        localIdx,
        localIdx,
        memChar,
        remoteIdx,
        memChar,
        localIdx,
        localIdx,
        memChar,
        remoteIdx);
    printf(
        "|------|-------------|-------------|-------------|-------------|-------------|------------"
        "-|\n");
    std::vector<Transfer> transfers(1);
    Transfer& t   = transfers[0];
    t.exeDevice   = {EXE_GPU_GFX, localIdx};
    t.exeSubIndex = -1;
    t.numBytes    = numBytesPerTransfer;
    MemType memType = (useFineGrain ? MEM_GPU_FINE : MEM_GPU);
    // Runs the currently configured Transfer and returns its bandwidth;
    // aborts the preset on any library error
    auto measureBw = [&]() -> double {
        if (!RunTransfers(cfg, transfers, results)) {
            PrintErrors(results.errResults);
            exit(1);
        }
        return results.tfrResults[0].avgBandwidthGbPerSec;
    };
    for (int numCUs = sweepMin; numCUs <= sweepMax; numCUs++) {
        t.numSubExecs = numCUs;
        // Local Read (no destination)
        t.srcs = {{memType, localIdx}};
        t.dsts = {};
        double const localRead = measureBw();
        // Local Write (no source)
        t.srcs = {};
        t.dsts = {{memType, localIdx}};
        double const localWrite = measureBw();
        // Local Copy
        // BUGFIX: the previous code immediately overwrote this src/dst setup
        // with the Local Write configuration, so the value reported in the
        // "Local Copy" column was actually a second local write measurement
        t.srcs = {{memType, localIdx}};
        t.dsts = {{memType, localIdx}};
        double const localCopy = measureBw();
        // Remote Read
        t.srcs = {{memType, remoteIdx}};
        t.dsts = {};
        double const remoteRead = measureBw();
        // Remote Write
        t.srcs = {};
        t.dsts = {{memType, remoteIdx}};
        double const remoteWrite = measureBw();
        // Remote Copy
        t.srcs = {{memType, localIdx}};
        t.dsts = {{memType, remoteIdx}};
        double const remoteCopy = measureBw();
        printf(" %3d %11.3f %11.3f %11.3f %11.3f %11.3f %11.3f \n",
               numCUs,
               localRead,
               localWrite,
               localCopy,
               remoteRead,
               remoteWrite,
               remoteCopy);
    }
}
#endif // SCHMOO_PRESET_HPP
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef SWEEP_PRESET_HPP
#define SWEEP_PRESET_HPP
#include "EnvVars.hpp"
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>
/// @brief Appends one test's Transfer list to the sweep configuration file
///
/// Writes a "# Test N" header line followed by a single config line describing
/// every Transfer in this test, then flushes so the file stays usable even if
/// the sweep is interrupted. No-op when @p fp is null (file could not be opened).
///
/// @param fp        Open configuration file, or nullptr to skip logging
/// @param testNum   1-based test number used in the header comment
/// @param transfers Transfers executed for this test
void LogTransfers(FILE* fp, int const testNum, std::vector<Transfer> const& transfers)
{
    if (!fp) { return; }

    fprintf(fp, "# Test %d\n", testNum);
    // Leading negative count — presumably flags the "advanced" config-line
    // format to the TransferBench config parser; confirm against the parser
    fprintf(fp, "%d", -1 * (int)transfers.size());
    for (auto const& t : transfers) {
        fprintf(fp,
                " (%s->%c%d->%s %d %lu)",
                MemDevicesToStr(t.srcs).c_str(),
                ExeTypeStr[t.exeDevice.exeType],
                t.exeDevice.exeIndex,
                MemDevicesToStr(t.dsts).c_str(),
                t.numSubExecs,
                t.numBytes);
    }
    fprintf(fp, "\n");
    fflush(fp);
}
/// @brief Runs the "sweep" / "rsweep" preset
///
/// Builds the list of every valid (SRC -> EXE -> DST) triplet permitted by the
/// SWEEP_* environment variables (memory/executor type filters plus XGMI hop
/// restrictions), then repeatedly executes subsets of those triplets as
/// simultaneous Transfers:
///   - "sweep"  walks subsets deterministically via bitmask permutations,
///              growing the subset size from SWEEP_MIN up to SWEEP_MAX
///   - "rsweep" picks a random subset size and random members each iteration
/// Each executed configuration is also appended to SWEEP_FILE for later replay.
///
/// @param ev                  Environment variable state (config options / printing)
/// @param numBytesPerTransfer Bytes per Transfer (upper bound when SWEEP_RAND_BYTES=1)
/// @param presetName          "rsweep" selects randomized sweeping; any other
///                            value runs the deterministic sweep
void SweepPreset(EnvVars& ev, size_t const numBytesPerTransfer, std::string const presetName)
{
    bool const isRandom = (presetName == "rsweep");

    int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
    int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);

    // Collect env vars and set defaults
    int continueOnErr     = EnvVars::GetEnvVar("CONTINUE_ON_ERROR", 0);
    int numCpuDevices     = EnvVars::GetEnvVar("NUM_CPU_DEVICES", numDetectedCpus);
    int numCpuSubExecs    = EnvVars::GetEnvVar("NUM_CPU_SE", 4);
    int numGpuDevices     = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
    int numGpuSubExecs    = EnvVars::GetEnvVar("NUM_GPU_SE", 4);
    std::string sweepDst  = EnvVars::GetEnvVar("SWEEP_DST", "CG");
    std::string sweepExe  = EnvVars::GetEnvVar("SWEEP_EXE", "CDG");
    std::string sweepFile = EnvVars::GetEnvVar("SWEEP_FILE", "/tmp/lastSweep.cfg");
    int sweepMax          = EnvVars::GetEnvVar("SWEEP_MAX", 24);
    int sweepMin          = EnvVars::GetEnvVar("SWEEP_MIN", 1);
    int sweepRandBytes    = EnvVars::GetEnvVar("SWEEP_RAND_BYTES", 0);
    int sweepSeed         = EnvVars::GetEnvVar("SWEEP_SEED", time(NULL));
    std::string sweepSrc  = EnvVars::GetEnvVar("SWEEP_SRC", "CG");
    int sweepTestLimit    = EnvVars::GetEnvVar("SWEEP_TEST_LIMIT", 0);
    int sweepTimeLimit    = EnvVars::GetEnvVar("SWEEP_TIME_LIMIT", 0);
    int sweepXgmiMin      = EnvVars::GetEnvVar("SWEEP_XGMI_MIN", 0);
    int sweepXgmiMax      = EnvVars::GetEnvVar("SWEEP_XGMI_MAX", -1);

    // Own the RNG by value; it was previously heap-allocated with a raw 'new'
    // and never deleted (memory leak)
    std::default_random_engine generator(sweepSeed);

    // Display env var settings
    ev.DisplayEnvVars();
    if (!ev.hideEnv) {
        int outputToCsv = ev.outputToCsv;
        if (!outputToCsv) { printf("[Sweep Related]\n"); }
        ev.Print("CONTINUE_ON_ERROR",
                 continueOnErr,
                 continueOnErr ? "Continue on mismatch error" : "Stop after first error");
        ev.Print("NUM_CPU_DEVICES", numCpuDevices, "Using %d CPUs", numCpuDevices);
        ev.Print("NUM_CPU_SE",
                 numCpuSubExecs,
                 "Using %d CPU threads per CPU executed Transfer",
                 numCpuSubExecs);
        ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices);
        ev.Print("NUM_GPU_SE",
                 numGpuSubExecs,
                 "Using %d subExecutors/CUs per GPU executed Transfer",
                 numGpuSubExecs);
        ev.Print("SWEEP_DST", sweepDst.c_str(), "Destination Memory Types to sweep");
        ev.Print("SWEEP_EXE", sweepExe.c_str(), "Executor Types to sweep");
        ev.Print(
            "SWEEP_FILE", sweepFile.c_str(), "File to store the executing sweep configuration");
        ev.Print("SWEEP_MAX", sweepMax, "Max simultaneous transfers (0 = no limit)");
        // (typo fix: previously printed "simultaenous")
        ev.Print("SWEEP_MIN", sweepMin, "Min simultaneous transfers");
        ev.Print("SWEEP_RAND_BYTES",
                 sweepRandBytes,
                 "Using %s number of bytes per Transfer",
                 (sweepRandBytes ? "random" : "constant"));
        ev.Print("SWEEP_SEED", sweepSeed, "Random seed set to %d", sweepSeed);
        ev.Print("SWEEP_SRC", sweepSrc.c_str(), "Source Memory Types to sweep");
        ev.Print("SWEEP_TEST_LIMIT",
                 sweepTestLimit,
                 "Max number of tests to run during sweep (0 = no limit)");
        ev.Print("SWEEP_TIME_LIMIT",
                 sweepTimeLimit,
                 "Max number of seconds to run sweep for (0 = no limit)");
        ev.Print("SWEEP_XGMI_MAX",
                 sweepXgmiMax,
                 "Max number of XGMI hops for Transfers (-1 = no limit)");
        ev.Print("SWEEP_XGMI_MIN", sweepXgmiMin, "Min number of XGMI hops for Transfers");
        printf("\n");
    }

    // Validate env vars: every type character must be recognized and appear only once
    for (auto ch : sweepSrc) {
        if (!strchr(MemTypeStr, ch)) {
            printf("[ERROR] Unrecognized memory type '%c' specified for sweep source\n", ch);
            exit(1);
        }
        if (strchr(sweepSrc.c_str(), ch) != strrchr(sweepSrc.c_str(), ch)) {
            printf("[ERROR] Duplicate memory type '%c' specified for sweep source\n", ch);
            exit(1);
        }
    }
    for (auto ch : sweepDst) {
        if (!strchr(MemTypeStr, ch)) {
            printf("[ERROR] Unrecognized memory type '%c' specified for sweep destination\n", ch);
            exit(1);
        }
        if (strchr(sweepDst.c_str(), ch) != strrchr(sweepDst.c_str(), ch)) {
            printf("[ERROR] Duplicate memory type '%c' specified for sweep destination\n", ch);
            exit(1);
        }
    }
    for (auto ch : sweepExe) {
        if (!strchr(ExeTypeStr, ch)) {
            printf("[ERROR] Unrecognized executor type '%c' specified for sweep executor\n", ch);
            exit(1);
        }
        if (strchr(sweepExe.c_str(), ch) != strrchr(sweepExe.c_str(), ch)) {
            printf("[ERROR] Duplicate executor type '%c' specified for sweep executor\n", ch);
            exit(1);
        }
    }

    TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
    TransferBench::TestResults results;

    // Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets)
    std::vector<ExeDevice> exeList;
    for (auto exe : sweepExe) {
        ExeType exeType;
        CharToExeType(exe, exeType);
        if (IsGpuExeType(exeType)) {
            for (int exeIndex = 0; exeIndex < numGpuDevices; ++exeIndex) {
                exeList.push_back({exeType, exeIndex});
            }
        } else if (IsCpuExeType(exeType)) {
            for (int exeIndex = 0; exeIndex < numCpuDevices; ++exeIndex) {
                // Skip NUMA nodes that have no CPUs (e.g. CXL)
                if (TransferBench::GetNumSubExecutors({EXE_CPU, exeIndex}) == 0) { continue; }
                exeList.push_back({exeType, exeIndex});
            }
        }
    }
    int numExes = exeList.size();

    std::vector<MemDevice> srcList;
    for (auto src : sweepSrc) {
        MemType srcType;
        CharToMemType(src, srcType);
        // MEM_NULL contributes a single "no source" placeholder entry
        int const numDevices = (srcType == MEM_NULL)  ? 1
                               : IsGpuMemType(srcType) ? numGpuDevices
                                                       : numCpuDevices;
        for (int srcIndex = 0; srcIndex < numDevices; ++srcIndex) {
            srcList.push_back({srcType, srcIndex});
        }
    }
    int numSrcs = srcList.size();

    std::vector<MemDevice> dstList;
    for (auto dst : sweepDst) {
        MemType dstType;
        CharToMemType(dst, dstType);
        int const numDevices = (dstType == MEM_NULL)  ? 1
                               : IsGpuMemType(dstType) ? numGpuDevices
                                                       : numCpuDevices;
        for (int dstIndex = 0; dstIndex < numDevices; ++dstIndex) {
            dstList.push_back({dstType, dstIndex});
        }
    }
    int numDsts = dstList.size();

    // Build array of possibilities, respecting any additional restrictions (e.g. XGMI hop count)
    struct TransferInfo
    {
        MemDevice srcMem;
        ExeDevice exeDevice;
        MemDevice dstMem;
    };

    // If either XGMI minimum is non-zero, or XGMI maximum is specified and non-zero then both
    // links must be XGMI
    bool const useXgmiOnly = (sweepXgmiMin > 0 || sweepXgmiMax > 0);

    std::vector<TransferInfo> possibleTransfers;
    TransferInfo tinfo;
    for (int i = 0; i < numExes; ++i) {
        // Skip CPU executors if XGMI link must be used
        if (useXgmiOnly && !IsGpuExeType(exeList[i].exeType)) { continue; }
        tinfo.exeDevice = exeList[i];

        bool isXgmiSrc = false;
        int numHopsSrc = 0;
        for (int j = 0; j < numSrcs; ++j) {
            if (IsGpuExeType(exeList[i].exeType) && IsGpuMemType(srcList[j].memType)) {
                if (exeList[i].exeIndex != srcList[j].memIndex) {
#if defined(__NVCC__)
                    isXgmiSrc = false;
#else
                    uint32_t exeToSrcLinkType, exeToSrcHopCount;
                    HIP_CALL(hipExtGetLinkTypeAndHopCount(exeList[i].exeIndex,
                                                          srcList[j].memIndex,
                                                          &exeToSrcLinkType,
                                                          &exeToSrcHopCount));
                    isXgmiSrc = (exeToSrcLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
                    if (isXgmiSrc) { numHopsSrc = exeToSrcHopCount; }
#endif
                } else {
                    // Same-device access counts as XGMI with zero hops
                    isXgmiSrc  = true;
                    numHopsSrc = 0;
                }
                // Skip this SRC if it is not XGMI but only XGMI links may be used
                if (useXgmiOnly && !isXgmiSrc) { continue; }
                // Skip this SRC if XGMI distance is already past limit
                if (sweepXgmiMax >= 0 && isXgmiSrc && numHopsSrc > sweepXgmiMax) { continue; }
            } else if (srcList[j].memType != MEM_NULL && useXgmiOnly) {
                continue;
            }
            tinfo.srcMem = srcList[j];

            bool isXgmiDst = false;
            int numHopsDst = 0;
            for (int k = 0; k < numDsts; ++k) {
                if (IsGpuExeType(exeList[i].exeType) && IsGpuMemType(dstList[k].memType)) {
                    if (exeList[i].exeIndex != dstList[k].memIndex) {
#if defined(__NVCC__)
                        // No XGMI link info on NVIDIA platform
                        // (bug fix: this previously cleared isXgmiSrc instead of isXgmiDst)
                        isXgmiDst = false;
#else
                        uint32_t exeToDstLinkType, exeToDstHopCount;
                        HIP_CALL(hipExtGetLinkTypeAndHopCount(exeList[i].exeIndex,
                                                              dstList[k].memIndex,
                                                              &exeToDstLinkType,
                                                              &exeToDstHopCount));
                        isXgmiDst = (exeToDstLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
                        if (isXgmiDst) { numHopsDst = exeToDstHopCount; }
#endif
                    } else {
                        isXgmiDst  = true;
                        numHopsDst = 0;
                    }
                }
                // Skip this DST if it is not XGMI but only XGMI links may be used
                if (dstList[k].memType != MEM_NULL && useXgmiOnly && !isXgmiDst) { continue; }
                // Skip this DST if total XGMI distance (SRC + DST) is less than min limit
                if (sweepXgmiMin > 0 && (numHopsSrc + numHopsDst < sweepXgmiMin)) { continue; }
                // Skip this DST if total XGMI distance (SRC + DST) is greater than max limit
                if (sweepXgmiMax >= 0 && (numHopsSrc + numHopsDst) > sweepXgmiMax) { continue; }
#if defined(__NVCC__)
                // Skip CPU executors on GPU memory on NVIDIA platform
                // (bug fix: the SRC operand previously indexed dstList with the SRC index j,
                // which tested the wrong device and could read out of bounds)
                if (IsCpuExeType(exeList[i].exeType) &&
                    (IsGpuMemType(srcList[j].memType) || IsGpuMemType(dstList[k].memType))) {
                    continue;
                }
#endif
                tinfo.dstMem = dstList[k];
                // Skip if there is no src and dst
                if (tinfo.srcMem.memType == MEM_NULL && tinfo.dstMem.memType == MEM_NULL) {
                    continue;
                }
                possibleTransfers.push_back(tinfo);
            }
        }
    }
    int const numPossible    = (int)possibleTransfers.size();
    int maxParallelTransfers = (sweepMax == 0 ? numPossible : sweepMax);
    if (sweepMin > numPossible) {
        printf("No valid test configurations exist\n");
        return;
    }

    if (ev.outputToCsv) {
        printf(
            "\nTest#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),"
            "ExeToSrcLinkType,ExeToDstLinkType,SrcAddr,DstAddr\n");
    }

    int numTestsRun = 0;
    int M           = sweepMin;  // Current number of simultaneous Transfers per test
    std::uniform_int_distribution<int> randSize(1, numBytesPerTransfer / sizeof(float));
    std::uniform_int_distribution<int> distribution(sweepMin, maxParallelTransfers);

    // Log sweep to configuration file
    char absPath[1024];
    auto const res = realpath(sweepFile.c_str(), absPath);
    FILE* fp       = fopen(sweepFile.c_str(), "w");
    if (!fp) {
        printf("[WARN] Unable to open %s. Skipping output of sweep configuration file\n",
               res ? absPath : sweepFile.c_str());
    } else {
        printf("Sweep configuration saved to: %s\n", res ? absPath : sweepFile.c_str());
    }

    // Create bitmask of numPossible triplets, of which M will be chosen
    std::string bitmask(M, 1);
    bitmask.resize(numPossible, 0);

    auto cpuStart = std::chrono::high_resolution_clock::now();
    while (1) {
        if (isRandom) {
            // Pick random number of simultaneous transfers to execute
            // NOTE: This currently skews distribution due to some #s having more possibilities
            // than others
            M = distribution(generator);
            // Generate a random bitmask
            for (int i = 0; i < numPossible; i++) { bitmask[i] = (i < M) ? 1 : 0; }
            std::shuffle(bitmask.begin(), bitmask.end(), generator);
        }

        // Convert bitmask to list of Transfers
        std::vector<Transfer> transfers;
        for (int value = 0; value < numPossible; ++value) {
            if (bitmask[value]) {
                // Convert integer value to (SRC->EXE->DST) triplet
                Transfer transfer;
                if (possibleTransfers[value].srcMem.memType != MEM_NULL) {
                    transfer.srcs.push_back(possibleTransfers[value].srcMem);
                }
                transfer.exeDevice = possibleTransfers[value].exeDevice;
                if (possibleTransfers[value].dstMem.memType != MEM_NULL) {
                    transfer.dsts.push_back(possibleTransfers[value].dstMem);
                }
                transfer.exeSubIndex = -1;
                transfer.numSubExecs = IsGpuExeType(transfer.exeDevice.exeType) ? numGpuSubExecs
                                                                                : numCpuSubExecs;
                transfer.numBytes = sweepRandBytes ? randSize(generator) * sizeof(float)
                                                   : numBytesPerTransfer;
                transfers.push_back(transfer);
            }
        }

        LogTransfers(fp, ++numTestsRun, transfers);

        if (!TransferBench::RunTransfers(cfg, transfers, results)) {
            PrintErrors(results.errResults);
            if (!continueOnErr) { exit(1); }
        } else {
            PrintResults(ev, numTestsRun, transfers, results);
        }

        // Check for test limit
        if (numTestsRun == sweepTestLimit) {
            printf("Sweep Test limit reached\n");
            break;
        }

        // Check for time limit
        auto cpuDelta       = std::chrono::high_resolution_clock::now() - cpuStart;
        double totalCpuTime = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta)
                                  .count();
        if (sweepTimeLimit && totalCpuTime > sweepTimeLimit) {
            printf("Sweep Time limit exceeded\n");
            break;
        }

        // Increment bitmask if not random sweep
        if (!isRandom && !std::prev_permutation(bitmask.begin(), bitmask.end())) {
            M++;
            // Check for completion
            if (M > maxParallelTransfers) {
                printf("Sweep complete\n");
                break;
            }
            for (int i = 0; i < numPossible; i++) { bitmask[i] = (i < M) ? 1 : 0; }
        }
    }
    if (fp) { fclose(fp); }
}
#endif // SWEEP_PRESET_HPP
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment