Unverified Commit 513ce1e3 authored by Swati Rawat, committed by GitHub

Merge branch 'ROCm:develop' into swraw/docs

parents ebad0b36 fb713d03
......@@ -6,3 +6,4 @@ _static/
_templates/
_toc.yml
docBin/
TransferBench
......@@ -3,6 +3,59 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.58.00
### Fixed
- Fixed broken copies when targeting a specific DMA engine
## v1.57.01
### Added
- Re-added "scaling" GPU GFX preset benchmark, which tests copies from GPU to other devices using varying
number of CUs.
## v1.57.00
### Modified
- Removed use of the defaulted spaceship operator / C++20 requirement to enable compilation on more OSes
- Changed how the version is reported. The client version is now just the last two digits; it increments only when
no changes are made to the backend header-only library file, and resets to 0 when the header is updated
- GFX_SINGLE_TEAM=0 is set by default
## v1.56
### Fixed
- Fixed a bug when using interactive mode. Interactive mode now starts prior to all warmup iterations
## v1.55
### Fixed
- Fixed missing header error when compiling on CentOS
- Fixed issues when using multi-stream mode for GFX executor
## v1.54
### Modified
- Refactored TransferBench into a header-only library combined with a thin client to facilitate the
use of TransferBench as the backend for other applications
- Optimized how data validation is handled - this should speed up Tests with many parallel transfers as data is only
generated once
- Preset benchmarks no longer take any extra command-line arguments. Preset settings are accessed only via
environment variables. Details for each preset are printed
- The a2a preset benchmark now defaults to using fine-grained memory and GFX unroll of 2
- Refactored how Transfers are launched in parallel, which reduces some CPU-side overhead
- CPU and DMA executor timing now uses CPU wall-clock timing instead of the slowest Transfer time
### Added
- New one2all preset, which sweeps over all subsets of parallel transfers from one GPU to the others
- Added new warnings for DMA execution relating to how HIP defaults to using agents from the source memory
### Removed
- CU scaling preset has been removed. Similar functionality already exists in the schmoo preset benchmark
- Preparation of source data via GFX kernel has been removed (USE_PREP_KERNEL)
- Removed GFX block-reordering (BLOCK_ORDER)
- Removed NUM_CPU_DEVICES and NUM_GPU_DEVICES from the common env vars and moved them into only the presets they apply to
- Removed SHARED_MEM_BYTES option for GFX executor
- Removed USE_PCIE_INDEX and SHARED_MEM_BYTES
### Fixed
- Fixed a potential timing-reporting issue when DMA-executed Transfers end up getting serialized.
## v1.53
### Added
- Added ability to specify NULL for sweep preset as source or destination memory type
## v1.52
### Added
- Added USE_HSA_DMA env var to switch to using hsa_amd_memory_async_copy instead of hipMemcpyAsync for DMA execution
......
# Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
if (DEFINED ENV{ROCM_PATH})
set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE STRING "ROCm install directory")
else()
......@@ -6,13 +7,59 @@ else()
endif()
cmake_minimum_required(VERSION 3.5)
project(TransferBench VERSION 1.51.0 LANGUAGES CXX)
project(TransferBench VERSION 1.58.00 LANGUAGES CXX)
# Default GPU architectures to build
#==================================================================================================
set(DEFAULT_GPUS
gfx906
gfx908
gfx90a
gfx942
gfx1030
gfx1100
gfx1101
gfx1102
gfx1200
gfx1201)
# Build only for local GPU architecture
if (BUILD_LOCAL_GPU_TARGET_ONLY)
message(STATUS "Building only for local GPU target")
if (COMMAND rocm_local_targets)
rocm_local_targets(DEFAULT_GPUS)
else()
message(WARNING "Unable to determine local GPU targets. Falling back to default GPUs.")
endif()
endif()
# Determine which GPU architectures to build for
set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.")
# Check if clang compiler can offload to GPU_TARGETS
if (COMMAND rocm_check_target_ids)
message(STATUS "Checking for ROCm support for GPU targets: " "${GPU_TARGETS}")
rocm_check_target_ids(SUPPORTED_GPUS TARGETS ${GPU_TARGETS})
else()
message(WARNING "Unable to check for supported GPU targets. Falling back to default GPUs.")
set(SUPPORTED_GPUS ${DEFAULT_GPUS})
endif()
set(COMPILING_TARGETS "${SUPPORTED_GPUS}" CACHE STRING "GPU targets to compile for.")
message(STATUS "Compiling for ${COMPILING_TARGETS}")
foreach(target ${COMPILING_TARGETS})
list(APPEND static_link_flags --offload-arch=${target})
endforeach()
list(JOIN static_link_flags " " flags_str)
set( CMAKE_CXX_FLAGS "${flags_str} ${CMAKE_CXX_FLAGS}")
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib")
include_directories(${ROCM_PATH}/include)
link_libraries(numa hsa-runtime64 pthread)
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ..)
add_executable(TransferBench src/TransferBench.cpp)
target_include_directories(TransferBench PRIVATE src/include)
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY .)
add_executable(TransferBench src/client/Client.cpp)
target_include_directories(TransferBench PRIVATE src/header src/client src/client/Presets)
find_package(ROCM 0.8 REQUIRED PATHS ${ROCM_PATH})
include(ROCMInstallTargets)
......
#
# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
#
all:
cd src ; make
# Configuration options
ROCM_PATH ?= /opt/rocm
CUDA_PATH ?= /usr/local/cuda
HIPCC=$(ROCM_PATH)/bin/hipcc
NVCC=$(CUDA_PATH)/bin/nvcc
# Compile TransferBenchCuda if nvcc detected
ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
EXE=TransferBenchCuda
else
EXE=TransferBench
endif
CXXFLAGS = -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
NVFLAGS = -x cu -lnuma -arch=native
COMMON_FLAGS = -O3 -I./src/header -I./src/client -I./src/client/Presets
LDFLAGS += -lpthread
all: $(EXE)
TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
$(HIPCC) $(CXXFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)
TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
$(NVCC) $(NVFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)
clean:
cd src ; make clean
rm -f *.o ./TransferBench ./TransferBenchCuda
......@@ -8,8 +8,8 @@ import re
from rocm_docs import ROCmDocs
with open('../src/include/EnvVars.hpp', encoding='utf-8') as f:
match = re.search(r'#define TB_VERSION "([0-9.]+)[^0-9.]+', f.read())
with open('../src/header/TransferBench.hpp', encoding='utf-8') as f:
match = re.search(r'constexpr char VERSION\[\] = "([0-9.]+)[^0-9.]+', f.read())
if not match:
raise ValueError("VERSION not found!")
version_number = match[1]
......@@ -18,7 +18,7 @@ left_nav_title = f"TransferBench {version_number} Documentation"
# for PDF output on Read the Docs
project = "TransferBench Documentation"
author = "Advanced Micro Devices, Inc."
copyright = "Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved."
copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved."
version = version_number
release = version_number
......
......@@ -47,7 +47,7 @@ To build documentation locally, use:
.. code-block:: bash
cd docs
pip3 install -r .sphinx/requirements.txt
pip3 install -r ./sphinx/requirements.txt
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
NVIDIA platform support
......
rocm-docs-core==1.8.3
rocm-docs-core==1.9.2
......@@ -92,7 +92,7 @@ requests==2.32.2
# via
# pygithub
# sphinx
rocm-docs-core==1.8.3
rocm-docs-core==1.9.2
# via -r requirements.in
smmap==5.0.1
# via gitdb
......
# Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
ROCM_PATH ?= /opt/rocm
CUDA_PATH ?= /usr/local/cuda
HIPCC=$(ROCM_PATH)/bin/hipcc
NVCC=$(CUDA_PATH)/bin/nvcc
# Compile TransferBenchCuda if nvcc detected
ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
EXE=../TransferBenchCuda
else
EXE=../TransferBench
endif
CXXFLAGS = -O3 -Iinclude -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
NVFLAGS = -O3 -Iinclude -x cu -lnuma -arch=native
LDFLAGS += -lpthread
all: $(EXE)
../TransferBench: TransferBench.cpp $(shell find -regex ".*\.\hpp")
$(HIPCC) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
../TransferBenchCuda: TransferBench.cpp $(shell find -regex ".*\.\hpp")
$(NVCC) $(NVFLAGS) $< -o $@ $(LDFLAGS)
clean:
rm -f *.o ../TransferBench ../TransferBenchCuda
/*
Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// This program measures simultaneous copy performance across multiple GPUs
// on the same node
#include <numa.h> // If not found, try installing libnuma-dev (e.g. apt-get install libnuma-dev)
#include <cmath> // If not found, try installing g++-12 (e.g. apt-get install g++-12)
#include <numaif.h>
#include <random>
#include <stack>
#include <thread>
#include "TransferBench.hpp"
#include "GetClosestNumaNode.hpp"
int main(int argc, char **argv)
{
// Check for NUMA library support
if (numa_available() == -1)
{
printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
exit(1);
}
// Display usage instructions and detected topology
if (argc <= 1)
{
int const outputToCsv = EnvVars::GetEnvVar("OUTPUT_TO_CSV", 0);
if (!outputToCsv) DisplayUsage(argv[0]);
DisplayTopology(outputToCsv);
exit(0);
}
// Collect environment variables / display current run configuration
EnvVars ev;
// Determine number of bytes to run per Transfer
size_t numBytesPerTransfer = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_TRANSFER;
if (argc > 2)
{
// Adjust bytes if unit specified
char units = argv[2][strlen(argv[2])-1];
switch (units)
{
case 'K': case 'k': numBytesPerTransfer *= 1024; break;
case 'M': case 'm': numBytesPerTransfer *= 1024*1024; break;
case 'G': case 'g': numBytesPerTransfer *= 1024*1024*1024; break;
}
}
if (numBytesPerTransfer % 4)
{
printf("[ERROR] numBytesPerTransfer (%lu) must be a multiple of 4\n", numBytesPerTransfer);
exit(1);
}
// Check for preset tests
// - Tests that sweep across possible sets of Transfers
if (!strcmp(argv[1], "sweep") || !strcmp(argv[1], "rsweep"))
{
int numGpuSubExecs = (argc > 3 ? atoi(argv[3]) : 4);
int numCpuSubExecs = (argc > 4 ? atoi(argv[4]) : 4);
ev.configMode = CFG_SWEEP;
RunSweepPreset(ev, numBytesPerTransfer, numGpuSubExecs, numCpuSubExecs, !strcmp(argv[1], "rsweep"));
exit(0);
}
// - Tests that benchmark peer-to-peer performance
else if (!strcmp(argv[1], "p2p"))
{
ev.configMode = CFG_P2P;
RunPeerToPeerBenchmarks(ev, numBytesPerTransfer / sizeof(float));
exit(0);
}
// - Test SubExecutor scaling
else if (!strcmp(argv[1], "scaling"))
{
int maxSubExecs = (argc > 3 ? atoi(argv[3]) : 32);
int exeIndex = (argc > 4 ? atoi(argv[4]) : 0);
if (exeIndex >= ev.numGpuDevices)
{
printf("[ERROR] Cannot execute scaling test with GPU device %d\n", exeIndex);
exit(1);
}
ev.configMode = CFG_SCALE;
RunScalingBenchmark(ev, numBytesPerTransfer / sizeof(float), exeIndex, maxSubExecs);
exit(0);
}
// - Test all2all benchmark
else if (!strcmp(argv[1], "a2a"))
{
int numSubExecs = (argc > 3 ? atoi(argv[3]) : 4);
// Force single-stream mode for all-to-all benchmark
ev.useSingleStream = 1;
ev.configMode = CFG_A2A;
RunAllToAllBenchmark(ev, numBytesPerTransfer, numSubExecs);
exit(0);
}
// Health check
else if (!strcmp(argv[1], "healthcheck")) {
RunHealthCheck(ev);
exit(0);
}
// - Test schmoo benchmark
else if (!strcmp(argv[1], "schmoo"))
{
if (ev.numGpuDevices < 2)
{
printf("[ERROR] Schmoo benchmark requires at least 2 GPUs\n");
exit(1);
}
ev.configMode = CFG_SCHMOO;
int localIdx = (argc > 3 ? atoi(argv[3]) : 0);
int remoteIdx = (argc > 4 ? atoi(argv[4]) : 1);
int maxSubExecs = (argc > 5 ? atoi(argv[5]) : 32);
if (localIdx >= ev.numGpuDevices || remoteIdx >= ev.numGpuDevices)
{
printf("[ERROR] Cannot execute schmoo test with local GPU device %d, remote GPU device %d\n", localIdx, remoteIdx);
exit(1);
}
ev.DisplaySchmooEnvVars();
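// When no byte count was given, sweep sizes from 256 floats up to 2^27 floats,
// sampling roughly ev.samplingFactor points per power of two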
for (int N = 256; N <= (1<<27); N *= 2)
{
int delta = std::max(1, N / ev.samplingFactor);
int curr = (numBytesPerTransfer == 0) ? N : numBytesPerTransfer / sizeof(float);
do
{
RunSchmooBenchmark(ev, curr * sizeof(float), localIdx, remoteIdx, maxSubExecs);
if (numBytesPerTransfer != 0) exit(0);
curr += delta;
} while (curr < N * 2);
}
}
else if (!strcmp(argv[1], "rwrite"))
{
if (ev.numGpuDevices < 2)
{
printf("[ERROR] Remote write benchmark requires at least 2 GPUs\n");
exit(1);
}
ev.DisplayRemoteWriteEnvVars();
int numSubExecs = (argc > 3 ? atoi(argv[3]) : 4);
int srcIdx = (argc > 4 ? atoi(argv[4]) : 0);
int minGpus = (argc > 5 ? atoi(argv[5]) : 1);
int maxGpus = (argc > 6 ? atoi(argv[6]) : ev.numGpuDevices - 1);
for (int N = 256; N <= (1<<27); N *= 2)
{
int delta = std::max(1, N / ev.samplingFactor);
int curr = (numBytesPerTransfer == 0) ? N : numBytesPerTransfer / sizeof(float);
do
{
RunRemoteWriteBenchmark(ev, curr * sizeof(float), numSubExecs, srcIdx, minGpus, maxGpus);
if (numBytesPerTransfer != 0) exit(0);
curr += delta;
} while (curr < N * 2);
}
}
else if (!strcmp(argv[1], "pcopy"))
{
if (ev.numGpuDevices < 2)
{
printf("[ERROR] Parallel copy benchmark requires at least 2 GPUs\n");
exit(1);
}
ev.DisplayParallelCopyEnvVars();
int numSubExecs = (argc > 3 ? atoi(argv[3]) : 8);
int srcIdx = (argc > 4 ? atoi(argv[4]) : 0);
int minGpus = (argc > 5 ? atoi(argv[5]) : 1);
int maxGpus = (argc > 6 ? atoi(argv[6]) : ev.numGpuDevices - 1);
if (maxGpus > ev.gpuMaxHwQueues && ev.useDmaCopy)
{
printf("[ERROR] DMA executor %d attempting %d parallel transfers, however GPU_MAX_HW_QUEUES only set to %d\n",
srcIdx, maxGpus, ev.gpuMaxHwQueues);
printf("[ERROR] Aborting to avoid misleading results due to potential serialization of Transfers\n");
exit(1);
}
for (int N = 256; N <= (1<<27); N *= 2)
{
int delta = std::max(1, N / ev.samplingFactor);
int curr = (numBytesPerTransfer == 0) ? N : numBytesPerTransfer / sizeof(float);
do
{
RunParallelCopyBenchmark(ev, curr * sizeof(float), numSubExecs, srcIdx, minGpus, maxGpus);
if (numBytesPerTransfer != 0) exit(0);
curr += delta;
} while (curr < N * 2);
}
}
else if (!strcmp(argv[1], "cmdline"))
{
// Print environment variables
ev.DisplayEnvVars();
// Read Transfer from command line
std::string cmdlineTransfer;
for (int i = 3; i < argc; i++)
cmdlineTransfer += std::string(argv[i]) + " ";
char line[MAX_LINE_LEN];
sprintf(line, "%s", cmdlineTransfer.c_str());
std::vector<Transfer> transfers;
ParseTransfers(ev, line, transfers);
if (transfers.empty()) exit(0);
// If the number of bytes is specified, use it
if (numBytesPerTransfer != 0)
{
size_t N = numBytesPerTransfer / sizeof(float);
ExecuteTransfers(ev, 1, N, transfers);
}
else
{
// Otherwise generate a range of values
for (int N = 256; N <= (1<<27); N *= 2)
{
int delta = std::max(1, N / ev.samplingFactor);
int curr = N;
while (curr < N * 2)
{
ExecuteTransfers(ev, 1, curr, transfers);
curr += delta;
}
}
}
exit(0);
}
// Check that Transfer configuration file can be opened
ev.configMode = CFG_FILE;
FILE* fp = fopen(argv[1], "r");
if (!fp)
{
printf("[ERROR] Unable to open transfer configuration file: [%s]\n", argv[1]);
exit(1);
}
// Print environment variables and CSV header
ev.DisplayEnvVars();
int testNum = 0;
char line[MAX_LINE_LEN];
while(fgets(line, MAX_LINE_LEN, fp))
{
// Check if line is a comment to be echoed to output (starts with ##)
if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') printf("%s", line);
// Parse set of parallel Transfers to execute
std::vector<Transfer> transfers;
ParseTransfers(ev, line, transfers);
if (transfers.empty()) continue;
// If the number of bytes is specified, use it
if (numBytesPerTransfer != 0)
{
size_t N = numBytesPerTransfer / sizeof(float);
ExecuteTransfers(ev, ++testNum, N, transfers);
}
else
{
// Otherwise generate a range of values
for (int N = 256; N <= (1<<27); N *= 2)
{
int delta = std::max(1, N / ev.samplingFactor);
int curr = N;
while (curr < N * 2)
{
ExecuteTransfers(ev, ++testNum, curr, transfers);
curr += delta;
}
}
}
}
fclose(fp);
return 0;
}
void ExecuteTransfers(EnvVars const& ev,
int const testNum,
size_t const N,
std::vector<Transfer>& transfers,
bool verbose,
double* totalBandwidthCpu)
{
// Check for any Transfers using variable number of sub-executors
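// (A GFX Transfer that requests 0 sub-executors is treated as "variable": it is re-run below over a
//  range of sub-executor counts and the best-performing count is kept)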
std::vector<int> varTransfers;
std::vector<int> numUsedSubExec(ev.numGpuDevices, 0);
std::vector<int> numVarSubExec(ev.numGpuDevices, 0);
for (int i = 0; i < transfers.size(); i++) {
Transfer& t = transfers[i];
t.transferIndex = i;
t.numBytesActual = (t.numBytes ? t.numBytes : N * sizeof(float));
if (t.exeType == EXE_GPU_GFX) {
if (t.numSubExecs == 0) {
varTransfers.push_back(i);
numVarSubExec[t.exeIndex]++;
} else {
numUsedSubExec[t.exeIndex] += t.numSubExecs;
}
} else if (t.numSubExecs == 0) {
printf("[ERROR] Variable subexecutor count is only supported for GFX executor\n");
exit(1);
}
}
if (verbose && !ev.outputToCsv) printf("Test %d:\n", testNum);
TestResults testResults;
if (varTransfers.size() == 0) {
testResults = ExecuteTransfersImpl(ev, transfers);
} else {
// Determine maximum number of subexecutors
int maxNumSubExec = 0;
if (ev.maxNumVarSubExec) {
maxNumSubExec = ev.maxNumVarSubExec;
} else {
HIP_CALL(hipDeviceGetAttribute(&maxNumSubExec, hipDeviceAttributeMultiprocessorCount, 0));
for (int device = 0; device < ev.numGpuDevices; device++) {
int numSubExec = 0;
HIP_CALL(hipDeviceGetAttribute(&numSubExec, hipDeviceAttributeMultiprocessorCount, device));
int leftOverSubExec = numSubExec - numUsedSubExec[device];
if (leftOverSubExec < numVarSubExec[device])
maxNumSubExec = 1;
else if (numVarSubExec[device] != 0) {
maxNumSubExec = std::min(maxNumSubExec, leftOverSubExec / numVarSubExec[device]);
}
}
}
// Loop over subexecs
std::vector<Transfer> bestTransfers;
for (int numSubExec = ev.minNumVarSubExec; numSubExec <= maxNumSubExec; numSubExec++) {
std::vector<Transfer> currTransfers = transfers;
for (auto idx : varTransfers) {
currTransfers[idx].numSubExecs = numSubExec;
}
TestResults tempResults = ExecuteTransfersImpl(ev, currTransfers);
if (tempResults.totalBandwidthCpu > testResults.totalBandwidthCpu) {
bestTransfers = currTransfers;
testResults = tempResults;
}
}
transfers = bestTransfers;
}
if (totalBandwidthCpu) *totalBandwidthCpu = testResults.totalBandwidthCpu;
if (verbose) {
ReportResults(ev, transfers, testResults);
}
}
TestResults ExecuteTransfersImpl(EnvVars const& ev,
std::vector<Transfer>& transfers)
{
// Map transfers by executor
TransferMap transferMap;
for (int i = 0; i < transfers.size(); i++)
{
Transfer& transfer = transfers[i];
transfer.transferIndex = i;
Executor executor(transfer.exeType, transfer.exeIndex);
ExecutorInfo& executorInfo = transferMap[executor];
executorInfo.transfers.push_back(&transfer);
}
// Loop over each executor and prepare sub-executors
std::map<int, Transfer*> transferList;
for (auto& exeInfoPair : transferMap)
{
Executor const& executor = exeInfoPair.first;
ExecutorInfo& exeInfo = exeInfoPair.second;
ExeType const exeType = executor.first;
int const exeIndex = RemappedIndex(executor.second, IsCpuType(exeType));
exeInfo.totalTime = 0.0;
exeInfo.totalSubExecs = 0;
// Loop over each transfer this executor is involved in
for (Transfer* transfer : exeInfo.transfers)
{
// Allocate source memory
transfer->srcMem.resize(transfer->numSrcs);
for (int iSrc = 0; iSrc < transfer->numSrcs; ++iSrc)
{
MemType const& srcType = transfer->srcType[iSrc];
int const srcIndex = RemappedIndex(transfer->srcIndex[iSrc], IsCpuType(srcType));
// Ensure executing GPU can access source memory
if (IsGpuType(exeType) && IsGpuType(srcType) && srcIndex != exeIndex)
EnablePeerAccess(exeIndex, srcIndex);
AllocateMemory(srcType, srcIndex, transfer->numBytesActual + ev.byteOffset, (void**)&transfer->srcMem[iSrc]);
}
// Allocate destination memory
transfer->dstMem.resize(transfer->numDsts);
for (int iDst = 0; iDst < transfer->numDsts; ++iDst)
{
MemType const& dstType = transfer->dstType[iDst];
int const dstIndex = RemappedIndex(transfer->dstIndex[iDst], IsCpuType(dstType));
// Ensure executing GPU can access destination memory
if (IsGpuType(exeType) && IsGpuType(dstType) && dstIndex != exeIndex)
EnablePeerAccess(exeIndex, dstIndex);
AllocateMemory(dstType, dstIndex, transfer->numBytesActual + ev.byteOffset, (void**)&transfer->dstMem[iDst]);
}
exeInfo.totalSubExecs += transfer->numSubExecs;
transferList[transfer->transferIndex] = transfer;
}
// Prepare additional requirement for GPU-based executors
if (IsGpuType(exeType))
{
HIP_CALL(hipSetDevice(exeIndex));
// Single-stream is only supported for GFX-based executors
int const numStreamsToUse = (exeType == EXE_GPU_DMA || !ev.useSingleStream) ? exeInfo.transfers.size() : 1;
exeInfo.streams.resize(numStreamsToUse);
exeInfo.startEvents.resize(numStreamsToUse);
exeInfo.stopEvents.resize(numStreamsToUse);
for (int i = 0; i < numStreamsToUse; ++i)
{
if (ev.cuMask.size())
{
#if !defined(__NVCC__)
HIP_CALL(hipExtStreamCreateWithCUMask(&exeInfo.streams[i], ev.cuMask.size(), ev.cuMask.data()));
#else
printf("[ERROR] CU Masking in not supported on NVIDIA hardware\n");
exit(-1);
#endif
}
else
{
HIP_CALL(hipStreamCreate(&exeInfo.streams[i]));
}
HIP_CALL(hipEventCreate(&exeInfo.startEvents[i]));
HIP_CALL(hipEventCreate(&exeInfo.stopEvents[i]));
}
if (exeType == EXE_GPU_GFX)
{
// Allocate one contiguous chunk of GPU memory for threadblock parameters
// This allows support for executing one transfer per stream, or all transfers in a single stream
#if !defined(__NVCC__)
AllocateMemory(MEM_GPU, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam),
(void**)&exeInfo.subExecParamGpu);
#else
AllocateMemory(MEM_MANAGED, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam),
(void**)&exeInfo.subExecParamGpu);
#endif
// Check for sufficient subExecutors
int numDeviceCUs = 0;
HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, exeIndex));
if (exeInfo.totalSubExecs > numDeviceCUs)
{
printf("[WARN] GFX executor %d requesting %d total subexecutors, however only has %d. Some Transfers may be serialized\n",
exeIndex, exeInfo.totalSubExecs, numDeviceCUs);
}
}
// Check for targeted DMA
if (exeType == EXE_GPU_DMA)
{
bool useRandomDma = false;
bool useTargetDma = false;
// Check for sufficient hardware queues
#if !defined(__NVCC__)
if (exeInfo.transfers.size() > ev.gpuMaxHwQueues)
{
printf("[WARN] DMA executor %d attempting %lu parallel transfers, however GPU_MAX_HW_QUEUES only set to %d\n",
exeIndex, exeInfo.transfers.size(), ev.gpuMaxHwQueues);
}
#endif
for (Transfer* transfer : exeInfo.transfers)
{
if (transfer->exeSubIndex != -1 || ev.useHsaDma)
{
useTargetDma = (transfer->exeSubIndex != -1);
#if defined(__NVCC__)
printf("[ERROR] DMA executor subindex not supported on NVIDIA hardware\n");
exit(-1);
#else
if (transfer->numSrcs != 1 || transfer->numDsts != 1)
{
printf("[ERROR] DMA Transfer must have at exactly one source and one destination");
exit(1);
}
// Collect HSA agent information
hsa_amd_pointer_info_t info;
info.size = sizeof(info);
HSA_CHECK(hsa_amd_pointer_info(transfer->dstMem[0], &info, NULL, NULL, NULL));
transfer->dstAgent = info.agentOwner;
HSA_CHECK(hsa_amd_pointer_info(transfer->srcMem[0], &info, NULL, NULL, NULL));
transfer->srcAgent = info.agentOwner;
// Create HSA completion signal
HSA_CHECK(hsa_signal_create(1, 0, NULL, &transfer->signal));
// Check for valid engine Id
if (transfer->exeSubIndex < -1 || transfer->exeSubIndex >= 32)
{
printf("[ERROR] DMA executor subindex must be between 0 and 31\n");
exit(1);
}
// Check that engine Id exists between agents
if (useTargetDma) {
uint32_t engineIdMask = 0;
HSA_CHECK(hsa_amd_memory_copy_engine_status(transfer->dstAgent,
transfer->srcAgent,
&engineIdMask));
transfer->sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << transfer->exeSubIndex);
if (!(transfer->sdmaEngineId & engineIdMask))
{
printf("[ERROR] DMA executor %d.%d does not exist or cannot copy between source %s to destination %s\n",
transfer->exeIndex, transfer->exeSubIndex,
transfer->SrcToStr().c_str(),
transfer->DstToStr().c_str());
exit(1);
}
}
#endif
}
else
{
useRandomDma = true;
}
}
if (useRandomDma && useTargetDma)
{
printf("[WARN] Mixing targeted and untargetted DMA execution on GPU %d may result in resource conflicts\n",
exeIndex);
}
}
}
}
// Prepare input memory and block parameters for current N
bool isSrcCorrect = true;
for (auto& exeInfoPair : transferMap)
{
Executor const& executor = exeInfoPair.first;
ExecutorInfo& exeInfo = exeInfoPair.second;
ExeType const exeType = executor.first;
int const exeIndex = RemappedIndex(executor.second, IsCpuType(exeType));
exeInfo.totalBytes = 0;
for (int i = 0; i < exeInfo.transfers.size(); ++i)
{
// Prepare subarrays each threadblock works on and fill src memory with patterned data
Transfer* transfer = exeInfo.transfers[i];
transfer->PrepareSubExecParams(ev);
isSrcCorrect &= transfer->PrepareSrc(ev);
exeInfo.totalBytes += transfer->numBytesActual;
}
// Copy block parameters to GPU for GPU executors
if (exeType == EXE_GPU_GFX)
{
std::vector<SubExecParam> tempSubExecParam;
if (!ev.useSingleStream || (ev.blockOrder == ORDER_SEQUENTIAL))
{
// Assign Transfers to sequential threadblocks
int transferOffset = 0;
for (Transfer* transfer : exeInfo.transfers)
{
transfer->subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset;
transfer->subExecIdx.clear();
for (int subExecIdx = 0; subExecIdx < transfer->subExecParam.size(); subExecIdx++)
{
transfer->subExecIdx.push_back(transferOffset + subExecIdx);
tempSubExecParam.push_back(transfer->subExecParam[subExecIdx]);
}
transferOffset += transfer->numSubExecs;
}
}
else if (ev.blockOrder == ORDER_INTERLEAVED)
{
// Interleave threadblocks of different Transfers
exeInfo.transfers[0]->subExecParamGpuPtr = exeInfo.subExecParamGpu;
for (int subExecIdx = 0; tempSubExecParam.size() < exeInfo.totalSubExecs; ++subExecIdx)
{
for (Transfer* transfer : exeInfo.transfers)
{
if (subExecIdx < transfer->numSubExecs)
{
transfer->subExecIdx.push_back(tempSubExecParam.size());
tempSubExecParam.push_back(transfer->subExecParam[subExecIdx]);
}
}
}
}
else if (ev.blockOrder == ORDER_RANDOM)
{
std::vector<std::pair<int,int>> indices;
exeInfo.transfers[0]->subExecParamGpuPtr = exeInfo.subExecParamGpu;
// Build up a list of (transfer,subExecParam) indices, then randomly sort them
for (int i = 0; i < exeInfo.transfers.size(); i++)
{
Transfer* transfer = exeInfo.transfers[i];
for (int subExecIdx = 0; subExecIdx < transfer->numSubExecs; subExecIdx++)
indices.push_back(std::make_pair(i, subExecIdx));
}
std::shuffle(indices.begin(), indices.end(), *ev.generator);
// Build randomized threadblock list
for (auto p : indices)
{
Transfer* transfer = exeInfo.transfers[p.first];
transfer->subExecIdx.push_back(tempSubExecParam.size());
tempSubExecParam.push_back(transfer->subExecParam[p.second]);
}
}
HIP_CALL(hipSetDevice(exeIndex));
HIP_CALL(hipMemcpy(exeInfo.subExecParamGpu,
tempSubExecParam.data(),
tempSubExecParam.size() * sizeof(SubExecParam),
hipMemcpyDefault));
HIP_CALL(hipDeviceSynchronize());
}
}
// Launch kernels (warmup iterations are not counted)
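// A positive ev.numIterations runs that many timed iterations; a negative value keeps iterating until
// -ev.numIterations seconds of CPU time have accumulated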
double totalCpuTime = 0;
size_t numTimedIterations = 0;
std::stack<std::thread> threads;
for (int iteration = -ev.numWarmups; isSrcCorrect; iteration++)
{
if (ev.numIterations > 0 && iteration >= ev.numIterations) break;
if (ev.numIterations < 0 && totalCpuTime > -ev.numIterations) break;
// Pause before starting first timed iteration in interactive mode
if (ev.useInteractive && iteration == 0)
{
printf("Memory prepared:\n");
for (Transfer& transfer : transfers)
{
printf("Transfer %03d:\n", transfer.transferIndex);
for (int iSrc = 0; iSrc < transfer.numSrcs; ++iSrc)
printf(" SRC %0d: %p\n", iSrc, transfer.srcMem[iSrc]);
for (int iDst = 0; iDst < transfer.numDsts; ++iDst)
printf(" DST %0d: %p\n", iDst, transfer.dstMem[iDst]);
}
printf("Hit <Enter> to continue: ");
if (scanf("%*c") != 0)
{
printf("[ERROR] Unexpected input\n");
exit(1);
}
printf("\n");
}
// Start CPU timing for this iteration
auto cpuStart = std::chrono::high_resolution_clock::now();
// Execute all Transfers in parallel
for (auto& exeInfoPair : transferMap)
{
ExecutorInfo& exeInfo = exeInfoPair.second;
ExeType exeType = exeInfoPair.first.first;
int const numTransfersToRun = (exeType == EXE_GPU_GFX && ev.useSingleStream) ? 1 : exeInfo.transfers.size();
for (int i = 0; i < numTransfersToRun; ++i)
threads.push(std::thread(RunTransfer, std::ref(ev), iteration, std::ref(exeInfo), i));
}
// Wait for all threads to finish
int const numTransfers = threads.size();
for (int i = 0; i < numTransfers; i++)
{
threads.top().join();
threads.pop();
}
// Stop CPU timing for this iteration
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
if (ev.alwaysValidate)
{
for (auto transferPair : transferList)
{
Transfer* transfer = transferPair.second;
transfer->ValidateDst(ev);
}
}
if (iteration >= 0)
{
++numTimedIterations;
totalCpuTime += deltaSec;
}
}
// Pause for interactive mode
if (isSrcCorrect && ev.useInteractive)
{
printf("Transfers complete. Hit <Enter> to continue: ");
if (scanf("%*c") != 0)
{
printf("[ERROR] Unexpected input\n");
exit(1);
}
printf("\n");
}
// Validate that each transfer has transferred correctly
size_t totalBytesTransferred = 0;
int const numTransfers = transferList.size();
for (auto transferPair : transferList)
{
Transfer* transfer = transferPair.second;
transfer->ValidateDst(ev);
totalBytesTransferred += transfer->numBytesActual;
}
// Record results
TestResults testResults;
testResults.numTimedIterations = numTimedIterations;
testResults.totalBytesTransferred = totalBytesTransferred;
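// Average CPU wall-clock time per (sub-)iteration in msec; CPU bandwidth in GB/s = (bytes / 1.0E6) / msec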
testResults.totalDurationMsec = totalCpuTime / (1.0 * numTimedIterations * ev.numSubIterations) * 1000;
testResults.totalBandwidthCpu = (totalBytesTransferred / 1.0E6) / testResults.totalDurationMsec;
double maxExeDurationMsec = 0.0;
if (!isSrcCorrect) goto cleanup;
for (auto& exeInfoPair : transferMap)
{
ExecutorInfo exeInfo = exeInfoPair.second;
ExeType const exeType = exeInfoPair.first.first;
int const exeIndex = exeInfoPair.first.second;
ExeResult& exeResult = testResults.exeResults[std::make_pair(exeType, exeIndex)];
// Compute total time for non GPU executors
if (exeType != EXE_GPU_GFX || ev.useSingleStream == 0)
{
exeInfo.totalTime = 0;
for (auto const& transfer : exeInfo.transfers)
exeInfo.totalTime = std::max(exeInfo.totalTime, transfer->transferTime);
}
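// Executor bandwidth in GB/s = (bytes / 1.0E9) / (durationMsec / 1000)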
exeResult.totalBytes = exeInfo.totalBytes;
exeResult.durationMsec = exeInfo.totalTime / (1.0 * numTimedIterations * ev.numSubIterations);
exeResult.bandwidthGbs = (exeInfo.totalBytes / 1.0E9) / exeResult.durationMsec * 1000.0f;
exeResult.sumBandwidthGbs = 0;
maxExeDurationMsec = std::max(maxExeDurationMsec, exeResult.durationMsec);
for (auto& transfer: exeInfo.transfers)
{
exeResult.transferIdx.push_back(transfer->transferIndex);
transfer->transferTime /= (1.0 * numTimedIterations * ev.numSubIterations);
transfer->transferBandwidth = (transfer->numBytesActual / 1.0E9) / transfer->transferTime * 1000.0f;
transfer->executorBandwidth = exeResult.bandwidthGbs;
exeResult.sumBandwidthGbs += transfer->transferBandwidth;
}
}
testResults.overheadMsec = testResults.totalDurationMsec - maxExeDurationMsec;
// Release GPU memory
cleanup:
for (auto exeInfoPair : transferMap)
{
ExecutorInfo& exeInfo = exeInfoPair.second;
ExeType const exeType = exeInfoPair.first.first;
int const exeIndex = RemappedIndex(exeInfoPair.first.second, IsCpuType(exeType));
for (auto& transfer : exeInfo.transfers)
{
for (int iSrc = 0; iSrc < transfer->numSrcs; ++iSrc)
{
MemType const& srcType = transfer->srcType[iSrc];
DeallocateMemory(srcType, transfer->srcMem[iSrc], transfer->numBytesActual + ev.byteOffset);
}
for (int iDst = 0; iDst < transfer->numDsts; ++iDst)
{
MemType const& dstType = transfer->dstType[iDst];
DeallocateMemory(dstType, transfer->dstMem[iDst], transfer->numBytesActual + ev.byteOffset);
}
transfer->subExecParam.clear();
if (exeType == EXE_GPU_DMA && (transfer->exeSubIndex != -1 || ev.useHsaDma))
{
#if !defined(__NVCC__)
HSA_CHECK(hsa_signal_destroy(transfer->signal));
#endif
}
}
if (IsGpuType(exeType))
{
int const numStreams = (int)exeInfo.streams.size();
for (int i = 0; i < numStreams; ++i)
{
HIP_CALL(hipEventDestroy(exeInfo.startEvents[i]));
HIP_CALL(hipEventDestroy(exeInfo.stopEvents[i]));
HIP_CALL(hipStreamDestroy(exeInfo.streams[i]));
}
if (exeType == EXE_GPU_GFX)
{
#if !defined(__NVCC__)
DeallocateMemory(MEM_GPU, exeInfo.subExecParamGpu);
#else
DeallocateMemory(MEM_MANAGED, exeInfo.subExecParamGpu);
#endif
}
}
}
return testResults;
}
void DisplayUsage(char const* cmdName)
{
printf("TransferBench v%s\n", TB_VERSION);
printf("========================================\n");
if (numa_available() == -1)
{
printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
exit(1);
}
int numGpuDevices;
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
int const numCpuDevices = numa_num_configured_nodes();
printf("Usage: %s config <N>\n", cmdName);
printf(" config: Either:\n");
printf(" - Filename of configFile containing Transfers to execute (see example.cfg for format)\n");
printf(" - Name of preset config:\n");
printf(" a2a - GPU All-To-All benchmark\n");
printf(" - 3rd optional arg: # of SubExecs to use\n");
printf(" cmdline - Read Transfers from command line arguments (after N)\n");
printf(" healthcheck - Simple bandwidth health check (MI300 series only)\n");
printf(" p2p - Peer-to-peer benchmark tests\n");
printf(" rwrite/pcopy - Parallel writes/copies from single GPU to other GPUs\n");
printf(" - 3rd optional arg: # GPU SubExecs per Transfer\n");
printf(" - 4th optional arg: Root GPU index\n");
printf(" - 5th optional arg: Min number of other GPUs to transfer to\n");
printf(" - 6th optional arg: Max number of other GPUs to transfer to\n");
printf(" sweep/rsweep - Sweep/random sweep across possible sets of Transfers\n");
printf(" - 3rd optional arg: # GPU SubExecs per Transfer\n");
printf(" - 4th optional arg: # CPU SubExecs per Transfer\n");
printf(" scaling - GPU GFX SubExec scaling copy test\n");
printf(" - 3th optional arg: Max # of SubExecs to use\n");
printf(" - 4rd optional arg: GPU index to use as executor\n");
printf(" schmoo - Local/RemoteRead/Write/Copy between two GPUs\n");
printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
DEFAULT_BYTES_PER_TRANSFER);
printf(" If 0 is specified, a range of Ns will be benchmarked\n");
printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n");
printf("\n");
EnvVars::DisplayUsage();
}
int RemappedIndex(int const origIdx, bool const isCpuType)
{
static std::vector<int> remappingCpu;
static std::vector<int> remappingGpu;
// Build CPU remapping on first use
// Skip numa nodes that are not configured
if (remappingCpu.empty())
{
for (int node = 0; node <= numa_max_node(); node++)
if (numa_bitmask_isbitset(numa_get_mems_allowed(), node))
remappingCpu.push_back(node);
}
// Build remappingGpu on first use
if (remappingGpu.empty())
{
int numGpuDevices;
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
remappingGpu.resize(numGpuDevices);
int const usePcieIndexing = getenv("USE_PCIE_INDEX") ? atoi(getenv("USE_PCIE_INDEX")) : 0;
if (!usePcieIndexing)
{
// For HIP-based indexing no remappingGpu is necessary
for (int i = 0; i < numGpuDevices; ++i)
remappingGpu[i] = i;
}
else
{
// Collect PCIe address for each GPU
std::vector<std::pair<std::string, int>> mapping;
char pciBusId[20];
for (int i = 0; i < numGpuDevices; ++i)
{
HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i));
mapping.push_back(std::make_pair(pciBusId, i));
}
// Sort GPUs by PCIe address then use that as mapping
std::sort(mapping.begin(), mapping.end());
for (int i = 0; i < numGpuDevices; ++i)
remappingGpu[i] = mapping[i].second;
}
}
return isCpuType ? remappingCpu[origIdx] : remappingGpu[origIdx];
}
void DisplayTopology(bool const outputToCsv)
{
int numCpuDevices = numa_num_configured_nodes();
int numGpuDevices;
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
if (outputToCsv)
{
printf("NumCpus,%d\n", numCpuDevices);
printf("NumGpus,%d\n", numGpuDevices);
}
else
{
printf("\nDetected topology: %d configured CPU NUMA node(s) [%d total] %d GPU device(s)\n",
numa_num_configured_nodes(), numa_max_node() + 1, numGpuDevices);
}
// Print out detected CPU topology
if (outputToCsv)
{
printf("NUMA");
for (int j = 0; j < numCpuDevices; j++)
printf(",NUMA%02d", j);
printf(",# CPUs,ClosestGPUs,ActualNode\n");
}
else
{
printf(" |");
for (int j = 0; j < numCpuDevices; j++)
printf("NUMA %02d|", j);
printf(" #Cpus | Closest GPU(s)\n");
printf("------------+");
for (int j = 0; j <= numCpuDevices; j++)
printf("-------+");
printf("---------------\n");
}
for (int i = 0; i < numCpuDevices; i++)
{
int nodeI = RemappedIndex(i, true);
printf("NUMA %02d (%02d)%s", i, nodeI, outputToCsv ? "," : "|");
for (int j = 0; j < numCpuDevices; j++)
{
int nodeJ = RemappedIndex(j, true);
int numaDist = numa_distance(nodeI, nodeJ);
if (outputToCsv)
printf("%d,", numaDist);
else
printf(" %5d |", numaDist);
}
int numCpus = 0;
for (int j = 0; j < numa_num_configured_cpus(); j++)
if (numa_node_of_cpu(j) == nodeI) numCpus++;
if (outputToCsv)
printf("%d,", numCpus);
else
printf(" %5d | ", numCpus);
#if !defined(__NVCC__)
bool isFirst = true;
for (int j = 0; j < numGpuDevices; j++)
{
if (GetClosestNumaNode(RemappedIndex(j, false)) == i)
{
if (isFirst) isFirst = false;
else printf(",");
printf("%d", j);
}
}
#endif
printf("\n");
}
printf("\n");
#if defined(__NVCC__)
for (int i = 0; i < numGpuDevices; i++)
{
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, i));
printf(" GPU %02d | %s\n", i, prop.name);
}
// No further topology detection done for NVIDIA platforms
return;
#else
// Figure out DMA engines per GPU
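// Each set bit in the mask returned by hsa_amd_memory_copy_engine_status identifies an SDMA engine
// capable of copying from the source GPU agent to the given destination agent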
std::vector<std::set<int>> dmaEngineIdsPerDevice(numGpuDevices);
{
std::vector<hsa_agent_t> gpuAgentList;
std::vector<hsa_agent_t> allAgentList;
hsa_amd_pointer_info_t info;
info.size = sizeof(info);
for (int deviceId = 0; deviceId < numGpuDevices; deviceId++)
{
HIP_CALL(hipSetDevice(deviceId));
int32_t* tempGpuBuffer;
HIP_CALL(hipMalloc((void**)&tempGpuBuffer, 1024));
HSA_CHECK(hsa_amd_pointer_info(tempGpuBuffer, &info, NULL, NULL, NULL));
gpuAgentList.push_back(info.agentOwner);
allAgentList.push_back(info.agentOwner);
HIP_CALL(hipFree(tempGpuBuffer));
}
for (int deviceId = 0; deviceId < numCpuDevices; deviceId++)
{
int32_t* tempCpuBuffer;
AllocateMemory(MEM_CPU, deviceId, 1024, (void**)&tempCpuBuffer);
HSA_CHECK(hsa_amd_pointer_info(tempCpuBuffer, &info, NULL, NULL, NULL));
allAgentList.push_back(info.agentOwner);
DeallocateMemory(MEM_CPU, tempCpuBuffer, 1024);
}
for (int srcDevice = 0; srcDevice < numGpuDevices; srcDevice++)
{
dmaEngineIdsPerDevice[srcDevice].clear();
for (int dstDevice = 0; dstDevice < allAgentList.size(); dstDevice++)
{
if (srcDevice == dstDevice) continue;
uint32_t engineIdMask = 0;
if (hsa_amd_memory_copy_engine_status(allAgentList[dstDevice],
gpuAgentList[srcDevice],
&engineIdMask) != HSA_STATUS_SUCCESS)
continue;
for (int engineId = 0; engineId < 32; engineId++)
{
if (engineIdMask & (1U << engineId))
dmaEngineIdsPerDevice[srcDevice].insert(engineId);
}
}
}
}
// Print out detected GPU topology
if (outputToCsv)
{
printf("GPU");
for (int j = 0; j < numGpuDevices; j++)
printf(",GPU %02d", j);
printf(",PCIe Bus ID,ClosestNUMA,DMA engines\n");
}
else
{
printf(" |");
for (int j = 0; j < numGpuDevices; j++)
{
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, j));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
printf(" %6s |", archName.c_str());
}
printf("\n");
printf(" |");
for (int j = 0; j < numGpuDevices; j++)
printf(" GPU %02d |", j);
printf(" PCIe Bus ID | #CUs | Closest NUMA | DMA engines\n");
for (int j = 0; j <= numGpuDevices; j++)
printf("--------+");
printf("--------------+------+-------------+------------\n");
}
char pciBusId[20];
for (int i = 0; i < numGpuDevices; i++)
{
int const deviceIdx = RemappedIndex(i, false);
printf("%sGPU %02d%s", outputToCsv ? "" : " ", i, outputToCsv ? "," : " |");
for (int j = 0; j < numGpuDevices; j++)
{
if (i == j)
{
if (outputToCsv)
printf("-,");
else
printf(" - |");
}
else
{
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(deviceIdx,
RemappedIndex(j, false),
&linkType, &hopCount));
printf("%s%s-%d%s",
outputToCsv ? "" : " ",
linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? " HT" :
linkType == HSA_AMD_LINK_INFO_TYPE_QPI ? " QPI" :
linkType == HSA_AMD_LINK_INFO_TYPE_PCIE ? "PCIE" :
linkType == HSA_AMD_LINK_INFO_TYPE_INFINBAND ? "INFB" :
linkType == HSA_AMD_LINK_INFO_TYPE_XGMI ? "XGMI" : "????",
hopCount, outputToCsv ? "," : " |");
}
}
HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, deviceIdx));
int numDeviceCUs = 0;
HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, deviceIdx));
if (outputToCsv)
printf("%s,%d,%d,", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx));
else
{
printf(" %11s | %4d | %-12d |", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx));
bool isFirst = true;
for (auto x : dmaEngineIdsPerDevice[deviceIdx])
{
if (isFirst) isFirst = false; else printf(",");
printf("%d", x);
}
printf("\n");
}
}
// Check that large BAR is enabled on all GPUs
for (int i = 0; i < numGpuDevices; i++) {
int const deviceIdx = RemappedIndex(i, false);
int isLargeBar = 0;
HIP_CALL(hipDeviceGetAttribute(&isLargeBar, hipDeviceAttributeIsLargeBar, deviceIdx));
if (!isLargeBar)
printf("[WARN] Large BAR is not enabled for GPU %d in BIOS. This may result in segfaults\n", i);
}
#endif
}
void ParseMemType(EnvVars const& ev, std::string const& token,
std::vector<MemType>& memTypes, std::vector<int>& memIndices)
{
char typeChar;
int offset = 0, devIndex, inc;
bool found = false;
memTypes.clear();
memIndices.clear();
while (sscanf(token.c_str() + offset, " %c %d%n", &typeChar, &devIndex, &inc) == 2)
{
offset += inc;
MemType memType = CharToMemType(typeChar);
if (IsCpuType(memType) && (devIndex < 0 || devIndex >= ev.numCpuDevices))
{
printf("[ERROR] CPU index must be between 0 and %d (instead of %d)\n", ev.numCpuDevices-1, devIndex);
exit(1);
}
if (IsGpuType(memType) && (devIndex < 0 || devIndex >= ev.numGpuDevices))
{
printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", ev.numGpuDevices-1, devIndex);
exit(1);
}
found = true;
if (memType != MEM_NULL)
{
memTypes.push_back(memType);
memIndices.push_back(devIndex);
}
}
if (!found)
{
printf("[ERROR] Unable to parse memory type token %s. Expected one of %s followed by an index\n",
token.c_str(), MemTypeStr);
exit(1);
}
}
void ParseExeType(EnvVars const& ev, std::string const& token,
ExeType &exeType, int& exeIndex, int& exeSubIndex)
{
char typeChar;
exeSubIndex = -1;
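// Executor tokens have the form <type><index>[.<subIndex>], e.g. (illustrative) "D0.1" targets
// DMA engine 1 on GPU 0; the sub-index is optional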
int numTokensParsed = sscanf(token.c_str(), " %c%d.%d", &typeChar, &exeIndex, &exeSubIndex);
if (numTokensParsed < 2)
{
printf("[ERROR] Unable to parse valid executor token (%s). Exepected one of %s followed by an index\n",
token.c_str(), ExeTypeStr);
exit(1);
}
exeType = CharToExeType(typeChar);
if (IsCpuType(exeType) && (exeIndex < 0 || exeIndex >= ev.numCpuDevices))
{
printf("[ERROR] CPU index must be between 0 and %d (instead of %d)\n", ev.numCpuDevices-1, exeIndex);
exit(1);
}
if (IsGpuType(exeType) && (exeIndex < 0 || exeIndex >= ev.numGpuDevices))
{
printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", ev.numGpuDevices-1, exeIndex);
exit(1);
}
if (exeType == EXE_GPU_GFX && exeSubIndex != -1)
{
int const idx = RemappedIndex(exeIndex, false);
if (ev.xccIdsPerDevice[idx].count(exeSubIndex) == 0)
{
printf("[ERROR] GPU %d does not have subIndex %d\n", exeIndex, exeSubIndex);
exit(1);
}
}
}
// Helper function to parse a list of Transfer definitions
void ParseTransfers(EnvVars const& ev, char* line, std::vector<Transfer>& transfers)
{
// Replace any round brackets or '->' with spaces,
for (int i = 1; line[i]; i++)
if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == '>' ) line[i] = ' ';
transfers.clear();
int numTransfers = 0;
std::istringstream iss(line);
iss >> numTransfers;
if (iss.fail()) return;
std::string exeMem;
std::string srcMem;
std::string dstMem;
// If numTransfers < 0, read 5-tuple (srcMem, exeMem, dstMem, #CUs, #Bytes)
// otherwise read triples (srcMem, exeMem, dstMem)
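// e.g. (illustrative) "1 4 (G0->G0->G1)" describes one Transfer using 4 sub-executors, while
// "-1 (G0->G0->G1 4 1M)" also specifies the sub-executor count and byte count per Transfer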
bool const advancedMode = (numTransfers < 0);
numTransfers = abs(numTransfers);
int numSubExecs;
if (!advancedMode)
{
iss >> numSubExecs;
if (numSubExecs < 0 || iss.fail())
{
printf("Parsing error: Number of blocks to use (%d) must be non-negative\n", numSubExecs);
exit(1);
}
}
size_t numBytes = 0;
for (int i = 0; i < numTransfers; i++)
{
Transfer transfer;
transfer.numBytes = 0;
transfer.numBytesActual = 0;
if (!advancedMode)
{
iss >> srcMem >> exeMem >> dstMem;
if (iss.fail())
{
printf("Parsing error: Unable to read valid Transfer %d (SRC EXE DST) triplet\n", i+1);
exit(1);
}
}
else
{
std::string numBytesToken;
iss >> srcMem >> exeMem >> dstMem >> numSubExecs >> numBytesToken;
if (iss.fail())
{
printf("Parsing error: Unable to read valid Transfer %d (SRC EXE DST #CU #Bytes) tuple\n", i+1);
exit(1);
}
if (sscanf(numBytesToken.c_str(), "%lu", &numBytes) != 1)
{
printf("Parsing error: '%s' is not a valid expression of numBytes for Transfer %d\n", numBytesToken.c_str(), i+1);
exit(1);
}
char units = numBytesToken.back();
switch (toupper(units))
{
case 'K': numBytes *= 1024; break;
case 'M': numBytes *= 1024*1024; break;
case 'G': numBytes *= 1024*1024*1024; break;
}
}
ParseMemType(ev, srcMem, transfer.srcType, transfer.srcIndex);
ParseMemType(ev, dstMem, transfer.dstType, transfer.dstIndex);
ParseExeType(ev, exeMem, transfer.exeType, transfer.exeIndex, transfer.exeSubIndex);
transfer.numSrcs = (int)transfer.srcType.size();
transfer.numDsts = (int)transfer.dstType.size();
if (transfer.numSrcs == 0 && transfer.numDsts == 0)
{
printf("[ERROR] Transfer must have at least one src or dst\n");
exit(1);
}
if (transfer.exeType == EXE_GPU_DMA && (transfer.numSrcs != 1 || transfer.numDsts != 1))
{
printf("[ERROR] GPU DMA executor can only be used for single source + single dst copies\n");
exit(1);
}
transfer.numSubExecs = numSubExecs;
transfer.numBytes = numBytes;
transfers.push_back(transfer);
}
}
void EnablePeerAccess(int const deviceId, int const peerDeviceId)
{
int canAccess;
HIP_CALL(hipDeviceCanAccessPeer(&canAccess, deviceId, peerDeviceId));
if (!canAccess)
{
printf("[ERROR] Unable to enable peer access from GPU devices %d to %d\n", peerDeviceId, deviceId);
exit(1);
}
HIP_CALL(hipSetDevice(deviceId));
hipError_t error = hipDeviceEnablePeerAccess(peerDeviceId, 0);
if (error != hipSuccess && error != hipErrorPeerAccessAlreadyEnabled)
{
printf("[ERROR] Unable to enable peer to peer access from %d to %d (%s)\n",
deviceId, peerDeviceId, hipGetErrorString(error));
exit(1);
}
}
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr)
{
if (numBytes == 0)
{
printf("[ERROR] Unable to allocate 0 bytes\n");
exit(1);
}
*memPtr = nullptr;
if (IsCpuType(memType))
{
// Set numa policy prior to call to hipHostMalloc
numa_set_preferred(devIndex);
// Allocate host-pinned memory (should respect NUMA mem policy)
if (memType == MEM_CPU_FINE)
{
#if defined (__NVCC__)
printf("[ERROR] Fine-grained CPU memory not supported on NVIDIA platform\n");
exit(1);
#else
HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser));
#endif
}
else if (memType == MEM_CPU)
{
#if defined (__NVCC__)
if (hipHostMalloc((void **)memPtr, numBytes, 0) != hipSuccess)
#else
if (hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent) != hipSuccess)
#endif
{
printf("[ERROR] Unable to allocate non-coherent host memory on NUMA node %d\n", devIndex);
exit(1);
}
}
else if (memType == MEM_CPU_UNPINNED)
{
*memPtr = numa_alloc_onnode(numBytes, devIndex);
}
// Check that the allocated pages are actually on the correct NUMA node
memset(*memPtr, 0, numBytes);
CheckPages((char*)*memPtr, numBytes, devIndex);
// Reset to default numa mem policy
numa_set_preferred(-1);
}
else if (IsGpuType(memType))
{
if (memType == MEM_GPU)
{
// Allocate GPU memory on appropriate device
HIP_CALL(hipSetDevice(devIndex));
HIP_CALL(hipMalloc((void**)memPtr, numBytes));
}
else if (memType == MEM_GPU_FINE)
{
#if defined (__NVCC__)
printf("[ERROR] Fine-grained GPU memory not supported on NVIDIA platform\n");
exit(1);
#else
HIP_CALL(hipSetDevice(devIndex));
int flag = hipDeviceMallocUncached;
HIP_CALL(hipExtMallocWithFlags((void**)memPtr, numBytes, flag));
#endif
}
else if (memType == MEM_MANAGED)
{
HIP_CALL(hipSetDevice(devIndex));
HIP_CALL(hipMallocManaged((void**)memPtr, numBytes));
}
HIP_CALL(hipMemset(*memPtr, 0, numBytes));
HIP_CALL(hipDeviceSynchronize());
}
else
{
printf("[ERROR] Unsupported memory type %d\n", memType);
exit(1);
}
}
void DeallocateMemory(MemType memType, void* memPtr, size_t const bytes)
{
if (memType == MEM_CPU || memType == MEM_CPU_FINE)
{
if (memPtr == nullptr)
{
printf("[ERROR] Attempting to free null CPU pointer for %lu bytes. Skipping hipHostFree\n", bytes);
return;
}
HIP_CALL(hipHostFree(memPtr));
}
else if (memType == MEM_CPU_UNPINNED)
{
if (memPtr == nullptr)
{
printf("[ERROR] Attempting to free null unpinned CPU pointer for %lu bytes. Skipping numa_free\n", bytes);
return;
}
numa_free(memPtr, bytes);
}
else if (memType == MEM_GPU || memType == MEM_GPU_FINE)
{
if (memPtr == nullptr)
{
printf("[ERROR] Attempting to free null GPU pointer for %lu bytes. Skipping hipFree\n", bytes);
return;
}
HIP_CALL(hipFree(memPtr));
}
else if (memType == MEM_MANAGED)
{
if (memPtr == nullptr)
{
printf("[ERROR] Attempting to free null managed pointer for %lu bytes. Skipping hipMFree\n", bytes);
return;
}
HIP_CALL(hipFree(memPtr));
}
}
void CheckPages(char* array, size_t numBytes, int targetId)
{
unsigned long const pageSize = getpagesize();
unsigned long const numPages = (numBytes + pageSize - 1) / pageSize;
std::vector<void *> pages(numPages);
std::vector<int> status(numPages);
pages[0] = array;
for (int i = 1; i < numPages; i++)
{
pages[i] = (char*)pages[i-1] + pageSize;
}
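// move_pages() with a NULL target node list only queries the NUMA node each page currently resides on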
long const retCode = move_pages(0, numPages, pages.data(), NULL, status.data(), 0);
if (retCode)
{
printf("[ERROR] Unable to collect page info\n");
exit(1);
}
size_t mistakeCount = 0;
for (int i = 0; i < numPages; i++)
{
if (status[i] < 0)
{
printf("[ERROR] Unexpected page status %d for page %d\n", status[i], i);
exit(1);
}
if (status[i] != targetId) mistakeCount++;
}
if (mistakeCount > 0)
{
printf("[ERROR] %lu out of %lu pages for memory allocation were not on NUMA node %d\n", mistakeCount, numPages, targetId);
exit(1);
}
}
uint32_t GetId(uint32_t hwId)
{
#if defined(__NVCC__)
return hwId;
#else
// Based on instinct-mi200-cdna2-instruction-set-architecture.pdf
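// Packs the SH/CU/SE fields of the HW_ID register into a single linear id,
// e.g. shId=1, cuId=5, seId=2 -> (1<<5) + (5<<2) + 2 = 54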
int const shId = (hwId >> 12) & 1;
int const cuId = (hwId >> 8) & 15;
int const seId = (hwId >> 13) & 3;
return (shId << 5) + (cuId << 2) + seId;
#endif
}
void RunTransfer(EnvVars const& ev, int const iteration,
ExecutorInfo& exeInfo, int const transferIdx)
{
Transfer* transfer = exeInfo.transfers[transferIdx];
if (transfer->exeType == EXE_GPU_GFX)
{
// Switch to executing GPU
int const exeIndex = RemappedIndex(transfer->exeIndex, false);
HIP_CALL(hipSetDevice(exeIndex));
hipStream_t& stream = exeInfo.streams[transferIdx];
hipEvent_t& startEvent = exeInfo.startEvents[transferIdx];
hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx];
// Figure out how many threadblocks to use.
// In single stream mode, all the threadblocks for this GPU are launched
// Otherwise, just launch the threadblocks associated with this single Transfer
int const numBlocksToRun = ev.useSingleStream ? exeInfo.totalSubExecs : transfer->numSubExecs;
int const numXCCs = (ev.useXccFilter ? ev.xccIdsPerDevice[exeIndex].size() : 1);
dim3 const gridSize(numXCCs, numBlocksToRun, 1);
dim3 const blockSize(ev.gfxBlockSize, 1, 1);
#if defined(__NVCC__)
HIP_CALL(hipEventRecord(startEvent, stream));
GpuKernelTable[ev.gfxBlockSize/64 - 1][ev.gfxUnroll - 1]
<<<gridSize, blockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr, ev.gfxWaveOrder, ev.numSubIterations);
HIP_CALL(hipEventRecord(stopEvent, stream));
#else
hipExtLaunchKernelGGL(GpuKernelTable[ev.gfxBlockSize/64 - 1][ev.gfxUnroll - 1],
gridSize, blockSize,
ev.sharedMemBytes, stream,
startEvent, stopEvent,
0, transfer->subExecParamGpuPtr, ev.gfxWaveOrder, ev.numSubIterations);
#endif
// Synchronize per iteration, unless in single sync mode, in which case
// synchronize during last warmup / last actual iteration
HIP_CALL(hipStreamSynchronize(stream));
if (iteration >= 0)
{
// Record GPU timing
float gpuDeltaMsec;
HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
if (ev.useSingleStream)
{
// Figure out individual timings for Transfers that were all launched together
for (Transfer* currTransfer : exeInfo.transfers)
{
long long minStartCycle = std::numeric_limits<long long>::max();
long long maxStopCycle = std::numeric_limits<long long>::min();
std::set<std::pair<int,int>> CUs;
for (auto subExecIdx : currTransfer->subExecIdx)
{
minStartCycle = std::min(minStartCycle, exeInfo.subExecParamGpu[subExecIdx].startCycle);
maxStopCycle = std::max(maxStopCycle, exeInfo.subExecParamGpu[subExecIdx].stopCycle);
if (ev.showIterations)
CUs.insert(std::make_pair(exeInfo.subExecParamGpu[subExecIdx].xccId,
GetId(exeInfo.subExecParamGpu[subExecIdx].hwId)));
}
int const wallClockRate = ev.wallClockPerDeviceMhz[exeIndex];
double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate);
currTransfer->transferTime += iterationTimeMs;
if (ev.showIterations)
{
currTransfer->perIterationTime.push_back(iterationTimeMs);
currTransfer->perIterationCUs.push_back(CUs);
}
}
exeInfo.totalTime += gpuDeltaMsec;
}
else
{
transfer->transferTime += gpuDeltaMsec;
if (ev.showIterations)
{
transfer->perIterationTime.push_back(gpuDeltaMsec);
std::set<std::pair<int,int>> CUs;
for (int i = 0; i < transfer->numSubExecs; i++)
CUs.insert(std::make_pair(transfer->subExecParamGpuPtr[i].xccId,
GetId(transfer->subExecParamGpuPtr[i].hwId)));
transfer->perIterationCUs.push_back(CUs);
}
}
}
}
else if (transfer->exeType == EXE_GPU_DMA)
{
int const exeIndex = RemappedIndex(transfer->exeIndex, false);
int subIterations = 0;
if (transfer->exeSubIndex == -1 && !ev.useHsaDma) {
// Switch to executing GPU
HIP_CALL(hipSetDevice(exeIndex));
hipStream_t& stream = exeInfo.streams[transferIdx];
hipEvent_t& startEvent = exeInfo.startEvents[transferIdx];
hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx];
HIP_CALL(hipEventRecord(startEvent, stream));
do {
HIP_CALL(hipMemcpyAsync(transfer->dstMem[0], transfer->srcMem[0],
transfer->numBytesActual, hipMemcpyDefault,
stream));
} while (++subIterations != ev.numSubIterations);
HIP_CALL(hipEventRecord(stopEvent, stream));
HIP_CALL(hipStreamSynchronize(stream));
// Record time based on HIP events
if (iteration >= 0) {
float gpuDeltaMsec;
HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
transfer->transferTime += gpuDeltaMsec;
if (ev.showIterations)
transfer->perIterationTime.push_back(gpuDeltaMsec);
}
} else {
#if defined(__NVCC__)
printf("[ERROR] CUDA does not support targeting specific DMA engines\n");
exit(1);
#else
// Use hsa_amd_memory copy (either targeted or untargeted)
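// The completion signal is set to 1 before each copy; the copy engine decrements it to 0 when the
// copy finishes, which is what the hsa_signal_wait_scacquire loop below waits for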
auto cpuStart = std::chrono::high_resolution_clock::now();
do {
// Atomically set signal to 1
HSA_CALL(hsa_signal_store_screlease(transfer->signal, 1));
if (ev.useHsaDma) {
HSA_CALL(hsa_amd_memory_async_copy(transfer->dstMem[0], transfer->dstAgent,
transfer->srcMem[0], transfer->srcAgent,
transfer->numBytesActual, 0, NULL,
transfer->signal));
} else {
HSA_CALL(hsa_amd_memory_async_copy_on_engine(transfer->dstMem[0], transfer->dstAgent,
transfer->srcMem[0], transfer->srcAgent,
transfer->numBytesActual, 0, NULL,
transfer->signal,
transfer->sdmaEngineId, true));
}
// Wait for SDMA transfer to complete
while(hsa_signal_wait_scacquire(transfer->signal,
HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
HSA_WAIT_STATE_ACTIVE) >= 1);
} while (++subIterations < ev.numSubIterations);
if (iteration >= 0) {
// Record GPU timing
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
transfer->transferTime += deltaMsec;
if (ev.showIterations)
transfer->perIterationTime.push_back(deltaMsec);
}
#endif
}
}
else if (transfer->exeType == EXE_CPU) // CPU execution agent
{
// Force this thread and all child threads onto correct NUMA node
int const exeIndex = RemappedIndex(transfer->exeIndex, true);
if (numa_run_on_node(exeIndex))
{
printf("[ERROR] Unable to set CPU to NUMA node %d\n", exeIndex);
exit(1);
}
std::vector<std::thread> childThreads;
int subIteration = 0;
auto cpuStart = std::chrono::high_resolution_clock::now();
do {
// Launch each subExecutor in child-threads to perform memcopies
for (int i = 0; i < transfer->numSubExecs; ++i)
childThreads.push_back(std::thread(CpuReduceKernel, std::ref(transfer->subExecParam[i])));
// Wait for child-threads to finish
for (int i = 0; i < transfer->numSubExecs; ++i)
childThreads[i].join();
childThreads.clear();
} while (++subIteration != ev.numSubIterations);
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
// Record time if not a warmup iteration
if (iteration >= 0)
{
double const delta = (std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0);
transfer->transferTime += delta;
if (ev.showIterations)
transfer->perIterationTime.push_back(delta);
}
}
}
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
{
ev.DisplayP2PBenchmarkEnvVars();
char const separator = ev.outputToCsv ? ',' : ' ';
printf("Bytes Per Direction%c%lu\n", separator, N * sizeof(float));
// Collect the number of available CPUs/GPUs on this machine
int const numCpus = ev.numCpuDevices;
int const numGpus = ev.numGpuDevices;
int const numDevices = numCpus + numGpus;
// Enable peer to peer for each GPU
for (int i = 0; i < numGpus; i++)
for (int j = 0; j < numGpus; j++)
if (i != j) EnablePeerAccess(i, j);
// Perform unidirectional / bidirectional
for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++)
{
if (ev.p2pMode == 1 && isBidirectional == 1 ||
ev.p2pMode == 2 && isBidirectional == 0) continue;
printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n", isBidirectional ? "Bi" : "Uni",
ev.useRemoteRead ? "Remote" : "Local",
ev.useRemoteRead ? "Local" : "Remote",
ev.useDmaCopy ? "DMA" : "GFX");
// Print header
if (isBidirectional)
{
printf("%12s", "SRC\\DST");
}
else
{
if (ev.useRemoteRead)
printf("%12s", "SRC\\EXE+DST");
else
printf("%12s", "SRC+EXE\\DST");
}
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numCpus; i++)
{
printf("%7s %02d", "CPU", i);
if (ev.outputToCsv) printf(",");
}
if (numCpus > 0) printf(" ");
for (int i = 0; i < numGpus; i++)
{
printf("%7s %02d", "GPU", i);
if (ev.outputToCsv) printf(",");
}
printf("\n");
double avgBwSum[2][2] = {};
int avgCount[2][2] = {};
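// Accumulators for the summary table: bandwidth sums and sample counts indexed by
// [source memory type][destination memory type] (CPU vs GPU)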
ExeType const gpuExeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
// Loop over all possible src/dst pairs
for (int src = 0; src < numDevices; src++)
{
MemType const srcType = (src < numCpus ? MEM_CPU : MEM_GPU);
int const srcIndex = (srcType == MEM_CPU ? src : src - numCpus);
MemType const srcTypeActual = ((ev.useFineGrain && srcType == MEM_CPU) ? MEM_CPU_FINE :
(ev.useFineGrain && srcType == MEM_GPU) ? MEM_GPU_FINE :
srcType);
std::vector<std::vector<double>> avgBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> minBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> maxBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> stdDev(isBidirectional + 1);
if (src == numCpus && src != 0) printf("\n");
for (int dst = 0; dst < numDevices; dst++)
{
MemType const dstType = (dst < numCpus ? MEM_CPU : MEM_GPU);
int const dstIndex = (dstType == MEM_CPU ? dst : dst - numCpus);
MemType const dstTypeActual = ((ev.useFineGrain && dstType == MEM_CPU) ? MEM_CPU_FINE :
(ev.useFineGrain && dstType == MEM_GPU) ? MEM_GPU_FINE :
dstType);
// Prepare Transfers
std::vector<Transfer> transfers(isBidirectional + 1);
// SRC -> DST
transfers[0].numBytes = N * sizeof(float);
transfers[0].srcType.push_back(srcTypeActual);
transfers[0].dstType.push_back(dstTypeActual);
transfers[0].srcIndex.push_back(srcIndex);
transfers[0].dstIndex.push_back(dstIndex);
transfers[0].numSrcs = transfers[0].numDsts = 1;
transfers[0].exeType = IsGpuType(ev.useRemoteRead ? dstType : srcType) ? gpuExeType : EXE_CPU;
transfers[0].exeIndex = (ev.useRemoteRead ? dstIndex : srcIndex);
transfers[0].exeSubIndex = -1;
transfers[0].numSubExecs = IsGpuType(transfers[0].exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs;
// DST -> SRC
if (isBidirectional)
{
transfers[1].numBytes = N * sizeof(float);
transfers[1].numSrcs = transfers[1].numDsts = 1;
transfers[1].srcType.push_back(dstTypeActual);
transfers[1].dstType.push_back(srcTypeActual);
transfers[1].srcIndex.push_back(dstIndex);
transfers[1].dstIndex.push_back(srcIndex);
transfers[1].exeType = IsGpuType(ev.useRemoteRead ? srcType : dstType) ? gpuExeType : EXE_CPU;
transfers[1].exeIndex = (ev.useRemoteRead ? srcIndex : dstIndex);
transfers[1].exeSubIndex = -1;
transfers[1].numSubExecs = IsGpuType(transfers[1].exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs;
}
bool skipTest = false;
// Skip tests that would execute on a NUMA node with no CPUs
for (int i = 0; i <= isBidirectional; i++)
{
if (transfers[i].exeType == EXE_CPU && ev.numCpusPerNuma[transfers[i].exeIndex] == 0)
{
skipTest = true;
break;
}
#if defined(__NVCC__)
// NVIDIA platform cannot access GPU memory directly from CPU executors
if (transfers[i].exeType == EXE_CPU && (IsGpuType(srcType) || IsGpuType(dstType)))
{
skipTest = true;
break;
}
#endif
}
if (isBidirectional && srcType == dstType && srcIndex == dstIndex) skipTest = true;
if (!skipTest)
{
ExecuteTransfers(ev, 0, N, transfers, false);
for (int dir = 0; dir <= isBidirectional; dir++)
{
double const avgTime = transfers[dir].transferTime;
double const avgBw = (transfers[dir].numBytesActual / 1.0E9) / avgTime * 1000.0f;
avgBandwidth[dir].push_back(avgBw);
if (!(srcType == dstType && srcIndex == dstIndex))
{
avgBwSum[srcType][dstType] += avgBw;
avgCount[srcType][dstType]++;
}
if (ev.showIterations)
{
double minTime = transfers[dir].perIterationTime[0];
double maxTime = transfers[dir].perIterationTime[0];
double varSum = 0;
for (int i = 0; i < transfers[dir].perIterationTime.size(); i++)
{
minTime = std::min(minTime, transfers[dir].perIterationTime[i]);
maxTime = std::max(maxTime, transfers[dir].perIterationTime[i]);
double const bw = (transfers[dir].numBytesActual / 1.0E9) / transfers[dir].perIterationTime[i] * 1000.0f;
double const delta = (avgBw - bw);
varSum += delta * delta;
}
double const minBw = (transfers[dir].numBytesActual / 1.0E9) / maxTime * 1000.0f;
double const maxBw = (transfers[dir].numBytesActual / 1.0E9) / minTime * 1000.0f;
double const stdev = sqrt(varSum / transfers[dir].perIterationTime.size());
minBandwidth[dir].push_back(minBw);
maxBandwidth[dir].push_back(maxBw);
stdDev[dir].push_back(stdev);
}
}
}
else
{
for (int dir = 0; dir <= isBidirectional; dir++)
{
avgBandwidth[dir].push_back(0);
minBandwidth[dir].push_back(0);
maxBandwidth[dir].push_back(0);
stdDev[dir].push_back(-1.0);
}
}
}
for (int dir = 0; dir <= isBidirectional; dir++)
{
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, dir ? "<- " : " ->");
if (ev.outputToCsv) printf(",");
for (int dst = 0; dst < numDevices; dst++)
{
if (dst == numCpus && dst != 0) printf(" ");
double const avgBw = avgBandwidth[dir][dst];
if (avgBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", avgBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
if (ev.showIterations)
{
// minBw
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "min");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++)
{
double const minBw = minBandwidth[dir][i];
if (i == numCpus && i != 0) printf(" ");
if (minBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", minBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
// maxBw
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "max");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++)
{
double const maxBw = maxBandwidth[dir][i];
if (i == numCpus && i != 0) printf(" ");
if (maxBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", maxBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
// stddev
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, " sd");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++)
{
double const sd = stdDev[dir][i];
if (i == numCpus && i != 0) printf(" ");
if (sd == -1.0)
printf("%10s", "N/A");
else
printf("%10.2f", sd);
if (ev.outputToCsv) printf(",");
}
printf("\n");
}
fflush(stdout);
}
if (isBidirectional)
{
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "<->");
if (ev.outputToCsv) printf(",");
for (int dst = 0; dst < numDevices; dst++)
{
double const sumBw = avgBandwidth[0][dst] + avgBandwidth[1][dst];
if (dst == numCpus && dst != 0) printf(" ");
if (sumBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", sumBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
if (src < numDevices - 1) printf("\n");
}
}
if (!ev.outputToCsv)
{
printf(" ");
for (int srcType : {MEM_CPU, MEM_GPU})
for (int dstType : {MEM_CPU, MEM_GPU})
printf(" %cPU->%cPU", srcType == MEM_CPU ? 'C' : 'G', dstType == MEM_CPU ? 'C' : 'G');
printf("\n");
printf("Averages (During %s):", isBidirectional ? " BiDir" : "UniDir");
for (int srcType : {MEM_CPU, MEM_GPU})
for (int dstType : {MEM_CPU, MEM_GPU})
{
if (avgCount[srcType][dstType])
printf("%10.2f", avgBwSum[srcType][dstType] / avgCount[srcType][dstType]);
else
printf("%10s", "N/A");
}
printf("\n\n");
}
}
}
void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int const maxSubExecs)
{
ev.DisplayEnvVars();
// Collect the number of available CPUs/GPUs on this machine
int const numCpus = ev.numCpuDevices;
int const numGpus = ev.numGpuDevices;
int const numDevices = numCpus + numGpus;
// Enable peer to peer for each GPU
for (int i = 0; i < numGpus; i++)
for (int j = 0; j < numGpus; j++)
if (i != j) EnablePeerAccess(i, j);
char separator = (ev.outputToCsv ? ',' : ' ');
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.numBytes = N * sizeof(float);
t.numSrcs = 1;
t.numDsts = 1;
t.exeType = EXE_GPU_GFX;
t.exeIndex = exeIndex;
t.exeSubIndex = -1;
t.srcType.resize(1, MEM_GPU);
t.dstType.resize(1, MEM_GPU);
t.srcIndex.resize(1);
t.dstIndex.resize(1);
printf("GPU-GFX Scaling benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes from GPU %d to other devices\n", t.numBytes, exeIndex);
printf("- All numbers reported as GB/sec\n\n");
printf("NumCUs");
for (int i = 0; i < numDevices; i++)
printf("%c %s%02d ", separator, i < numCpus ? "CPU" : "GPU", i < numCpus ? i : i - numCpus);
printf("\n");
std::vector<std::pair<double, int>> bestResult(numDevices);
for (int numSubExec = 1; numSubExec <= maxSubExecs; numSubExec++)
{
t.numSubExecs = numSubExec;
printf("%4d ", numSubExec);
for (int i = 0; i < numDevices; i++)
{
t.dstType[0] = i < numCpus ? MEM_CPU : MEM_GPU;
t.dstIndex[0] = i < numCpus ? i : i - numCpus;
ExecuteTransfers(ev, 0, N, transfers, false);
printf("%c%7.2f ", separator, t.transferBandwidth);
if (t.transferBandwidth > bestResult[i].first)
{
bestResult[i].first = t.transferBandwidth;
bestResult[i].second = numSubExec;
}
}
printf("\n");
}
printf(" Best ");
for (int i = 0; i < numDevices; i++)
{
printf("%c%7.2f(%3d)", separator, bestResult[i].first, bestResult[i].second);
}
printf("\n");
}
void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const numSubExecs)
{
ev.DisplayA2AEnvVars();
// Collect the number of GPU devices to use
int const numGpus = ev.numGpuDevices;
// Enable peer to peer for each GPU
for (int i = 0; i < numGpus; i++)
for (int j = 0; j < numGpus; j++)
if (i != j) EnablePeerAccess(i, j);
char separator = (ev.outputToCsv ? ',' : ' ');
Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
transfer.numSubExecs = numSubExecs;
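// a2aMode selects the access pattern: 1 = read-only (no destinations), 2 = write-only (no sources), otherwise a full copy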
transfer.numSrcs = ev.a2aMode == 2 ? 0 : 1;
transfer.numDsts = ev.a2aMode == 1 ? 0 : 1;
transfer.exeType = (ev.useDmaCopy && transfer.numSrcs == 1 && transfer.numDsts == 1) ? EXE_GPU_DMA : EXE_GPU_GFX;
transfer.exeSubIndex = -1;
transfer.srcType.resize(1, ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
transfer.dstType.resize(1, ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
transfer.srcIndex.resize(1);
transfer.dstIndex.resize(1);
std::vector<Transfer> transfers;
for (int i = 0; i < numGpus; i++)
{
transfer.srcIndex[0] = i;
for (int j = 0; j < numGpus; j++)
{
transfer.dstIndex[0] = j;
transfer.exeIndex = (ev.useRemoteRead ? j : i);
if (ev.a2aDirect)
{
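// a2aDirect restricts the benchmark to GPU pairs connected by a single hop (self-copies are always excluded)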
if (i == j) continue;
#if !defined(__NVCC__)
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(i, false),
RemappedIndex(j, false),
&linkType, &hopCount));
if (hopCount != 1) continue;
#endif
}
transfers.push_back(transfer);
}
}
printf("GPU-GFX All-To-All benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes between %s pairs of GPUs using %d CUs (%lu Transfers)\n",
numBytesPerTransfer, ev.a2aDirect ? "directly connected" : "all", numSubExecs, transfers.size());
if (transfers.size() == 0) return;
double totalBandwidthCpu = 0;
ExecuteTransfers(ev, 0, numBytesPerTransfer / sizeof(float), transfers, !ev.hideEnv, &totalBandwidthCpu);
printf("\nSummary:\n");
printf("==========================================================\n");
printf("SRC\\DST ");
for (int dst = 0; dst < numGpus; dst++)
printf("%cGPU %02d ", separator, dst);
printf(" %cSTotal %cActual\n", separator, separator);
std::map<std::pair<int, int>, int> reIndex;
for (int i = 0; i < transfers.size(); i++)
{
Transfer const& t = transfers[i];
reIndex[std::make_pair(t.srcIndex[0], t.dstIndex[0])] = i;
}
double totalBandwidthGpu = 0.0;
double minExecutorBandwidth = std::numeric_limits<double>::max();
double maxExecutorBandwidth = 0.0;
std::vector<double> colTotalBandwidth(numGpus+1, 0.0);
for (int src = 0; src < numGpus; src++)
{
double rowTotalBandwidth = 0;
double executorBandwidth = 0;
printf("GPU %02d", src);
for (int dst = 0; dst < numGpus; dst++)
{
if (reIndex.count(std::make_pair(src, dst)))
{
Transfer const& transfer = transfers[reIndex[std::make_pair(src,dst)]];
colTotalBandwidth[dst] += transfer.transferBandwidth;
rowTotalBandwidth += transfer.transferBandwidth;
totalBandwidthGpu += transfer.transferBandwidth;
executorBandwidth = std::max(executorBandwidth, transfer.executorBandwidth);
printf("%c%8.3f ", separator, transfer.transferBandwidth);
}
else
{
printf("%c%8s ", separator, "N/A");
}
}
printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator, executorBandwidth);
minExecutorBandwidth = std::min(minExecutorBandwidth, executorBandwidth);
maxExecutorBandwidth = std::max(maxExecutorBandwidth, executorBandwidth);
colTotalBandwidth[numGpus] += rowTotalBandwidth;
}
printf("\nRTotal");
for (int dst = 0; dst < numGpus; dst++)
{
printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
}
printf(" %c%8.3f %c%8.3f %c%8.3f\n", separator, colTotalBandwidth[numGpus],
separator, minExecutorBandwidth, separator, maxExecutorBandwidth);
printf("\n");
printf("Average bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu / transfers.size());
printf("Aggregate bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu);
printf("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", totalBandwidthCpu);
}
void Transfer::PrepareSubExecParams(EnvVars const& ev)
{
// Each subExecutor needs to know src/dst pointers and how many elements to transfer
// Figure out the sub-array each subExecutor works on for this Transfer
// - Partition N as evenly as possible, but try to keep subarray sizes as multiples of BLOCK_BYTES bytes,
// except the very last one, for alignment reasons
size_t const N = this->numBytesActual / sizeof(float);
int const initOffset = ev.byteOffset / sizeof(float);
int const targetMultiple = ev.blockBytes / sizeof(float);
// In some cases, there may not be enough data for all subExecutors
int const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple, (size_t)this->numSubExecs);
this->subExecParam.clear();
this->subExecParam.resize(this->numSubExecs);
size_t assigned = 0;
for (int i = 0; i < this->numSubExecs; ++i)
{
SubExecParam& p = this->subExecParam[i];
p.numSrcs = this->numSrcs;
p.numDsts = this->numDsts;
if (ev.gfxSingleTeam && this->exeType == EXE_GPU_GFX)
{
p.N = N;
p.teamSize = this->numSubExecs;
p.teamIdx = i;
for (int iSrc = 0; iSrc < this->numSrcs; ++iSrc) p.src[iSrc] = this->srcMem[iSrc] + initOffset;
for (int iDst = 0; iDst < this->numDsts; ++iDst) p.dst[iDst] = this->dstMem[iDst] + initOffset;
}
else
{
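// Divide the remaining elements among the remaining subExecutors, rounding each chunk to a
// multiple of targetMultiple; the final subExecutor absorbs any leftover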
int const subExecLeft = std::max(0, maxSubExecToUse - i);
size_t const leftover = N - assigned;
size_t const roundedN = (leftover + targetMultiple - 1) / targetMultiple;
p.N = subExecLeft ? std::min(leftover, ((roundedN / subExecLeft) * targetMultiple)) : 0;
p.teamSize = 1;
p.teamIdx = 0;
for (int iSrc = 0; iSrc < this->numSrcs; ++iSrc) p.src[iSrc] = this->srcMem[iSrc] + initOffset + assigned;
for (int iDst = 0; iDst < this->numDsts; ++iDst) p.dst[iDst] = this->dstMem[iDst] + initOffset + assigned;
assigned += p.N;
}
p.preferredXccId = -1;
if (ev.useXccFilter && this->exeType == EXE_GPU_GFX)
{
std::uniform_int_distribution<int> distribution(0, ev.xccIdsPerDevice[this->exeIndex].size() - 1);
// Use this transfer's executor subIndex if set
if (this->exeSubIndex != -1)
{
p.preferredXccId = this->exeSubIndex;
}
else if (this->numDsts >= 1 && IsGpuType(this->dstType[0]))
{
p.preferredXccId = ev.prefXccTable[this->exeIndex][this->dstIndex[0]];
}
if (p.preferredXccId == -1)
{
p.preferredXccId = distribution(*ev.generator);
}
}
if (ev.enableDebug)
{
printf("Transfer %02d SE:%02d: %10lu floats: %10lu to %10lu\n",
this->transferIndex, i, p.N, assigned, assigned + p.N);
}
p.startCycle = 0;
p.stopCycle = 0;
}
this->transferTime = 0.0;
this->perIterationTime.clear();
}
void Transfer::PrepareReference(EnvVars const& ev, std::vector<float>& buffer, int bufferIdx)
{
size_t N = buffer.size();
if (bufferIdx >= 0)
{
size_t patternLen = ev.fillPattern.size();
if (patternLen > 0)
{
for (size_t i = 0; i < N; ++i)
buffer[i] = ev.fillPattern[i % patternLen];
}
else
{
for (size_t i = 0; i < N; ++i)
buffer[i] = PrepSrcValue(bufferIdx, i);
}
}
else // Destination buffer
{
if (this->numSrcs == 0)
{
// Note: MEMSET_CHAR is 75 (0x4B), so memset fills each float with bit pattern 0x4B4B4B4B = 13323083.0f
memset(buffer.data(), MEMSET_CHAR, N * sizeof(float));
}
else
{
PrepareReference(ev, buffer, 0);
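// The expected destination contents are the elementwise sum of all source reference patterns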
if (this->numSrcs > 1)
{
std::vector<float> temp(N);
for (int srcIdx = 1; srcIdx < this->numSrcs; ++srcIdx)
{
PrepareReference(ev, temp, srcIdx);
for (int i = 0; i < N; ++i)
{
buffer[i] += temp[i];
}
}
}
}
}
}
bool Transfer::PrepareSrc(EnvVars const& ev)
{
if (this->numSrcs == 0) return true;
size_t const N = this->numBytesActual / sizeof(float);
int const initOffset = ev.byteOffset / sizeof(float);
std::vector<float> reference(N);
for (int srcIdx = 0; srcIdx < this->numSrcs; ++srcIdx)
{
float* srcPtr = this->srcMem[srcIdx] + initOffset;
PrepareReference(ev, reference, srcIdx);
// Initialize source memory array with reference pattern
if (IsGpuType(this->srcType[srcIdx]))
{
int const deviceIdx = RemappedIndex(this->srcIndex[srcIdx], false);
HIP_CALL(hipSetDevice(deviceIdx));
if (ev.usePrepSrcKernel)
PrepSrcDataKernel<<<32, ev.gfxBlockSize>>>(srcPtr, N, srcIdx);
else
HIP_CALL(hipMemcpy(srcPtr, reference.data(), this->numBytesActual, hipMemcpyDefault));
HIP_CALL(hipDeviceSynchronize());
}
else if (IsCpuType(this->srcType[srcIdx]))
{
memcpy(srcPtr, reference.data(), this->numBytesActual);
}
// Perform check just to make sure that data has been copied properly
float* srcCheckPtr = srcPtr;
std::vector<float> srcCopy(N);
if (IsGpuType(this->srcType[srcIdx]))
{
if (!ev.validateDirect)
{
HIP_CALL(hipMemcpy(srcCopy.data(), srcPtr, this->numBytesActual, hipMemcpyDefault));
HIP_CALL(hipDeviceSynchronize());
srcCheckPtr = srcCopy.data();
}
}
for (size_t i = 0; i < N; ++i)
{
if (reference[i] != srcCheckPtr[i])
{
printf("\n[ERROR] Unexpected mismatch at index %lu of source array %d:\n", i, srcIdx);
#if !defined(__NVCC__)
float const val = this->srcMem[srcIdx][initOffset + i];
printf("[ERROR] SRC %02d value: %10.5f [%08X] Direct: %10.5f [%08X]\n",
srcIdx, srcCheckPtr[i], *(unsigned int*)&srcCheckPtr[i], val, *(unsigned int*)&val);
#else
printf("[ERROR] SRC %02d value: %10.5f [%08X]\n", srcIdx, srcCheckPtr[i], *(unsigned int*)&srcCheckPtr[i]);
#endif
printf("[ERROR] EXPECTED value: %10.5f [%08X]\n", reference[i], *(unsigned int*)&reference[i]);
printf("[ERROR] Failed Transfer details: #%d: %s -> [%c%d:%d] -> %s\n",
this->transferIndex,
this->SrcToStr().c_str(),
ExeTypeStr[this->exeType], this->exeIndex,
this->numSubExecs,
this->DstToStr().c_str());
printf("[ERROR] Possible cause is misconfigured IOMMU (AMD Instinct cards require amd_iommu=on and iommu=pt)\n");
printf("[ERROR] Please see https://community.amd.com/t5/knowledge-base/iommu-advisory-for-amd-instinct/ta-p/484601 for more details\n");
if (!ev.continueOnError)
exit(1);
return false;
}
}
}
return true;
}
void Transfer::ValidateDst(EnvVars const& ev)
{
if (this->numDsts == 0) return;
size_t const N = this->numBytesActual / sizeof(float);
int const initOffset = ev.byteOffset / sizeof(float);
std::vector<float> reference(N);
PrepareReference(ev, reference, -1);
std::vector<float> hostBuffer(N);
for (int dstIdx = 0; dstIdx < this->numDsts; ++dstIdx)
{
float* output;
if (IsCpuType(this->dstType[dstIdx]) || ev.validateDirect)
{
output = this->dstMem[dstIdx] + initOffset;
}
else
{
int const deviceIdx = RemappedIndex(this->dstIndex[dstIdx], false);
HIP_CALL(hipSetDevice(deviceIdx));
HIP_CALL(hipMemcpy(hostBuffer.data(), this->dstMem[dstIdx] + initOffset, this->numBytesActual, hipMemcpyDefault));
HIP_CALL(hipDeviceSynchronize());
output = hostBuffer.data();
}
for (size_t i = 0; i < N; ++i)
{
if (reference[i] != output[i])
{
printf("\n[ERROR] Unexpected mismatch at index %lu of destination array %d:\n", i, dstIdx);
for (int srcIdx = 0; srcIdx < this->numSrcs; ++srcIdx)
{
float srcVal;
HIP_CALL(hipMemcpy(&srcVal, this->srcMem[srcIdx] + initOffset + i, sizeof(float), hipMemcpyDefault));
#if !defined(__NVCC__)
float val = this->srcMem[srcIdx][initOffset + i];
printf("[ERROR] SRC %02dD value: %10.5f [%08X] Direct: %10.5f [%08X]\n",
srcIdx, srcVal, *(unsigned int*)&srcVal, val, *(unsigned int*)&val);
#else
printf("[ERROR] SRC %02d value: %10.5f [%08X]\n", srcIdx, srcVal, *(unsigned int*)&srcVal);
#endif
}
printf("[ERROR] EXPECTED value: %10.5f [%08X]\n", reference[i], *(unsigned int*)&reference[i]);
#if !defined(__NVCC__)
float dstVal = this->dstMem[dstIdx][initOffset + i];
printf("[ERROR] DST %02d value: %10.5f [%08X] Direct: %10.5f [%08X]\n",
dstIdx, output[i], *(unsigned int*)&output[i], dstVal, *(unsigned int*)&dstVal);
#else
printf("[ERROR] DST %02d value: %10.5f [%08X]\n", dstIdx, output[i], *(unsigned int*)&output[i]);
#endif
printf("[ERROR] Failed Transfer details: #%d: %s -> [%c%d:%d] -> %s\n",
this->transferIndex,
this->SrcToStr().c_str(),
ExeTypeStr[this->exeType], this->exeIndex,
this->numSubExecs,
this->DstToStr().c_str());
if (!ev.continueOnError)
exit(1);
else
break;
}
}
}
}
std::string Transfer::SrcToStr() const
{
if (numSrcs == 0) return "N";
std::stringstream ss;
for (int i = 0; i < numSrcs; ++i)
ss << MemTypeStr[srcType[i]] << srcIndex[i];
return ss.str();
}
std::string Transfer::DstToStr() const
{
if (numDsts == 0) return "N";
std::stringstream ss;
for (int i = 0; i < numDsts; ++i)
ss << MemTypeStr[dstType[i]] << dstIndex[i];
return ss.str();
}
void RunSchmooBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs)
{
char memType = ev.useFineGrain ? 'F' : 'G';
printf("Bytes to transfer: %lu Local GPU: %d Remote GPU: %d\n", numBytesPerTransfer, localIdx, remoteIdx);
printf(" | Local Read | Local Write | Local Copy | Remote Read | Remote Write| Remote Copy |\n");
printf(" #CUs |%c%02d->G%02d->N00|N00->G%02d->%c%02d|%c%02d->G%02d->%c%02d|%c%02d->G%02d->N00|N00->G%02d->%c%02d|%c%02d->G%02d->%c%02d|\n",
memType, localIdx, localIdx,
localIdx, memType, localIdx,
memType, localIdx, localIdx, memType, localIdx,
memType, remoteIdx, localIdx,
localIdx, memType, remoteIdx,
memType, localIdx, localIdx, memType, remoteIdx);
printf("|------|-------------|-------------|-------------|-------------|-------------|-------------|\n");
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeType = EXE_GPU_GFX;
t.exeIndex = localIdx;
t.exeSubIndex = -1;
t.numBytes = numBytesPerTransfer;
for (int numCUs = 1; numCUs <= maxSubExecs; numCUs++)
{
t.numSubExecs = numCUs;
// Local Read
t.numSrcs = 1;
t.numDsts = 0;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex[0] = localIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const localRead = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
// Local Write
t.numSrcs = 0;
t.numDsts = 1;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex[0] = localIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const localWrite = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
// Local Copy
t.numSrcs = 1;
t.numDsts = 1;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex[0] = localIdx;
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex[0] = localIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const localCopy = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
// Remote Read
t.numSrcs = 1;
t.numDsts = 0;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex[0] = remoteIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const remoteRead = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
// Remote Write
t.numSrcs = 0;
t.numDsts = 1;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex[0] = remoteIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const remoteWrite = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
// Remote Copy
t.numSrcs = 1;
t.numDsts = 1;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex[0] = localIdx;
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex[0] = remoteIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const remoteCopy = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
printf(" %3d %11.3f %11.3f %11.3f %11.3f %11.3f %11.3f \n",
numCUs, localRead, localWrite, localCopy, remoteRead, remoteWrite, remoteCopy);
}
}
void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
{
printf("Bytes to %s: %lu from GPU %d using %d CUs [Sweeping %d to %d parallel writes]\n",
ev.useRemoteRead ? "read" : "write", numBytesPerTransfer, srcIdx, numSubExecs, minGpus, maxGpus);
char sep = (ev.outputToCsv ? ',' : ' ');
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (i == srcIdx) continue;
printf(" GPU %-3d %c", i, sep);
}
printf("\n");
if (!ev.outputToCsv)
{
for (int i = 0; i < ev.numGpuDevices-1; i++)
{
printf("-------------");
}
printf("\n");
}
for (int p = minGpus; p <= maxGpus; p++)
{
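// Enumerate every subset of GPUs (excluding the source GPU) with exactly p members;
// each set bit in the bitmask selects one participating peer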
for (int bitmask = 0; bitmask < (1<<ev.numGpuDevices); bitmask++)
{
if (bitmask & (1<<srcIdx)) continue;
if (__builtin_popcount(bitmask) == p)
{
std::vector<Transfer> transfers;
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (bitmask & (1<<i))
{
Transfer t;
t.exeType = EXE_GPU_GFX;
t.exeSubIndex = -1;
t.numSubExecs = numSubExecs;
t.numBytes = numBytesPerTransfer;
if (ev.useRemoteRead)
{
t.numSrcs = 1;
t.numDsts = 0;
t.exeIndex = i;
t.srcType.resize(1);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex.resize(1);
t.srcIndex[0] = srcIdx;
}
else
{
t.numSrcs = 0;
t.numDsts = 1;
t.exeIndex = srcIdx;
t.dstType.resize(1);
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex.resize(1);
t.dstIndex[0] = i;
}
transfers.push_back(t);
}
}
ExecuteTransfers(ev, 0, 0, transfers, false);
int counter = 0;
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (bitmask & (1<<i))
printf(" %8.3f %c", transfers[counter++].transferBandwidth, sep);
else if (i != srcIdx)
printf(" %c", sep);
}
printf(" %d %d", p, numSubExecs);
for (auto i = 0; i < transfers.size(); i++)
{
printf(" (%s %c%d %s)",
transfers[i].SrcToStr().c_str(),
ExeTypeStr[transfers[i].exeType], transfers[i].exeIndex,
transfers[i].DstToStr().c_str());
}
printf("\n");
}
}
}
}
void RunParallelCopyBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
{
if (ev.useDmaCopy)
printf("Bytes to copy: %lu from GPU %d using DMA [Sweeping %d to %d parallel writes]\n",
numBytesPerTransfer, srcIdx, minGpus, maxGpus);
else
printf("Bytes to copy: %lu from GPU %d using GFX (%d CUs) [Sweeping %d to %d parallel writes]\n",
numBytesPerTransfer, srcIdx, numSubExecs, minGpus, maxGpus);
char sep = (ev.outputToCsv ? ',' : ' ');
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (i == srcIdx) continue;
printf(" GPU %-3d %c", i, sep);
}
printf("\n");
if (!ev.outputToCsv)
{
for (int i = 0; i < ev.numGpuDevices-1; i++)
{
printf("-------------");
}
printf("\n");
}
for (int p = minGpus; p <= maxGpus; p++)
{
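// Sweep every subset of destination GPUs (excluding the source GPU) that contains exactly p members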
for (int bitmask = 0; bitmask < (1<<ev.numGpuDevices); bitmask++)
{
if (bitmask & (1<<srcIdx)) continue;
if (__builtin_popcount(bitmask) == p)
{
std::vector<Transfer> transfers;
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (bitmask & (1<<i))
{
Transfer t;
t.exeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
t.exeSubIndex = -1;
t.numSubExecs = ev.useDmaCopy ? 1 : numSubExecs;
t.numBytes = numBytesPerTransfer;
t.numSrcs = 1;
t.numDsts = 1;
t.exeIndex = srcIdx;
t.srcType.resize(1);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex.resize(1);
t.srcIndex[0] = srcIdx;
t.dstType.resize(1);
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex.resize(1);
t.dstIndex[0] = i;
transfers.push_back(t);
}
}
ExecuteTransfers(ev, 0, 0, transfers, false);
int counter = 0;
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (bitmask & (1<<i))
printf(" %8.3f %c", transfers[counter++].transferBandwidth, sep);
else if (i != srcIdx)
printf(" %c", sep);
}
printf(" %d %d", p, numSubExecs);
for (auto i = 0; i < transfers.size(); i++)
{
printf(" (%s %c%d %s)",
transfers[i].SrcToStr().c_str(),
ExeTypeStr[transfers[i].exeType], transfers[i].exeIndex,
transfers[i].DstToStr().c_str());
}
printf("\n");
}
}
}
}
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExecs, int const numCpuSubExecs, bool const isRandom)
{
ev.DisplaySweepEnvVars();
// Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets)
std::vector<std::pair<ExeType, int>> exeList;
for (auto exe : ev.sweepExe)
{
ExeType const exeType = CharToExeType(exe);
if (IsGpuType(exeType))
{
for (int exeIndex = 0; exeIndex < ev.numGpuDevices; ++exeIndex)
exeList.push_back(std::make_pair(exeType, exeIndex));
}
else if (IsCpuType(exeType))
{
for (int exeIndex = 0; exeIndex < ev.numCpuDevices; ++exeIndex)
{
// Skip NUMA nodes that have no CPUs (e.g. CXL)
if (ev.numCpusPerNuma[exeIndex] == 0) continue;
exeList.push_back(std::make_pair(exeType, exeIndex));
}
}
}
int numExes = exeList.size();
std::vector<std::pair<MemType, int>> srcList;
for (auto src : ev.sweepSrc)
{
MemType const srcType = CharToMemType(src);
int const numDevices = IsGpuType(srcType) ? ev.numGpuDevices : ev.numCpuDevices;
for (int srcIndex = 0; srcIndex < numDevices; ++srcIndex)
srcList.push_back(std::make_pair(srcType, srcIndex));
}
int numSrcs = srcList.size();
std::vector<std::pair<MemType, int>> dstList;
for (auto dst : ev.sweepDst)
{
MemType const dstType = CharToMemType(dst);
int const numDevices = IsGpuType(dstType) ? ev.numGpuDevices : ev.numCpuDevices;
for (int dstIndex = 0; dstIndex < numDevices; ++dstIndex)
dstList.push_back(std::make_pair(dstType, dstIndex));
}
int numDsts = dstList.size();
// Build array of possibilities, respecting any additional restrictions (e.g. XGMI hop count)
struct TransferInfo
{
MemType srcType; int srcIndex;
ExeType exeType; int exeIndex;
MemType dstType; int dstIndex;
};
// If the XGMI minimum is non-zero, or an XGMI maximum is specified and non-zero, then both links must be XGMI
bool const useXgmiOnly = (ev.sweepXgmiMin > 0 || ev.sweepXgmiMax > 0);
std::vector<TransferInfo> possibleTransfers;
TransferInfo tinfo;
for (int i = 0; i < numExes; ++i)
{
// Skip CPU executors if XGMI link must be used
if (useXgmiOnly && !IsGpuType(exeList[i].first)) continue;
tinfo.exeType = exeList[i].first;
tinfo.exeIndex = exeList[i].second;
bool isXgmiSrc = false;
int numHopsSrc = 0;
for (int j = 0; j < numSrcs; ++j)
{
if (IsGpuType(exeList[i].first) && IsGpuType(srcList[j].first))
{
if (exeList[i].second != srcList[j].second)
{
#if defined(__NVCC__)
isXgmiSrc = false;
#else
uint32_t exeToSrcLinkType, exeToSrcHopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false),
RemappedIndex(srcList[j].second, false),
&exeToSrcLinkType,
&exeToSrcHopCount));
isXgmiSrc = (exeToSrcLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
if (isXgmiSrc) numHopsSrc = exeToSrcHopCount;
#endif
}
else
{
isXgmiSrc = true;
numHopsSrc = 0;
}
// Skip this SRC if it is not XGMI but only XGMI links may be used
if (useXgmiOnly && !isXgmiSrc) continue;
// Skip this SRC if XGMI distance is already past limit
if (ev.sweepXgmiMax >= 0 && isXgmiSrc && numHopsSrc > ev.sweepXgmiMax) continue;
}
else if (useXgmiOnly) continue;
tinfo.srcType = srcList[j].first;
tinfo.srcIndex = srcList[j].second;
bool isXgmiDst = false;
int numHopsDst = 0;
for (int k = 0; k < numDsts; ++k)
{
if (IsGpuType(exeList[i].first) && IsGpuType(dstList[k].first))
{
if (exeList[i].second != dstList[k].second)
{
#if defined(__NVCC__)
isXgmiDst = false;
#else
uint32_t exeToDstLinkType, exeToDstHopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false),
RemappedIndex(dstList[k].second, false),
&exeToDstLinkType,
&exeToDstHopCount));
isXgmiDst = (exeToDstLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
if (isXgmiDst) numHopsDst = exeToDstHopCount;
#endif
}
else
{
isXgmiDst = true;
numHopsDst = 0;
}
}
// Skip this DST if it is not XGMI but only XGMI links may be used
if (useXgmiOnly && !isXgmiDst) continue;
// Skip this DST if total XGMI distance (SRC + DST) is less than min limit
if (ev.sweepXgmiMin > 0 && (numHopsSrc + numHopsDst < ev.sweepXgmiMin)) continue;
// Skip this DST if total XGMI distance (SRC + DST) is greater than max limit
if (ev.sweepXgmiMax >= 0 && (numHopsSrc + numHopsDst) > ev.sweepXgmiMax) continue;
#if defined(__NVCC__)
// Skip CPU executors accessing GPU memory on NVIDIA platforms
if (IsCpuType(exeList[i].first) && (IsGpuType(srcList[j].first) || IsGpuType(dstList[k].first)))
continue;
#endif
tinfo.dstType = dstList[k].first;
tinfo.dstIndex = dstList[k].second;
possibleTransfers.push_back(tinfo);
}
}
}
int const numPossible = (int)possibleTransfers.size();
int maxParallelTransfers = (ev.sweepMax == 0 ? numPossible : ev.sweepMax);
if (ev.sweepMin > numPossible)
{
printf("No valid test configurations exist\n");
return;
}
if (ev.outputToCsv)
{
printf("\nTest#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),"
"ExeToSrcLinkType,ExeToDstLinkType,SrcAddr,DstAddr\n");
}
int numTestsRun = 0;
int M = ev.sweepMin;
std::uniform_int_distribution<int> randSize(1, numBytesPerTransfer / sizeof(float));
std::uniform_int_distribution<int> distribution(ev.sweepMin, maxParallelTransfers);
// Log sweep to configuration file
FILE *fp = fopen("lastSweep.cfg", "w");
if (!fp)
{
printf("[ERROR] Unable to open lastSweep.cfg. Check permissions\n");
exit(1);
}
// Create bitmask of numPossible triplets, of which M will be chosen
std::string bitmask(M, 1); bitmask.resize(numPossible, 0);
auto cpuStart = std::chrono::high_resolution_clock::now();
while (1)
{
if (isRandom)
{
// Pick random number of simultaneous transfers to execute
// NOTE: This currently skews distribution due to some #s having more possibilities than others
M = distribution(*ev.generator);
// Generate a random bitmask
for (int i = 0; i < numPossible; i++)
bitmask[i] = (i < M) ? 1 : 0;
std::shuffle(bitmask.begin(), bitmask.end(), *ev.generator);
}
// Convert bitmask to list of Transfers
std::vector<Transfer> transfers;
for (int value = 0; value < numPossible; ++value)
{
if (bitmask[value])
{
// Convert integer value to (SRC->EXE->DST) triplet
Transfer transfer;
transfer.numSrcs = 1;
transfer.numDsts = 1;
transfer.srcType = {possibleTransfers[value].srcType};
transfer.srcIndex = {possibleTransfers[value].srcIndex};
transfer.exeType = possibleTransfers[value].exeType;
transfer.exeIndex = possibleTransfers[value].exeIndex;
transfer.exeSubIndex = -1;
transfer.dstType = {possibleTransfers[value].dstType};
transfer.dstIndex = {possibleTransfers[value].dstIndex};
transfer.numSubExecs = IsGpuType(transfer.exeType) ? numGpuSubExecs : numCpuSubExecs;
transfer.numBytes = ev.sweepRandBytes ? randSize(*ev.generator) * sizeof(float) : 0;
transfers.push_back(transfer);
}
}
LogTransfers(fp, ++numTestsRun, transfers);
ExecuteTransfers(ev, numTestsRun, numBytesPerTransfer / sizeof(float), transfers);
// Check for test limit
if (numTestsRun == ev.sweepTestLimit)
{
printf("Test limit reached\n");
break;
}
// Check for time limit
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double totalCpuTime = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
if (ev.sweepTimeLimit && totalCpuTime > ev.sweepTimeLimit)
{
printf("Time limit exceeded\n");
break;
}
// Increment bitmask if not random sweep
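// std::prev_permutation over the descending (1...1,0...0) bitmask visits every placement of M ones,
// i.e. every combination of M parallel Transfers; once exhausted, grow M by one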
if (!isRandom && !std::prev_permutation(bitmask.begin(), bitmask.end()))
{
M++;
// Check for completion
if (M > maxParallelTransfers)
{
printf("Sweep complete\n");
break;
}
for (int i = 0; i < numPossible; i++)
bitmask[i] = (i < M) ? 1 : 0;
}
}
fclose(fp);
}
void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers)
{
fprintf(fp, "# Test %d\n", testNum);
fprintf(fp, "%d", -1 * (int)transfers.size());
for (auto const& transfer : transfers)
{
fprintf(fp, " (%c%d->%c%d->%c%d %d %lu)",
MemTypeStr[transfer.srcType[0]], transfer.srcIndex[0],
ExeTypeStr[transfer.exeType], transfer.exeIndex,
MemTypeStr[transfer.dstType[0]], transfer.dstIndex[0],
transfer.numSubExecs,
transfer.numBytes);
}
fprintf(fp, "\n");
fflush(fp);
}
std::string PtrVectorToStr(std::vector<float*> const& strVector, int const initOffset)
{
std::stringstream ss;
for (int i = 0; i < strVector.size(); ++i)
{
if (i) ss << " ";
ss << (strVector[i] + initOffset);
}
return ss.str();
}
void ReportResults(EnvVars const& ev, std::vector<Transfer> const& transfers, TestResults const results)
{
char sep = ev.outputToCsv ? ',' : '|';
size_t numTimedIterations = results.numTimedIterations;
// Loop over each executor
for (auto exeInfoPair : results.exeResults) {
ExeResult const& exeResult = exeInfoPair.second;
ExeType exeType = exeInfoPair.first.first;
int exeIndex = exeInfoPair.first.second;
printf(" Executor: %3s %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %-7.3f GB/s (sum)\n",
ExeTypeName[exeType], exeIndex, sep, exeResult.bandwidthGbs, sep,
exeResult.durationMsec, sep, exeResult.totalBytes, sep, exeResult.sumBandwidthGbs);
for (int idx : exeResult.transferIdx) {
Transfer const& t = transfers[idx];
char exeSubIndexStr[32] = "";
if (ev.useXccFilter || t.exeType == EXE_GPU_DMA) {
if (t.exeSubIndex == -1)
sprintf(exeSubIndexStr, ".*");
else
sprintf(exeSubIndexStr, ".%d", t.exeSubIndex);
}
printf(" Transfer %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %s -> %s%02d%s:%03d -> %s\n",
t.transferIndex, sep,
t.transferBandwidth, sep,
t.transferTime, sep,
t.numBytesActual, sep,
t.SrcToStr().c_str(),
ExeTypeName[t.exeType], t.exeIndex,
exeSubIndexStr,
t.numSubExecs,
t.DstToStr().c_str());
// Show per-iteration timing information
if (ev.showIterations) {
std::set<std::pair<double, int>> times;
double stdDevTime = 0;
double stdDevBw = 0;
for (int i = 0; i < numTimedIterations; i++) {
times.insert(std::make_pair(t.perIterationTime[i], i+1));
double const varTime = fabs(t.transferTime - t.perIterationTime[i]);
stdDevTime += varTime * varTime;
double iterBandwidthGbs = (t.numBytesActual / 1.0E9) / t.perIterationTime[i] * 1000.0f;
double const varBw = fabs(iterBandwidthGbs - t.transferBandwidth);
stdDevBw += varBw * varBw;
}
stdDevTime = sqrt(stdDevTime / numTimedIterations);
stdDevBw = sqrt(stdDevBw / numTimedIterations);
for (auto time : times) {
double iterDurationMsec = time.first;
double iterBandwidthGbs = (t.numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d %c %7.3f GB/s %c %8.3f ms %c", time.second, sep, iterBandwidthGbs, sep, iterDurationMsec, sep);
std::set<int> usedXccs;
if (time.second - 1 < t.perIterationCUs.size()) {
printf(" CUs:");
for (auto x : t.perIterationCUs[time.second - 1]) {
printf(" %02d:%02d", x.first, x.second);
usedXccs.insert(x.first);
}
}
printf(" XCCs:");
for (auto x : usedXccs)
printf(" %02d", x);
printf("\n");
}
printf(" StandardDev %c %7.3f GB/s %c %8.3f ms %c\n", sep, stdDevBw, sep, stdDevTime, sep);
}
}
}
printf(" Aggregate (CPU) %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c Overhead: %.3f ms\n", sep,
results.totalBandwidthCpu, sep, results.totalDurationMsec, sep, results.totalBytesTransferred, sep, results.overheadMsec);
}
void RunHealthCheck(EnvVars ev)
{
// Check for supported platforms
#if defined(__NVCC__)
printf("[WARN] healthcheck preset not supported on NVIDIA hardware\n");
return;
#else
bool hasFail = false;
// Force use of single stream
ev.useSingleStream = 1;
for (int gpuId = 0; gpuId < ev.numGpuDevices; gpuId++) {
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, gpuId));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
if (!(archName == "gfx940" || archName == "gfx941" || archName == "gfx942"))
{
printf("[WARN] healthcheck preset is currently only supported on MI300 series hardware\n");
exit(1);
}
}
// Pass limits
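// Defaults are 95% of the nominal targets (48/96/45 GB/s for unidirectional/bidirectional/all-to-all),
// truncated to whole GB/s; override via the LIMIT_UDIR, LIMIT_BDIR and LIMIT_A2A environment variables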
double udirLimit = getenv("LIMIT_UDIR") ? atof(getenv("LIMIT_UDIR")) : (int)(48 * 0.95);
double bdirLimit = getenv("LIMIT_BDIR") ? atof(getenv("LIMIT_BDIR")) : (int)(96 * 0.95);
double a2aLimit = getenv("LIMIT_A2A") ? atof(getenv("LIMIT_A2A")) : (int)(45 * 0.95);
// Run CPU to GPU
// Run unidirectional read from CPU to GPU
printf("Testing unidirectional reads from CPU ");
{
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < ev.numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeType = EXE_GPU_GFX;
t.exeIndex = gpuId;
t.numBytes = 64*1024*1024;
t.numBytesActual = 64*1024*1024;
t.numSrcs = 1;
t.srcType.push_back(MEM_CPU);
t.srcIndex.push_back(GetClosestNumaNode(gpuId));
t.numDsts = 0;
t.dstType.clear();
t.dstIndex.clear();
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t.numSubExecs = cu;
TestResults testResults = ExecuteTransfersImpl(ev, transfers);
bestResult = std::max(bestResult, t.transferBandwidth);
if (t.transferBandwidth >= udirLimit) {
passed = true;
break;
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
}
}
}
// Run unidirectional write from GPU to CPU
printf("Testing unidirectional writes to CPU ");
{
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < ev.numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeType = EXE_GPU_GFX;
t.exeIndex = gpuId;
t.numBytes = 64*1024*1024;
t.numBytesActual = 64*1024*1024;
t.numDsts = 1;
t.dstType.push_back(MEM_CPU);
t.dstIndex.push_back(GetClosestNumaNode(gpuId));
t.numSrcs = 0;
t.srcType.clear();
t.srcIndex.clear();
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t.numSubExecs = cu;
TestResults testResults = ExecuteTransfersImpl(ev, transfers);
bestResult = std::max(bestResult, t.transferBandwidth);
if (t.transferBandwidth >= udirLimit) {
passed = true;
break;
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
}
}
}
// Run bidirectional tests
printf("Testing bidirectional reads + writes ");
{
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < ev.numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
std::vector<Transfer> transfers(2);
Transfer& t0 = transfers[0];
Transfer& t1 = transfers[1];
t0.exeType = EXE_GPU_GFX;
t0.exeIndex = gpuId;
t0.numBytes = 64*1024*1024;
t0.numBytesActual = 64*1024*1024;
t0.numSrcs = 1;
t0.srcType.push_back(MEM_CPU);
t0.srcIndex.push_back(GetClosestNumaNode(gpuId));
t0.numDsts = 0;
t0.dstType.clear();
t0.dstIndex.clear();
t1.exeType = EXE_GPU_GFX;
t1.exeIndex = gpuId;
t1.numBytes = 64*1024*1024;
t1.numBytesActual = 64*1024*1024;
t1.numDsts = 1;
t1.dstType.push_back(MEM_CPU);
t1.dstIndex.push_back(GetClosestNumaNode(gpuId));
t1.numSrcs = 0;
t1.srcType.clear();
t1.srcIndex.clear();
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t0.numSubExecs = cu;
t1.numSubExecs = cu;
TestResults testResults = ExecuteTransfersImpl(ev, transfers);
double sum = t0.transferBandwidth + t1.transferBandwidth;
bestResult = std::max(bestResult, sum);
if (sum >= bdirLimit) {
passed = true;
break;
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, bdirLimit);
}
}
}
// Run XGMI tests:
printf("Testing all-to-all XGMI copies "); fflush(stdout);
{
ev.gfxUnroll = 2;
std::vector<Transfer> transfers;
for (int i = 0; i < ev.numGpuDevices; i++) {
for (int j = 0; j < ev.numGpuDevices; j++) {
if (i == j) continue;
Transfer t;
t.exeType = EXE_GPU_GFX;
t.exeIndex = i;
t.numBytes = t.numBytesActual = 64*1024*1024;
t.numSrcs = 1;
t.numDsts = 1;
t.numSubExecs = 8;
t.srcType.push_back(MEM_GPU_FINE);
t.dstType.push_back(MEM_GPU_FINE);
t.srcIndex.push_back(i);
t.dstIndex.push_back(j);
transfers.push_back(t);
}
}
TestResults testResults = ExecuteTransfersImpl(ev, transfers);
std::vector<std::pair<std::pair<int,int>, double>> fails;
int transferIdx = 0;
for (int i = 0; i < ev.numGpuDevices; i++) {
printf("."); fflush(stdout);
for (int j = 0; j < ev.numGpuDevices; j++) {
if (i == j) continue;
Transfer const& t = transfers[transferIdx];
if (t.transferBandwidth < a2aLimit) {
fails.push_back(std::make_pair(std::make_pair(i,j), t.transferBandwidth));
}
transferIdx++;
}
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d to GPU %02d: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first.first, p.first.second, p.second, a2aLimit);
}
}
}
exit(hasFail ? 1 : 0);
#endif
}
/*
Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "Client.hpp"
#include "Presets.hpp"
#include "Topology.hpp"
#include <fstream>
int main(int argc, char **argv) {
// Collect environment variables
EnvVars ev;
// Display usage instructions and detected topology
if (argc <= 1) {
if (!ev.outputToCsv) {
DisplayUsage(argv[0]);
DisplayPresets();
}
DisplayTopology(ev.outputToCsv);
exit(0);
}
// Determine number of bytes to run per Transfer
size_t numBytesPerTransfer = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_TRANSFER;
if (argc > 2) {
// Adjust bytes if unit specified
char units = argv[2][strlen(argv[2])-1];
switch (units) {
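// Cases intentionally fall through: 'G' scales by 1024^3, 'M' by 1024^2, 'K' by 1024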
case 'G': case 'g': numBytesPerTransfer *= 1024;
case 'M': case 'm': numBytesPerTransfer *= 1024;
case 'K': case 'k': numBytesPerTransfer *= 1024;
}
}
if (numBytesPerTransfer % 4) {
printf("[ERROR] numBytesPerTransfer (%lu) must be a multiple of 4\n", numBytesPerTransfer);
exit(1);
}
// Run preset benchmark if requested
if (RunPreset(ev, numBytesPerTransfer, argc, argv)) exit(0);
// Read input from command line or configuration file
std::vector<std::string> lines;
{
std::string line;
if (!strcmp(argv[1], "cmdline")) {
for (int i = 3; i < argc; i++)
line += std::string(argv[i]) + " ";
lines.push_back(line);
} else {
std::ifstream cfgFile(argv[1]);
if (!cfgFile.is_open()) {
printf("[ERROR] Unable to open transfer configuration file: [%s]\n", argv[1]);
exit(1);
}
while (std::getline(cfgFile, line))
lines.push_back(line);
cfgFile.close();
}
}
// Print environment variables and CSV header
ev.DisplayEnvVars();
if (ev.outputToCsv)
printf("Test#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),SrcAddr,DstAddr\n");
TransferBench::ConfigOptions cfgOptions = ev.ToConfigOptions();
TransferBench::TestResults results;
std::vector<ErrResult> errors;
// Process each line as a Test
int testNum = 0;
for (std::string const &line : lines) {
// Check if line is a comment to be echoed to output (starts with ##)
if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') printf("%s\n", line.c_str());
// Parse set of parallel Transfers to execute
std::vector<Transfer> transfers;
CheckForError(TransferBench::ParseTransfers(line, transfers));
if (transfers.empty()) continue;
// Check for variable sub-executors Transfers
int numVariableTransfers = 0;
int maxVarCount = 0;
{
std::map<ExeDevice, int> varTransferCount;
for (auto const& t : transfers) {
if (t.numSubExecs == 0) {
if (t.exeDevice.exeType != EXE_GPU_GFX) {
printf("[ERROR] Variable number of subexecutors is only supported on GFX executors\n");
exit(1);
}
numVariableTransfers++;
varTransferCount[t.exeDevice]++;
maxVarCount = std::max(maxVarCount, varTransferCount[t.exeDevice]);
}
}
if (numVariableTransfers > 0 && numVariableTransfers != transfers.size()) {
printf("[ERROR] All or none of the Transfers in the Test must use variable number of Subexecutors\n");
exit(1);
}
}
// Run the specified numbers of bytes otherwise generate a range of values
for (size_t bytes = (1<<10); bytes <= (1<<29); bytes *= 2) {
size_t deltaBytes = std::max(1UL, bytes / ev.samplingFactor);
size_t currBytes = (numBytesPerTransfer == 0) ? bytes : numBytesPerTransfer;
do {
for (auto& t : transfers)
t.numBytes = currBytes;
if (maxVarCount == 0) {
if (TransferBench::RunTransfers(cfgOptions, transfers, results)) {
PrintResults(ev, ++testNum, transfers, results);
}
PrintErrors(results.errResults);
} else {
// Variable subexecutors - Determine how many subexecutors to sweep up to
int maxNumVarSubExec = ev.maxNumVarSubExec;
if (maxNumVarSubExec == 0) {
maxNumVarSubExec = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0}) / maxVarCount;
}
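// Sweep the variable Transfers over [minNumVarSubExec, maxNumVarSubExec] subexecutors and
// keep the run with the highest aggregate bandwidth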
TransferBench::TestResults bestResults;
std::vector<Transfer> bestTransfers;
for (int numSubExecs = ev.minNumVarSubExec; numSubExecs <= maxNumVarSubExec; numSubExecs++) {
std::vector<Transfer> tempTransfers = transfers;
for (auto& t : tempTransfers) {
if (t.numSubExecs == 0) t.numSubExecs = numSubExecs;
}
TransferBench::TestResults tempResults;
if (!TransferBench::RunTransfers(cfgOptions, tempTransfers, tempResults)) {
PrintErrors(tempResults.errResults);
} else {
if (tempResults.avgTotalBandwidthGbPerSec > bestResults.avgTotalBandwidthGbPerSec) {
bestResults = tempResults;
bestTransfers = tempTransfers;
}
}
}
PrintResults(ev, ++testNum, bestTransfers, bestResults);
PrintErrors(bestResults.errResults);
}
if (numBytesPerTransfer != 0) break;
currBytes += deltaBytes;
} while (currBytes < bytes * 2);
if (numBytesPerTransfer != 0) break;
}
}
}
void DisplayUsage(char const* cmdName)
{
printf("TransferBench v%s.%s\n", TransferBench::VERSION, CLIENT_VERSION);
printf("========================================\n");
if (numa_available() == -1) {
printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
exit(1);
}
printf("Usage: %s config <N>\n", cmdName);
printf(" config: Either:\n");
printf(" - Filename of configFile containing Transfers to execute (see example.cfg for format)\n");
printf(" - Name of preset config:\n");
printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
DEFAULT_BYTES_PER_TRANSFER);
printf(" If 0 is specified, a range of Ns will be benchmarked\n");
printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n");
printf("\n");
EnvVars::DisplayUsage();
}
std::string MemDevicesToStr(std::vector<MemDevice> const& memDevices) {
if (memDevices.empty()) return "N";
std::stringstream ss;
for (auto const& m : memDevices)
ss << TransferBench::MemTypeStr[m.memType] << m.memIndex;
return ss.str();
}
void PrintResults(EnvVars const& ev, int const testNum,
std::vector<Transfer> const& transfers,
TransferBench::TestResults const& results)
{
char sep = ev.outputToCsv ? ',' : '|';
size_t numTimedIterations = results.numTimedIterations;
if (!ev.outputToCsv) printf("Test %d:\n", testNum);
// Loop over each executor
for (auto exeInfoPair : results.exeResults) {
ExeDevice const& exeDevice = exeInfoPair.first;
ExeResult const& exeResult = exeInfoPair.second;
ExeType const exeType = exeDevice.exeType;
int32_t const exeIndex = exeDevice.exeIndex;
printf(" Executor: %3s %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %-7.3f GB/s (sum)\n",
ExeTypeName[exeType], exeIndex, sep, exeResult.avgBandwidthGbPerSec, sep,
exeResult.avgDurationMsec, sep, exeResult.numBytes, sep, exeResult.sumBandwidthGbPerSec);
// Loop over each executor
for (int idx : exeResult.transferIdx) {
Transfer const& t = transfers[idx];
TransferResult const& r = results.tfrResults[idx];
char exeSubIndexStr[32] = "";
if (t.exeSubIndex != -1)
sprintf(exeSubIndexStr, ".%d", t.exeSubIndex);
printf(" Transfer %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %s -> %s%02d%s:%03d -> %s\n",
idx, sep,
r.avgBandwidthGbPerSec, sep,
r.avgDurationMsec, sep,
r.numBytes, sep,
MemDevicesToStr(t.srcs).c_str(), ExeTypeName[exeType], exeIndex,
exeSubIndexStr, t.numSubExecs, MemDevicesToStr(t.dsts).c_str());
// Show per-iteration timing information
if (ev.showIterations) {
// Check that per-iteration information exists
if (r.perIterMsec.size() != numTimedIterations) {
printf("[ERROR] Per iteration timing data unavailable: Expected %lu data points, but have %lu\n",
numTimedIterations, r.perIterMsec.size());
exit(1);
}
// Compute standard deviation and track iterations by speed
std::set<std::pair<double, int>> times;
double stdDevTime = 0;
double stdDevBw = 0;
for (int i = 0; i < numTimedIterations; i++) {
times.insert(std::make_pair(r.perIterMsec[i], i+1));
double const varTime = fabs(r.avgDurationMsec - r.perIterMsec[i]);
stdDevTime += varTime * varTime;
double iterBandwidthGbs = (t.numBytes / 1.0E9) / r.perIterMsec[i] * 1000.0f;
double const varBw = fabs(iterBandwidthGbs - r.avgBandwidthGbPerSec);
stdDevBw += varBw * varBw;
}
stdDevTime = sqrt(stdDevTime / numTimedIterations);
stdDevBw = sqrt(stdDevBw / numTimedIterations);
// Loop over iterations (fastest to slowest)
for (auto& time : times) {
double iterDurationMsec = time.first;
double iterBandwidthGbs = (t.numBytes / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d %c %7.3f GB/s %c %8.3f ms %c", time.second, sep, iterBandwidthGbs, sep, iterDurationMsec, sep);
std::set<int> usedXccs;
if (time.second - 1 < r.perIterCUs.size()) {
printf(" CUs:");
for (auto x : r.perIterCUs[time.second - 1]) {
printf(" %02d:%02d", x.first, x.second);
usedXccs.insert(x.first);
}
}
printf(" XCCs:");
for (auto x : usedXccs)
printf(" %02d", x);
printf("\n");
}
printf(" StandardDev %c %7.3f GB/s %c %8.3f ms %c\n", sep, stdDevBw, sep, stdDevTime, sep);
}
}
}
printf(" Aggregate (CPU) %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c Overhead: %.3f ms\n",
sep, results.avgTotalBandwidthGbPerSec,
sep, results.avgTotalDurationMsec,
sep, results.totalBytesTransferred,
sep, results.overheadMsec);
}
void CheckForError(ErrResult const& error)
{
switch (error.errType) {
case ERR_NONE: return;
case ERR_WARN:
printf("[WARN] %s\n", error.errMsg.c_str());
return;
case ERR_FATAL:
printf("[ERROR] %s\n", error.errMsg.c_str());
exit(1);
default:
break;
}
}
void PrintErrors(std::vector<ErrResult> const& errors)
{
bool isFatal = false;
for (auto const& err : errors) {
printf("[%s] %s\n", err.errType == ERR_FATAL ? "ERROR" : "WARN", err.errMsg.c_str());
isFatal |= (err.errType == ERR_FATAL);
}
if (isFatal) exit(1);
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
// TransferBench client version
#define CLIENT_VERSION "00"
#include "TransferBench.hpp"
#include "EnvVars.hpp"
size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<26);
char const ExeTypeName[4][4] = {"CPU", "GPU", "DMA", "IBV"};
// Display detected hardware
void DisplayTopology(bool outputToCsv);
// Display usage instructions
void DisplayUsage(char const* cmdName);
// Print TransferBench test results
void PrintResults(EnvVars const& ev, int const testNum,
std::vector<Transfer> const& transfers,
TransferBench::TestResults const& results);
// Helper function that converts MemDevices to a string
std::string MemDevicesToStr(std::vector<MemDevice> const& memDevices);
// Helper function to print warning / exit on fatal error
void CheckForError(ErrResult const& error);
// Helper function to print list of errors
void PrintErrors(std::vector<ErrResult> const& errors);
/*
Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef ENVVARS_HPP
#define ENVVARS_HPP
// Helper macro for catching HIP errors
#define HIP_CALL(cmd) \
do { \
hipError_t error = (cmd); \
if (error != hipSuccess) { \
std::cerr << "Encountered HIP error (" << hipGetErrorString(error) \
<< ") at line " << __LINE__ << " in file " << __FILE__ << "\n"; \
exit(-1); \
} \
} while (0)
#include <algorithm>
#include <iostream>
#include <numa.h>
#include <random>
#include <time.h>
#include "Client.hpp"
#include "TransferBench.hpp"
using namespace TransferBench;
// Redefinitions for CUDA compatibility
//==========================================================================================
#if defined(__NVCC__)
#define hipError_t cudaError_t
#define hipGetErrorString cudaGetErrorString
#define hipDeviceProp_t cudaDeviceProp
#define hipDeviceGetPCIBusId cudaDeviceGetPCIBusId
#define hipGetDeviceProperties cudaGetDeviceProperties
#define hipSuccess cudaSuccess
#define gcnArchName name
#define hipGetDeviceCount cudaGetDeviceCount
#endif
// This class manages the environment variables that affect TransferBench
class EnvVars
{
public:
// Default configuration values
int const DEFAULT_SAMPLING_FACTOR = 1;
// Environment variables
// General options
int numIterations; // Number of timed iterations to perform. If negative, run for -numIterations seconds instead
int numSubIterations; // Number of subiterations to perform
int numWarmups; // Number of un-timed warmup iterations to perform
int showIterations; // Show per-iteration timing info
int useInteractive; // Pause for user-input before starting transfer loop
// Data options
int alwaysValidate; // Validate after each iteration instead of once after all iterations
int blockBytes; // Each subexecutor, except the last, gets a multiple of this many bytes to copy
int byteOffset; // Byte-offset for memory allocations
vector<float> fillPattern; // Pattern of floats used to fill source data
int validateDirect; // Validate GPU destination memory directly instead of staging GPU memory on host
int validateSource; // Validate source GPU memory immediately after preparation
// DMA options
int useHsaDma; // Use hsa_amd_async_copy instead of hipMemcpy for non-targeted DMA executions
// GFX options
int gfxBlockSize; // Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask; // Bit-vector representing the CU mask
vector<vector<int>> prefXccTable; // Specifies XCC to use for given exe->dst pair
int gfxUnroll; // GFX-kernel unroll factor
int useHipEvents; // Use HIP events for timing GFX/DMA Executor
int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer
int gfxSingleTeam; // Team all subExecutors across the data array
int gfxWaveOrder; // GFX-kernel wavefront ordering
// Client options
int hideEnv; // Skip printing environment variable listing
int minNumVarSubExec; // Minimum # of subexecutors to use for variable subExec Transfers
int maxNumVarSubExec; // Maximum # of subexecutors to use for variable subExec Transfers (0 to use device limit)
int outputToCsv; // Output in CSV format
int samplingFactor; // Affects how many different values of N are generated (when N set to 0)
// Developer features
int gpuMaxHwQueues; // Tracks GPU_MAX_HW_QUEUES environment variable
// Constructor that collects values
EnvVars()
{
int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
int numDeviceCUs = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0});
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, 0));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
// Different hardware picks different default GPU kernel unroll factors
// The performance difference is generally only noticeable when executing with fewer CUs
int defaultGfxUnroll = 4;
if (archName == "gfx906") defaultGfxUnroll = 8;
else if (archName == "gfx90a") defaultGfxUnroll = 8;
else if (archName == "gfx940") defaultGfxUnroll = 6;
else if (archName == "gfx941") defaultGfxUnroll = 6;
else if (archName == "gfx942") defaultGfxUnroll = 4;
alwaysValidate = GetEnvVar("ALWAYS_VALIDATE" , 0);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256);
gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 0);
gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll);
gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER" , 0);
hideEnv = GetEnvVar("HIDE_ENV" , 0);
minNumVarSubExec = GetEnvVar("MIN_VAR_SUBEXEC" , 1);
maxNumVarSubExec = GetEnvVar("MAX_VAR_SUBEXEC" , 0);
numIterations = GetEnvVar("NUM_ITERATIONS" , 10);
numSubIterations = GetEnvVar("NUM_SUBITERATIONS" , 1);
numWarmups = GetEnvVar("NUM_WARMUPS" , 3);
outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0);
samplingFactor = GetEnvVar("SAMPLING_FACTOR" , 1);
showIterations = GetEnvVar("SHOW_ITERATIONS" , 0);
useHipEvents = GetEnvVar("USE_HIP_EVENTS" , 1);
useHsaDma = GetEnvVar("USE_HSA_DMA" , 0);
useInteractive = GetEnvVar("USE_INTERACTIVE" , 0);
useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 1);
validateDirect = GetEnvVar("VALIDATE_DIRECT" , 0);
validateSource = GetEnvVar("VALIDATE_SOURCE" , 0);
gpuMaxHwQueues = GetEnvVar("GPU_MAX_HW_QUEUES" , 4);
// Check for fill pattern
char* pattern = getenv("FILL_PATTERN");
if (pattern != NULL) {
int patternLen = strlen(pattern);
if (patternLen % 2) {
printf("[ERROR] FILL_PATTERN must contain an even-number of hex digits\n");
exit(1);
}
// Read in bytes
std::vector<unsigned char> bytes;
unsigned char val = 0;
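// Each pair of hex characters forms one byte: the first character supplies the high nibble, the second the low nibble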
for (int i = 0; i < patternLen; i++) {
if ('0' <= pattern[i] && pattern[i] <= '9')
val += (pattern[i] - '0');
else if ('A' <= pattern[i] && pattern[i] <= 'F')
val += (pattern[i] - 'A' + 10);
else if ('a' <= pattern[i] && pattern[i] <= 'f')
val += (pattern[i] - 'a' + 10);
else {
printf("[ERROR] FILL_PATTERN must contain an even-number of hex digits (0-9'/a-f/A-F). (not %c)\n", pattern[i]);
exit(1);
}
if (i % 2 == 0)
val <<= 4;
else {
bytes.push_back(val);
val = 0;
}
}
// Reverse bytes (input is assumed to be given in big-endian)
std::reverse(bytes.begin(), bytes.end());
// Figure out how many copies of the pattern are necessary to fill a 4-byte float properly
int copies;
switch (patternLen % 8) {
case 0: copies = 1; break;
case 4: copies = 2; break;
default: copies = 4; break;
}
// Fill floats
int numFloats = copies * patternLen / 8;
fillPattern.resize(numFloats);
unsigned char* rawData = (unsigned char*) fillPattern.data();
for (int i = 0; i < numFloats * 4; i++)
rawData[i] = bytes[i % bytes.size()];
}
else fillPattern.clear();
// Check for CU mask
int numXccs = TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, 0});
cuMask.clear();
char* cuMaskStr = getenv("CU_MASK");
if (cuMaskStr != NULL) {
#if defined(__NVCC__)
printf("[WARN] CU_MASK is not supported in CUDA\n");
#else
std::vector<std::pair<int, int>> ranges;
int maxCU = 0;
char* token = strtok(cuMaskStr, ",");
while (token) {
int start, end;
if (sscanf(token, "%d-%d", &start, &end) == 2) {
ranges.push_back(std::make_pair(std::min(start, end), std::max(start, end)));
maxCU = std::max(maxCU, std::max(start, end));
} else if (sscanf(token, "%d", &start) == 1) {
ranges.push_back(std::make_pair(start, start));
maxCU = std::max(maxCU, start);
} else {
printf("[ERROR] Unrecognized token [%s]\n", token);
exit(1);
}
token = strtok(NULL, ",");
}
cuMask.resize(2 * numXccs, 0);
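// Allocate two 32-bit words per XCC (room for 64 CUs), then set the bit for every XCC instance of each requested CU (bit index = cu * numXccs + xcc)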
for (auto range : ranges) {
for (int i = range.first; i <= range.second; i++) {
for (int x = 0; x < numXccs; x++) {
int targetBit = i * numXccs + x;
cuMask[targetBit/32] |= (1<<(targetBit%32));
}
}
}
#endif
}
// Parse preferred XCC table (if provided)
char* prefXccStr = getenv("XCC_PREF_TABLE");
if (prefXccStr) {
prefXccTable.resize(numDetectedGpus);
for (int i = 0; i < numDetectedGpus; i++){
prefXccTable[i].resize(numDetectedGpus, -1);
}
char* token = strtok(prefXccStr, ",");
int tokenCount = 0;
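// Tokens fill a (numDetectedGpus x numDetectedGpus) table in row-major order; entry [src][dst] is the preferred XCC for that executor->destination pair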
while (token) {
int xccId;
if (sscanf(token, "%d", &xccId) == 1) {
int src = tokenCount / numDetectedGpus;
int dst = tokenCount % numDetectedGpus;
if (xccId < 0 || xccId >= numXccs) {
printf("[ERROR] XCC index (%d) out of bounds. Expect value less than %d\n", xccId, numXccs);
exit(1);
}
prefXccTable[src][dst] = xccId;
tokenCount++;
if (tokenCount == (numDetectedGpus * numDetectedGpus)) break;
} else {
printf("[ERROR] Unrecognized token [%s]\n", token);
exit(1);
}
token = strtok(NULL, ",");
}
}
}
// Display info on the env vars that can be used
static void DisplayUsage()
{
printf("Environment variables:\n");
printf("======================\n");
printf(" ALWAYS_VALIDATE - Validate after each iteration instead of once after all iterations\n");
printf(" BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n");
printf(" BLOCK_BYTES - Controls granularity of how work is divided across subExecutors\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4\n");
printf(" CU_MASK - CU mask for streams. Can specify ranges e.g '5,10-12,14'\n");
printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n");
printf(" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d\n", TransferBench::GetIntAttribute(ATR_GFX_MAX_UNROLL));
printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays\n");
printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n");
printf(" HIDE_ENV - Hide environment variable value listing\n");
printf(" MIN_VAR_SUBEXEC - Minumum # of subexecutors to use for variable subExec Transfers\n");
printf(" MAX_VAR_SUBEXEC - Maximum # of subexecutors to use for variable subExec Transfers (0 for device limits)\n");
printf(" NUM_ITERATIONS - # of timed iterations per test. If negative, run for this many seconds instead\n");
printf(" NUM_SUBITERATIONS - # of sub-iterations to run per iteration. Must be non-negative\n");
printf(" NUM_WARMUPS - # of untimed warmup iterations per test\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
printf(" SAMPLING_FACTOR - Add this many samples (when possible) between powers of 2 when auto-generating data sizes\n");
printf(" SHOW_ITERATIONS - Show per-iteration timing info\n");
printf(" USE_HIP_EVENTS - Use HIP events for GFX executor timing\n");
printf(" USE_HSA_DMA - Use hsa_amd_async_copy instead of hipMemcpy for non-targeted DMA execution\n");
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer\n");
printf(" VALIDATE_DIRECT - Validate GPU destination memory directly instead of staging GPU memory on host\n");
printf(" VALIDATE_SOURCE - Validate GPU src memory immediately after preparation\n");
}
void Print(std::string const& name, int32_t const value, const char* format, ...) const
{
printf("%-20s%s%12d%s", name.c_str(), outputToCsv ? "," : " = ", value, outputToCsv ? "," : " : ");
va_list args;
va_start(args, format);
vprintf(format, args);
va_end(args);
printf("\n");
}
void Print(std::string const& name, std::string const& value, const char* format, ...) const
{
printf("%-20s%s%12s%s", name.c_str(), outputToCsv ? "," : " = ", value.c_str(), outputToCsv ? "," : " : ");
va_list args;
va_start(args, format);
vprintf(format, args);
va_end(args);
printf("\n");
}
// Display env var settings
void DisplayEnvVars() const
{
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
if (!outputToCsv) {
printf("TransferBench v%s.%s\n", TransferBench::VERSION, CLIENT_VERSION);
printf("===============================================================\n");
if (!hideEnv) printf("[Common] (Suppress by setting HIDE_ENV=1)\n");
}
else if (!hideEnv)
printf("EnvVar,Value,Description,(TransferBench Client v%s Backend v%s)\n", CLIENT_VERSION, TransferBench::VERSION);
if (hideEnv) return;
Print("ALWAYS_VALIDATE", alwaysValidate,
"Validating after %s", (alwaysValidate ? "each iteration" : "all iterations"));
Print("BLOCK_BYTES", blockBytes,
"Each CU gets a mulitple of %d bytes to copy", blockBytes);
Print("BYTE_OFFSET", byteOffset,
"Using byte offset of %d", byteOffset);
Print("CU_MASK", getenv("CU_MASK") ? 1 : 0,
"%s", (cuMask.size() ? GetCuMaskDesc().c_str() : "All"));
Print("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
"%s", (fillPattern.size() ? getenv("FILL_PATTERN") : TransferBench::GetStrAttribute(ATR_SRC_PREP_DESCRIPTION).c_str()));
Print("GFX_BLOCK_SIZE", gfxBlockSize,
"Threadblock size of %d", gfxBlockSize);
Print("GFX_SINGLE_TEAM", gfxSingleTeam,
"%s", (gfxSingleTeam ? "Combining CUs to work across entire data array" :
"Each CUs operates on its own disjoint subarray"));
Print("GFX_UNROLL", gfxUnroll,
"Using GFX unroll factor of %d", gfxUnroll);
Print("GFX_WAVE_ORDER", gfxWaveOrder,
"Using GFX wave ordering of %s", (gfxWaveOrder == 0 ? "Unroll,Wavefront,CU" :
gfxWaveOrder == 1 ? "Unroll,CU,Wavefront" :
gfxWaveOrder == 2 ? "Wavefront,Unroll,CU" :
gfxWaveOrder == 3 ? "Wavefront,CU,Unroll" :
gfxWaveOrder == 4 ? "CU,Unroll,Wavefront" :
"CU,Wavefront,Unroll"));
Print("MIN_VAR_SUBEXEC", minNumVarSubExec,
"Using at least %d subexecutor(s) for variable subExec tranfers", minNumVarSubExec);
Print("MAX_VAR_SUBEXEC", maxNumVarSubExec,
"Using up to %s subexecutors for variable subExec transfers",
maxNumVarSubExec ? std::to_string(maxNumVarSubExec).c_str() : "all available");
Print("NUM_ITERATIONS", numIterations,
(numIterations == 0) ? "Running infinitely" :
"Running %d %s", abs(numIterations), (numIterations > 0 ? " timed iteration(s)" : "seconds(s) per Test"));
Print("NUM_SUBITERATIONS", numSubIterations,
"Running %s subiterations", (numSubIterations == 0 ? "infinite" : std::to_string(numSubIterations)).c_str());
Print("NUM_WARMUPS", numWarmups,
"Running %d warmup iteration(s) per Test", numWarmups);
Print("SHOW_ITERATIONS", showIterations,
"%s per-iteration timing", showIterations ? "Showing" : "Hiding");
Print("USE_HIP_EVENTS", useHipEvents,
"Using %s for GFX/DMA Executor timing", useHipEvents ? "HIP events" : "CPU wall time");
Print("USE_HSA_DMA", useHsaDma,
"Using %s for DMA execution", useHsaDma ? "hsa_amd_async_copy" : "hipMemcpyAsync");
Print("USE_INTERACTIVE", useInteractive,
"Running in %s mode", useInteractive ? "interactive" : "non-interactive");
Print("USE_SINGLE_STREAM", useSingleStream,
"Using single stream per GFX %s", useSingleStream ? "device" : "Transfer");
if (getenv("XCC_PREF_TABLE")) {
printf("%36s: Preferred XCC Table (XCC_PREF_TABLE)\n", "");
printf("%36s: ", "");
for (int i = 0; i < numGpuDevices; i++) printf(" %3d", i); printf(" (#XCCs)\n");
for (int i = 0; i < numGpuDevices; i++) {
printf("%36s: GPU %3d ", "", i);
for (int j = 0; j < numGpuDevices; j++)
printf(" %3d", prefXccTable[i][j]);
printf(" %3d\n", TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, i}));
}
}
Print("VALIDATE_DIRECT", validateDirect,
"Validate GPU destination memory %s", validateDirect ? "directly" : "via CPU staging buffer");
Print("VALIDATE_SOURCE", validateSource,
validateSource ? "Validate source after preparation" : "Do not perform source validation after prep");
printf("\n");
};
// Helper function that parses an environment variable, falling back to a default value
static int GetEnvVar(std::string const& varname, int defaultValue)
{
if (getenv(varname.c_str()))
return atoi(getenv(varname.c_str()));
return defaultValue;
}
static std::string GetEnvVar(std::string const& varname, std::string const& defaultValue)
{
if (getenv(varname.c_str()))
return getenv(varname.c_str());
return defaultValue;
}
std::string GetCuMaskDesc() const
{
std::vector<std::pair<int, int>> runs;
int numXccs = TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, 0});
bool inRun = false;
std::pair<int, int> curr;
int used = 0;
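// Each CU occupies numXccs consecutive bits, so stride by numXccs and collapse the set CUs into contiguous ranges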
for (int targetBit = 0; targetBit < cuMask.size() * 32; targetBit += numXccs) {
if (cuMask[targetBit/32] & (1 << (targetBit%32))) {
used++;
if (!inRun) {
inRun = true;
curr.first = targetBit / numXccs;
}
} else {
if (inRun) {
inRun = false;
curr.second = targetBit / numXccs - 1;
runs.push_back(curr);
}
}
}
if (inRun) {
curr.second = (cuMask.size() * 32) / numXccs - 1;
runs.push_back(curr);
}
std::string result = "CUs used: (" + std::to_string(used) + ") ";
for (int i = 0; i < runs.size(); i++)
{
if (i) result += ",";
if (runs[i].first == runs[i].second) result += std::to_string(runs[i].first);
else result += std::to_string(runs[i].first) + "-" + std::to_string(runs[i].second);
}
return result;
}
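// Convert the collected environment variable settings into backend ConfigOptions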
TransferBench::ConfigOptions ToConfigOptions()
{
TransferBench::ConfigOptions cfg;
cfg.general.numIterations = numIterations;
cfg.general.numSubIterations = numSubIterations;
cfg.general.numWarmups = numWarmups;
cfg.general.recordPerIteration = showIterations;
cfg.general.useInteractive = useInteractive;
cfg.data.alwaysValidate = alwaysValidate;
cfg.data.blockBytes = blockBytes;
cfg.data.byteOffset = byteOffset;
cfg.data.validateDirect = validateDirect;
cfg.data.validateSource = validateSource;
cfg.data.fillPattern = fillPattern;
cfg.dma.useHipEvents = useHipEvents;
cfg.dma.useHsaCopy = useHsaDma;
cfg.gfx.blockSize = gfxBlockSize;
cfg.gfx.cuMask = cuMask;
cfg.gfx.prefXccTable = prefXccTable;
cfg.gfx.unrollFactor = gfxUnroll;
cfg.gfx.useHipEvents = useHipEvents;
cfg.gfx.useMultiStream = !useSingleStream;
cfg.gfx.useSingleTeam = gfxSingleTeam;
cfg.gfx.waveOrder = gfxWaveOrder;
return cfg;
}
};
#endif
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "EnvVars.hpp"
void AllToAllPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
enum
{
A2A_COPY = 0,
A2A_READ_ONLY = 1,
A2A_WRITE_ONLY = 2
};
char a2aModeStr[3][20] = {"Copy", "Read-Only", "Write-Only"};
// Force single-stream mode for all-to-all benchmark
ev.useSingleStream = 1;
// Force to gfx unroll 2 unless explicitly set
ev.gfxUnroll = EnvVars::GetEnvVar("GFX_UNROLL", 2);
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
// Collect env vars for this preset
int a2aDirect = EnvVars::GetEnvVar("A2A_DIRECT" , 1);
int a2aLocal = EnvVars::GetEnvVar("A2A_LOCAL" , 0);
int a2aMode = EnvVars::GetEnvVar("A2A_MODE" , 0);
int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int numSubExecs = EnvVars::GetEnvVar("NUM_SUB_EXEC" , 8);
int useDmaExec = EnvVars::GetEnvVar("USE_DMA_EXEC" , 0);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);
int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
// Print off environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
if (!ev.outputToCsv) printf("[AllToAll Related]\n");
ev.Print("A2A_DIRECT" , a2aDirect , a2aDirect ? "Only using direct links" : "Full all-to-all");
ev.Print("A2A_LOCAL" , a2aLocal , "%s local transfers", a2aLocal ? "Include" : "Exclude");
ev.Print("A2A_MODE" , a2aMode , a2aModeStr[a2aMode]);
ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus);
ev.Print("NUM_SUB_EXEC" , numSubExecs , "Using %d subexecutors/CUs per Transfer", numSubExecs);
ev.Print("USE_DMA_EXEC" , useDmaExec , "Using %s executor", useDmaExec ? "DMA" : "GFX");
ev.Print("USE_FINE_GRAIN" , useFineGrain , "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC");
printf("\n");
}
// Validate env vars
if (a2aMode < 0 || a2aMode > 2) {
printf("[ERROR] a2aMode must be between 0 and 2\n");
exit(1);
}
if (numGpus < 0 || numGpus > numDetectedGpus) {
printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
exit(1);
}
// Read-only mode omits destinations; write-only mode omits sources
int const numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1);
int const numDsts = (a2aMode == A2A_READ_ONLY ? 0 : 1);
MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
ExeType exeType = useDmaExec ? EXE_GPU_DMA : EXE_GPU_GFX;
std::map<std::pair<int, int>, int> reIndex;
std::vector<Transfer> transfers;
for (int i = 0; i < numGpus; i++) {
for (int j = 0; j < numGpus; j++) {
// Check whether or not to execute this pair
if (i == j) {
if (!a2aLocal) continue;
} else if (a2aDirect) {
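// Only keep GPU pairs connected by a single hop (direct link)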
#if !defined(__NVCC__)
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
if (hopCount != 1) continue;
#endif
}
// Build Transfer and add it to list
TransferBench::Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
if (numSrcs) transfer.srcs.push_back({memType, i});
if (numDsts) transfer.dsts.push_back({memType, j});
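// With USE_REMOTE_READ the destination GPU (j) executes the Transfer; otherwise the source GPU (i) does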
transfer.exeDevice = {exeType, (useRemoteRead ? j : i)};
transfer.exeSubIndex = -1;
transfer.numSubExecs = numSubExecs;
reIndex[std::make_pair(i,j)] = transfers.size();
transfers.push_back(transfer);
}
}
printf("GPU-GFX All-To-All benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes between %s pairs of GPUs using %d CUs (%lu Transfers)\n",
numBytesPerTransfer, a2aDirect ? "directly connected" : "all", numSubExecs, transfers.size());
if (transfers.size() == 0) return;
// Execute Transfers
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
TransferBench::TestResults results;
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
for (auto const& err : results.errResults)
printf("%s\n", err.errMsg.c_str());
exit(1);
} else {
PrintResults(ev, 1, transfers, results);
}
// Print results
char separator = (ev.outputToCsv ? ',' : ' ');
printf("\nSummary: [%lu bytes per Transfer]\n", numBytesPerTransfer);
printf("==========================================================\n");
printf("SRC\\DST ");
for (int dst = 0; dst < numGpus; dst++)
printf("%cGPU %02d ", separator, dst);
printf(" %cSTotal %cActual\n", separator, separator);
double totalBandwidthGpu = 0.0;
double minExecutorBandwidth = std::numeric_limits<double>::max();
double maxExecutorBandwidth = 0.0;
std::vector<double> colTotalBandwidth(numGpus+1, 0.0);
for (int src = 0; src < numGpus; src++) {
double rowTotalBandwidth = 0;
double executorBandwidth = 0;
printf("GPU %02d", src);
for (int dst = 0; dst < numGpus; dst++) {
if (reIndex.count(std::make_pair(src, dst))) {
int const transferIdx = reIndex[std::make_pair(src,dst)];
TransferBench::TransferResult const& r = results.tfrResults[transferIdx];
colTotalBandwidth[dst] += r.avgBandwidthGbPerSec;
rowTotalBandwidth += r.avgBandwidthGbPerSec;
totalBandwidthGpu += r.avgBandwidthGbPerSec;
executorBandwidth = std::max(executorBandwidth,
results.exeResults[transfers[transferIdx].exeDevice].avgBandwidthGbPerSec);
printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
} else {
printf("%c%8s ", separator, "N/A");
}
}
printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator, executorBandwidth);
minExecutorBandwidth = std::min(minExecutorBandwidth, executorBandwidth);
maxExecutorBandwidth = std::max(maxExecutorBandwidth, executorBandwidth);
colTotalBandwidth[numGpus] += rowTotalBandwidth;
}
printf("\nRTotal");
for (int dst = 0; dst < numGpus; dst++) {
printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
}
printf(" %c%8.3f %c%8.3f %c%8.3f\n", separator, colTotalBandwidth[numGpus],
separator, minExecutorBandwidth, separator, maxExecutorBandwidth);
printf("\n");
printf("Average bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu / transfers.size());
printf("Aggregate bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu);
printf("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
PrintErrors(results.errResults);
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
void HealthCheckPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
// Check for supported platforms
#if defined(__NVCC__)
printf("[WARN] healthcheck preset not supported on NVIDIA hardware\n");
return;
#endif
bool hasFail = false;
// Force use of single stream
ev.useSingleStream = 1;
TransferBench::TestResults results;
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
if (numGpuDevices != 8) {
printf("[WARN] healthcheck preset is currently only supported on 8-GPU MI300X hardware\n");
exit(1);
}
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, gpuId));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
if (!(archName == "gfx940" || archName == "gfx941" || archName == "gfx942"))
{
printf("[WARN] healthcheck preset is currently only supported on 8-GPU MI300X hardware\n");
exit(1);
}
}
// Pass/fail bandwidth limits: default to 95% of expected values, overridable via LIMIT_UDIR / LIMIT_BDIR / LIMIT_A2A
double udirLimit = getenv("LIMIT_UDIR") ? atof(getenv("LIMIT_UDIR")) : (int)(48 * 0.95);
double bdirLimit = getenv("LIMIT_BDIR") ? atof(getenv("LIMIT_BDIR")) : (int)(96 * 0.95);
double a2aLimit = getenv("LIMIT_A2A") ? atof(getenv("LIMIT_A2A")) : (int)(45 * 0.95);
// Run unidirectional read test: each GPU reads from its closest CPU NUMA node
printf("Testing unidirectional reads from CPU ");
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
printf("[ERROR] Unable to detect closest CPU NUMA node to GPU %d\n", gpuId);
exit(1);
}
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeDevice = {EXE_GPU_GFX, gpuId};
t.numBytes = 64*1024*1024;
t.srcs = {{MEM_CPU, memIndex}};
t.dsts = {};
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
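// Try 7 to 10 CUs; pass as soon as a run meets the bandwidth limit, otherwise keep the best result for reporting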
for (int cu = 7; cu <= 10; cu++) {
t.numSubExecs = cu;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
bestResult = std::max(bestResult, results.tfrResults[0].avgBandwidthGbPerSec);
} else {
PrintErrors(results.errResults);
}
if (results.tfrResults[0].avgBandwidthGbPerSec >= udirLimit) {
passed = true;
break;
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
}
}
}
// Run unidirectional write from GPU to CPU
printf("Testing unidirectional writes to CPU ");
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
printf("[ERROR] Unable to detect closest CPU NUMA node to GPU %d\n", gpuId);
exit(1);
}
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeDevice = {EXE_GPU_GFX, gpuId};
t.numBytes = 64*1024*1024;
t.srcs = {};
t.dsts = {{MEM_CPU, memIndex}};
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t.numSubExecs = cu;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
bestResult = std::max(bestResult, results.tfrResults[0].avgBandwidthGbPerSec);
} else {
PrintErrors(results.errResults);
}
if (results.tfrResults[0].avgBandwidthGbPerSec >= udirLimit) {
passed = true;
break;
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
}
}
}
// Run bidirectional tests
printf("Testing bidirectional reads + writes ");
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
printf("[ERROR] Unable to detect closest CPU NUMA node to GPU %d\n", gpuId);
exit(1);
}
std::vector<Transfer> transfers(2);
Transfer& t0 = transfers[0];
Transfer& t1 = transfers[1];
t0.exeDevice = {EXE_GPU_GFX, gpuId};
t0.numBytes = 64*1024*1024;
t0.srcs = {{MEM_CPU, memIndex}};
t0.dsts = {};
t1.exeDevice = {EXE_GPU_GFX, gpuId};
t1.numBytes = 64*1024*1024;
t1.srcs = {};
t1.dsts = {{MEM_CPU, memIndex}};
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t0.numSubExecs = cu;
t1.numSubExecs = cu;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
double sum = (results.tfrResults[0].avgBandwidthGbPerSec +
results.tfrResults[1].avgBandwidthGbPerSec);
bestResult = std::max(bestResult, sum);
if (sum >= bdirLimit) {
passed = true;
break;
}
} else {
PrintErrors(results.errResults);
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, bdirLimit);
}
}
}
// Run XGMI tests:
printf("Testing all-to-all XGMI copies "); fflush(stdout);
{
// Force GFX unroll to 2 for MI300
ev.gfxUnroll = 2;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<Transfer> transfers;
for (int i = 0; i < numGpuDevices; i++) {
for (int j = 0; j < numGpuDevices; j++) {
if (i == j) continue;
Transfer t;
t.numBytes = 64*1024*1024;
t.numSubExecs = 8;
t.exeDevice = {EXE_GPU_GFX, i};
t.srcs = {{MEM_GPU_FINE, i}};
t.dsts = {{MEM_GPU_FINE, j}};
transfers.push_back(t);
}
}
std::vector<std::pair<std::pair<int,int>, double>> fails;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
int transferIdx = 0;
for (int i = 0; i < numGpuDevices; i++) {
printf("."); fflush(stdout);
for (int j = 0; j < numGpuDevices; j++) {
if (i == j) continue;
double bw = results.tfrResults[transferIdx].avgBandwidthGbPerSec;
if (bw < a2aLimit) {
fails.push_back(std::make_pair(std::make_pair(i,j), bw));
}
transferIdx++;
}
}
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d to GPU %02d: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first.first, p.first.second, p.second, a2aLimit);
}
}
}
exit(hasFail ? 1 : 0);
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
void OneToAllPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
if (numDetectedGpus < 2) {
printf("[ERROR] One-to-all benchmark requires machine with at least 2 GPUs\n");
exit(1);
}
// Collect env vars for this preset
int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int numSubExecs = EnvVars::GetEnvVar("NUM_GPU_SE", 4);
int exeIndex = EnvVars::GetEnvVar("EXE_INDEX", 0);
int sweepDir = EnvVars::GetEnvVar("SWEEP_DIR", 0);
std::string sweepDst = EnvVars::GetEnvVar("SWEEP_DST", "G");
std::string sweepExe = EnvVars::GetEnvVar("SWEEP_EXE", "G");
std::string sweepSrc = EnvVars::GetEnvVar("SWEEP_SRC", "G");
int sweepMin = EnvVars::GetEnvVar("SWEEP_MIN", 1);
int sweepMax = EnvVars::GetEnvVar("SWEEP_MAX", numGpuDevices);
// Display environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
if (!ev.outputToCsv) printf("[One-To-All Related]\n");
ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices);
ev.Print("NUM_GPU_SE", numSubExecs, "Using %d subExecutors/CUs per Transfer", numSubExecs);
ev.Print("EXE_INDEX", exeIndex, "Executing on GPU %d", exeIndex);
ev.Print("SWEEP_DIR", sweepDir, "Direction of transfer");
ev.Print("SWEEP_DST", sweepDst.c_str(), "DST memory types to sweep");
ev.Print("SWEEP_EXE", sweepExe.c_str(), "Executor type to use");
ev.Print("SWEEP_MAX", sweepMax, "Maximum number of peers");
ev.Print("SWEEP_MIN", sweepMin, "Minimum number of peers");
ev.Print("SWEEP_SRC", sweepSrc.c_str(), "SRC memory types to sweep");
printf("\n");
}
// Perform validation
for (auto ch : sweepExe) {
if (ch != 'G' && ch != 'D') {
printf("[ERROR] Unrecognized executor type '%c' specified\n", ch);
exit(1);
}
}
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
TransferBench::TestResults results;
char const sep = (ev.outputToCsv ? ',' : ' ');
for (char src : sweepSrc) for (char exe : sweepExe) for (char dst : sweepDst) {
// Skip invalid configurations: DMA executors need both a SRC and a DST, and NULL -> NULL is meaningless
if ((exe == 'D' && (src == 'N' || dst == 'N')) || (src == 'N' && dst == 'N')) continue;
printf("Executing (%c%s -> %c%d -> %c%s)\n",
src, src == 'N' ? "" : (sweepDir == 0 ? std::to_string(exeIndex).c_str() : std::string("*").c_str()),
exe, exeIndex,
dst, dst == 'N' ? "" : sweepDir == 0 ? std::string("*").c_str() : std::to_string(exeIndex).c_str());
for (int i = 0; i < numGpuDevices; i++) {
if (i == exeIndex) continue;
printf(" GPU %-3d %c", i, sep);
}
printf("\n");
if (!ev.outputToCsv) {
for (int i = 0; i < numGpuDevices-1; i++)
printf("-------------");
printf("\n");
}
for (int p = sweepMin; p <= sweepMax; p++) {
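// Enumerate every subset of peer GPUs with exactly p members (bitmask over GPU indices, executor GPU excluded)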
for (int bitmask = 0; bitmask < (1<<numGpuDevices); bitmask++) {
if (bitmask & (1<<exeIndex) || __builtin_popcount(bitmask) != p) continue;
std::vector<Transfer> transfers;
for (int i = 0; i < numGpuDevices; i++) {
if (bitmask & (1<<i)) {
Transfer t;
CheckForError(TransferBench::CharToExeType(exe, t.exeDevice.exeType));
t.exeDevice.exeIndex = exeIndex;
t.exeSubIndex = -1;
t.numSubExecs = numSubExecs;
t.numBytes = numBytesPerTransfer;
if (src == 'N') {
t.srcs.clear();
} else {
t.srcs.resize(1);
CheckForError(TransferBench::CharToMemType(src, t.srcs[0].memType));
t.srcs[0].memIndex = sweepDir == 0 ? exeIndex : i;
}
if (dst == 'N') {
t.dsts.clear();
} else {
t.dsts.resize(1);
CheckForError(TransferBench::CharToMemType(dst, t.dsts[0].memType));
t.dsts[0].memIndex = sweepDir == 0 ? i : exeIndex;
}
transfers.push_back(t);
}
}
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
}
int counter = 0;
for (int i = 0; i < numGpuDevices; i++) {
if (bitmask & (1<<i))
printf(" %8.3f %c", results.tfrResults[counter++].avgBandwidthGbPerSec, sep);
else if (i != exeIndex)
printf(" %c", sep);
}
printf(" %d %d", p, numSubExecs);
for (auto i = 0; i < transfers.size(); i++) {
printf(" (%s %c%d %s)",
MemDevicesToStr(transfers[i].srcs).c_str(),
ExeTypeStr[transfers[i].exeDevice.exeType], transfers[i].exeDevice.exeIndex,
MemDevicesToStr(transfers[i].dsts).c_str());
}
printf("\n");
}
}
}
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
void PeerToPeerPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
// Collect env vars for this preset
int useDmaCopy = EnvVars::GetEnvVar("USE_GPU_DMA", 0);
int numCpuDevices = EnvVars::GetEnvVar("NUM_CPU_DEVICES", numDetectedCpus);
int numCpuSubExecs = EnvVars::GetEnvVar("NUM_CPU_SE", 4);
int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int numGpuSubExecs = EnvVars::GetEnvVar("NUM_GPU_SE", useDmaCopy ? 1 : TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0}));
int p2pMode = EnvVars::GetEnvVar("P2P_MODE", 0);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN", 0);
int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
// Display environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
int outputToCsv = ev.outputToCsv;
if (!outputToCsv) printf("[P2P Related]\n");
ev.Print("NUM_CPU_DEVICES", numCpuDevices, "Using %d CPUs", numCpuDevices);
ev.Print("NUM_CPU_SE", numCpuSubExecs, "Using %d CPU threads per Transfer", numCpuSubExecs);
ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices);
ev.Print("NUM_GPU_SE", numGpuSubExecs, "Using %d GPU subexecutors/CUs per Transfer", numGpuSubExecs);
ev.Print("P2P_MODE", p2pMode, "Running %s transfers", p2pMode == 0 ? "Uni + Bi" :
p2pMode == 1 ? "Unidirectional"
: "Bidirectional");
ev.Print("USE_FINE_GRAIN", useFineGrain, "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
ev.Print("USE_GPU_DMA", useDmaCopy, "Using GPU-%s as GPU executor", useDmaCopy ? "DMA" : "GFX");
ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC");
printf("\n");
}
char const separator = ev.outputToCsv ? ',' : ' ';
printf("Bytes Per Direction%c%lu\n", separator, numBytesPerTransfer);
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
TransferBench::TestResults results;
// Collect the number of available CPUs/GPUs on this machine
int const numDevices = numCpuDevices + numGpuDevices;
// Perform unidirectional / bidirectional
for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++) {
if ((p2pMode == 1 && isBidirectional == 1) ||
(p2pMode == 2 && isBidirectional == 0)) continue;
printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n", isBidirectional ? "Bi" : "Uni",
useRemoteRead ? "Remote" : "Local",
useRemoteRead ? "Local" : "Remote",
useDmaCopy ? "DMA" : "GFX");
// Print header
if (isBidirectional) {
printf("%12s", "SRC\\DST");
} else {
if (useRemoteRead)
printf("%12s", "SRC\\EXE+DST");
else
printf("%12s", "SRC+EXE\\DST");
}
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numCpuDevices; i++) {
printf("%7s %02d", "CPU", i);
if (ev.outputToCsv) printf(",");
}
if (numCpuDevices > 0) printf(" ");
for (int i = 0; i < numGpuDevices; i++) {
printf("%7s %02d", "GPU", i);
if (ev.outputToCsv) printf(",");
}
printf("\n");
double avgBwSum[2][2] = {};
int avgCount[2][2] = {};
ExeType const gpuExeType = useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
// Loop over all possible src/dst pairs
for (int src = 0; src < numDevices; src++) {
MemType const srcType = (src < numCpuDevices ? MEM_CPU : MEM_GPU);
int const srcIndex = (srcType == MEM_CPU ? src : src - numCpuDevices);
MemType const srcTypeActual = ((useFineGrain && srcType == MEM_CPU) ? MEM_CPU_FINE :
(useFineGrain && srcType == MEM_GPU) ? MEM_GPU_FINE :
srcType);
std::vector<std::vector<double>> avgBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> minBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> maxBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> stdDev(isBidirectional + 1);
if (src == numCpuDevices && src != 0) printf("\n");
for (int dst = 0; dst < numDevices; dst++) {
MemType const dstType = (dst < numCpuDevices ? MEM_CPU : MEM_GPU);
int const dstIndex = (dstType == MEM_CPU ? dst : dst - numCpuDevices);
MemType const dstTypeActual = ((useFineGrain && dstType == MEM_CPU) ? MEM_CPU_FINE :
(useFineGrain && dstType == MEM_GPU) ? MEM_GPU_FINE :
dstType);
// Prepare Transfers
std::vector<Transfer> transfers(isBidirectional + 1);
// SRC -> DST
transfers[0].numBytes = numBytesPerTransfer;
transfers[0].srcs.push_back({srcTypeActual, srcIndex});
transfers[0].dsts.push_back({dstTypeActual, dstIndex});
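// The executor sits on the DST side when USE_REMOTE_READ is set, otherwise on the SRC side; GPU memory uses a GPU executor (GFX or DMA), CPU memory uses a CPU executor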
transfers[0].exeDevice = {IsGpuMemType(useRemoteRead ? dstType : srcType) ? gpuExeType : EXE_CPU,
(useRemoteRead ? dstIndex : srcIndex)};
transfers[0].exeSubIndex = -1;
transfers[0].numSubExecs = (transfers[0].exeDevice.exeType == gpuExeType) ? numGpuSubExecs : numCpuSubExecs;
// DST -> SRC
if (isBidirectional) {
transfers[1].numBytes = numBytesPerTransfer;
transfers[1].srcs.push_back({dstTypeActual, dstIndex});
transfers[1].dsts.push_back({srcTypeActual, srcIndex});
transfers[1].exeDevice = {IsGpuMemType(useRemoteRead ? srcType : dstType) ? gpuExeType : EXE_CPU,
(useRemoteRead ? srcIndex : dstIndex)};
transfers[1].exeSubIndex = -1;
transfers[1].numSubExecs = (transfers[1].exeDevice.exeType == gpuExeType) ? numGpuSubExecs : numCpuSubExecs;
}
bool skipTest = false;
// Abort if executing on NUMA node with no CPUs
for (int i = 0; i <= isBidirectional; i++) {
if (transfers[i].exeDevice.exeType == EXE_CPU &&
TransferBench::GetNumSubExecutors(transfers[i].exeDevice) == 0) {
skipTest = true;
break;
}
#if defined(__NVCC__)
// NVIDIA platform cannot access GPU memory directly from CPU executors
if (transfers[i].exeDevice.exeType == EXE_CPU && (IsGpuMemType(srcType) || IsGpuMemType(dstType))) {
skipTest = true;
break;
}
#endif
}
if (isBidirectional && srcType == dstType && srcIndex == dstIndex) skipTest = true;
if (!skipTest) {
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
for (auto const& err : results.errResults)
printf("%s\n", err.errMsg.c_str());
exit(1);
}
for (int dir = 0; dir <= isBidirectional; dir++) {
double const avgBw = results.tfrResults[dir].avgBandwidthGbPerSec;
avgBandwidth[dir].push_back(avgBw);
if (!(srcType == dstType && srcIndex == dstIndex)) {
avgBwSum[srcType][dstType] += avgBw;
avgCount[srcType][dstType]++;
}
if (ev.showIterations) {
double minTime = results.tfrResults[dir].perIterMsec[0];
double maxTime = minTime;
double varSum = 0;
for (int i = 0; i < results.tfrResults[dir].perIterMsec.size(); i++) {
minTime = std::min(minTime, results.tfrResults[dir].perIterMsec[i]);
maxTime = std::max(maxTime, results.tfrResults[dir].perIterMsec[i]);
double const bw = (transfers[dir].numBytes / 1.0E9) / results.tfrResults[dir].perIterMsec[i] * 1000.0f;
double const delta = (avgBw - bw);
varSum += delta * delta;
}
double const minBw = (transfers[dir].numBytes / 1.0E9) / maxTime * 1000.0f;
double const maxBw = (transfers[dir].numBytes / 1.0E9) / minTime * 1000.0f;
double const stdev = sqrt(varSum / results.tfrResults[dir].perIterMsec.size());
minBandwidth[dir].push_back(minBw);
maxBandwidth[dir].push_back(maxBw);
stdDev[dir].push_back(stdev);
}
}
} else {
for (int dir = 0; dir <= isBidirectional; dir++) {
avgBandwidth[dir].push_back(0);
minBandwidth[dir].push_back(0);
maxBandwidth[dir].push_back(0);
stdDev[dir].push_back(-1.0);
}
}
}
for (int dir = 0; dir <= isBidirectional; dir++) {
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, dir ? "<- " : " ->");
if (ev.outputToCsv) printf(",");
for (int dst = 0; dst < numDevices; dst++) {
if (dst == numCpuDevices && dst != 0) printf(" ");
double const avgBw = avgBandwidth[dir][dst];
if (avgBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", avgBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
if (ev.showIterations) {
// minBw
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "min");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++) {
double const minBw = minBandwidth[dir][i];
if (i == numCpuDevices && i != 0) printf(" ");
if (minBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", minBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
// maxBw
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "max");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++) {
double const maxBw = maxBandwidth[dir][i];
if (i == numCpuDevices && i != 0) printf(" ");
if (maxBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", maxBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
// stddev
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, " sd");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++) {
double const sd = stdDev[dir][i];
if (i == numCpuDevices && i != 0) printf(" ");
if (sd == -1.0)
printf("%10s", "N/A");
else
printf("%10.2f", sd);
if (ev.outputToCsv) printf(",");
}
printf("\n");
}
fflush(stdout);
}
if (isBidirectional) {
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "<->");
if (ev.outputToCsv) printf(",");
for (int dst = 0; dst < numDevices; dst++) {
double const sumBw = avgBandwidth[0][dst] + avgBandwidth[1][dst];
if (dst == numCpuDevices && dst != 0) printf(" ");
if (sumBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", sumBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
if (src < numDevices - 1) printf("\n");
}
}
if (!ev.outputToCsv) {
printf(" ");
for (int srcType : {MEM_CPU, MEM_GPU})
for (int dstType : {MEM_CPU, MEM_GPU})
printf(" %cPU->%cPU", srcType == MEM_CPU ? 'C' : 'G', dstType == MEM_CPU ? 'C' : 'G');
printf("\n");
printf("Averages (During %s):", isBidirectional ? " BiDir" : "UniDir");
for (int srcType : {MEM_CPU, MEM_GPU})
for (int dstType : {MEM_CPU, MEM_GPU}) {
if (avgCount[srcType][dstType])
printf("%10.2f", avgBwSum[srcType][dstType] / avgCount[srcType][dstType]);
else
printf("%10s", "N/A");
}
printf("\n\n");
}
}
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
// Included after EnvVars and Executors
#include "AllToAll.hpp"
#include "HealthCheck.hpp"
#include "OneToAll.hpp"
#include "PeerToPeer.hpp"
#include "Scaling.hpp"
#include "Schmoo.hpp"
#include "Sweep.hpp"
#include <map>
typedef void (*PresetFunc)(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName);
std::map<std::string, std::pair<PresetFunc, std::string>> presetFuncMap =
{
{"a2a", {AllToAllPreset, "Tests parallel transfers between all pairs of GPU devices"}},
{"healthcheck", {HealthCheckPreset,"Simple bandwidth health check (MI300X series only)"}},
{"one2all", {OneToAllPreset, "Test all subsets of parallel transfers from one GPU to all others"}},
{"p2p" , {PeerToPeerPreset, "Peer-to-peer device memory bandwidth test"}},
{"rsweep", {SweepPreset, "Randomly sweep through sets of Transfers"}},
{"scaling", {ScalingPreset, "Run scaling test from one GPU to other devices"}},
{"schmoo", {SchmooPreset, "Scaling tests for local/remote read/write/copy"}},
{"sweep", {SweepPreset, "Ordered sweep through sets of Transfers"}},
};
void DisplayPresets()
{
printf("\nAvailable Preset Benchmarks:\n");
printf("============================\n");
for (auto const& x : presetFuncMap)
printf(" %15s - %s\n", x.first.c_str(), x.second.second.c_str());
}
int RunPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
int const argc,
char** const argv)
{
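// Returns 1 if the argument names a known preset (which is then executed), 0 otherwise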
std::string preset = (argc > 1 ? argv[1] : "");
if (presetFuncMap.count(preset)) {
(presetFuncMap[preset].first)(ev, numBytesPerTransfer, preset);
return 1;
}
return 0;
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
void ScalingPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
// Collect env vars for this preset
int localIdx = EnvVars::GetEnvVar("LOCAL_IDX", 0);
int numCpuDevices = EnvVars::GetEnvVar("NUM_CPU_DEVICES", numDetectedCpus);
int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int sweepMax = EnvVars::GetEnvVar("SWEEP_MAX", 32);
int sweepMin = EnvVars::GetEnvVar("SWEEP_MIN", 1);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN", 0);
// Display environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
int outputToCsv = ev.outputToCsv;
if (!outputToCsv) printf("[Schmoo Related]\n");
ev.Print("LOCAL_IDX", localIdx, "Local GPU index");
ev.Print("SWEEP_MAX", sweepMax, "Max number of subExecutors to use");
ev.Print("SWEEP_MIN", sweepMin, "Min number of subExecutors to use");
printf("\n");
}
// Validate env vars
if (localIdx >= numDetectedGpus) {
printf("[ERROR] Cannot execute scaling test with local GPU device %d\n", localIdx);
exit(1);
}
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
TransferBench::TestResults results;
char separator = (ev.outputToCsv ? ',' : ' ');
int numDevices = numCpuDevices + numGpuDevices;
printf("GPU-GFX Scaling benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes from GPU %d to other devices\n", numBytesPerTransfer, localIdx);
printf("- All numbers reported as GB/sec\n\n");
printf("NumCUs");
for (int i = 0; i < numDevices; i++)
printf("%c %s%02d ", separator, i < numCpuDevices ? "CPU" : "GPU", i < numCpuDevices ? i : i - numCpuDevices);
printf("\n");
std::vector<std::pair<double, int>> bestResult(numDevices);
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeDevice = {EXE_GPU_GFX, localIdx};
t.exeSubIndex = -1;
t.numBytes = numBytesPerTransfer;
  // Honor USE_FINE_GRAIN for GPU buffers (the env var is read above)
  MemType gpuMemType = (useFineGrain ? MEM_GPU_FINE : MEM_GPU);
  t.srcs      = {{gpuMemType, localIdx}};
for (int numSubExec = sweepMin; numSubExec <= sweepMax; numSubExec++) {
t.numSubExecs = numSubExec;
printf("%4d ", numSubExec);
for (int i = 0; i < numDevices; i++) {
      t.dsts = {{i < numCpuDevices ? MEM_CPU : gpuMemType,
                 i < numCpuDevices ? i : i - numCpuDevices}};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
}
double bw = results.tfrResults[0].avgBandwidthGbPerSec;
printf("%c%7.2f ", separator, bw);
if (bw > bestResult[i].first) {
bestResult[i].first = bw;
bestResult[i].second = numSubExec;
}
}
printf("\n");
}
printf(" Best ");
for (int i = 0; i < numDevices; i++)
printf("%c%7.2f(%3d)", separator, bestResult[i].first, bestResult[i].second);
printf("\n");
}
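// Example invocation of the scaling preset (a sketch; the client binary name and
// byte-count argument are assumptions). All knobs read above come from the
// environment: LOCAL_IDX, NUM_CPU_DEVICES, NUM_GPU_DEVICES, SWEEP_MIN, SWEEP_MAX
// and USE_FINE_GRAIN.
//
//   LOCAL_IDX=1 SWEEP_MIN=1 SWEEP_MAX=32 ./TransferBench scaling 67108864
//
// Each row reports, for the given CU count, the GB/sec achieved when copying from
// GPU LOCAL_IDX to every detected CPU and GPU device; the final "Best" row shows
// the peak bandwidth per device and the CU count that achieved it.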
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
void SchmooPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
if (numDetectedGpus < 2) {
printf("[ERROR] Schmoo benchmark requires at least 2 GPUs\n");
exit(1);
}
// Collect env vars for this preset
int localIdx = EnvVars::GetEnvVar("LOCAL_IDX", 0);
int remoteIdx = EnvVars::GetEnvVar("REMOTE_IDX", 1);
int sweepMax = EnvVars::GetEnvVar("SWEEP_MAX", 32);
int sweepMin = EnvVars::GetEnvVar("SWEEP_MIN", 1);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN", 0);
// Display environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
int outputToCsv = ev.outputToCsv;
if (!outputToCsv) printf("[Schmoo Related]\n");
ev.Print("LOCAL_IDX", localIdx, "Local GPU index");
ev.Print("REMOTE_IDX", remoteIdx, "Remote GPU index");
ev.Print("SWEEP_MAX", sweepMax, "Max number of subExecutors to use");
ev.Print("SWEEP_MIN", sweepMin, "Min number of subExecutors to use");
ev.Print("USE_FINE_GRAIN", useFineGrain, "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
printf("\n");
}
// Validate env vars
if (localIdx >= numDetectedGpus || remoteIdx >= numDetectedGpus) {
printf("[ERROR] Cannot execute schmoo test with local GPU device %d, remote GPU device %d\n", localIdx, remoteIdx);
exit(1);
}
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
TransferBench::TestResults results;
char memChar = useFineGrain ? 'F' : 'G';
printf("Bytes to transfer: %lu Local GPU: %d Remote GPU: %d\n", numBytesPerTransfer, localIdx, remoteIdx);
printf(" | Local Read | Local Write | Local Copy | Remote Read | Remote Write| Remote Copy |\n");
printf(" #CUs |%c%02d->G%02d->N00|N00->G%02d->%c%02d|%c%02d->G%02d->%c%02d|%c%02d->G%02d->N00|N00->G%02d->%c%02d|%c%02d->G%02d->%c%02d|\n",
memChar, localIdx, localIdx,
localIdx, memChar, localIdx,
memChar, localIdx, localIdx, memChar, localIdx,
memChar, remoteIdx, localIdx,
localIdx, memChar, remoteIdx,
memChar, localIdx, localIdx, memChar, remoteIdx);
printf("|------|-------------|-------------|-------------|-------------|-------------|-------------|\n");
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeDevice = {EXE_GPU_GFX, localIdx};
t.exeSubIndex = -1;
t.numBytes = numBytesPerTransfer;
MemType memType = (useFineGrain ? MEM_GPU_FINE : MEM_GPU);
for (int numCUs = sweepMin; numCUs <= sweepMax; numCUs++) {
t.numSubExecs = numCUs;
// Local Read
t.srcs = {{memType, localIdx}};
t.dsts = {};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
}
double const localRead = results.tfrResults[0].avgBandwidthGbPerSec;
// Local Write
t.srcs = {};
t.dsts = {{memType, localIdx}};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
}
double const localWrite = results.tfrResults[0].avgBandwidthGbPerSec;
    // Local Copy (both src and dst on the local GPU)
    t.srcs = {{memType, localIdx}};
    t.dsts = {{memType, localIdx}};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
}
double const localCopy = results.tfrResults[0].avgBandwidthGbPerSec;
// Remote Read
t.srcs = {{memType, remoteIdx}};
t.dsts = {};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
}
double const remoteRead = results.tfrResults[0].avgBandwidthGbPerSec;
// Remote Write
t.srcs = {};
t.dsts = {{memType, remoteIdx}};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
}
double const remoteWrite = results.tfrResults[0].avgBandwidthGbPerSec;
// Remote Copy
t.srcs = {{memType, localIdx}};
t.dsts = {{memType, remoteIdx}};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
}
double const remoteCopy = results.tfrResults[0].avgBandwidthGbPerSec;
printf(" %3d %11.3f %11.3f %11.3f %11.3f %11.3f %11.3f \n",
numCUs, localRead, localWrite, localCopy, remoteRead, remoteWrite, remoteCopy);
}
}
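// Example invocation of the schmoo preset (a sketch; the client binary name and
// byte-count argument are assumptions). LOCAL_IDX/REMOTE_IDX select the GPU pair,
// USE_FINE_GRAIN=1 switches the GPU buffers to fine-grained memory, and
// SWEEP_MIN/SWEEP_MAX bound the CU counts that are swept:
//
//   LOCAL_IDX=0 REMOTE_IDX=1 USE_FINE_GRAIN=1 ./TransferBench schmoo 67108864
//
// For each CU count the table reports six bandwidths, all executed by GPU LOCAL_IDX:
// local read / write / copy against its own memory, and remote read / write / copy
// against GPU REMOTE_IDX's memory.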