Commit 09f4f11b authored by Oliveira, Daniel

impr: Library/Client build organization



Change code organization and build options

Code changes related to the following:
  * Build files
    * Options to build client, shared, and static libraries (see the configure example below)
  * Source code directories
  * Modern C++20 changes
  * Based on TB 1.6.4
  * Formatting
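
Example configure step exercising the new build options (a sketch; the
option names are the cache variables referenced in the CMake changes
below):

  cmake -B build -DTRANSFERBENCH_ENGINE_SHARED=ON \
        -DTRANSFERBENCH_ENGINE_STATIC=OFF \
        -DTRANSFERBENCH_ENGINE_HEADER_ONLY=OFF
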
Signed-off-by: Oliveira, Daniel <daniel.oliveira@amd.com>
parent 2d0ecaae
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include "TransferBench.hpp"
#include <vector>
static int RemappedCpuIndex(int origIdx)
{
static std::vector<int> remappingCpu;
// Build CPU remapping on first use
// Skip numa nodes that are not configured
if (remappingCpu.empty()) {
for (int node = 0; node <= numa_max_node(); node++) {
if (numa_bitmask_isbitset(numa_get_mems_allowed(), node)) {
remappingCpu.push_back(node);
}
}
}
return remappingCpu[origIdx];
}
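// Example (hypothetical topology): if only NUMA nodes {0, 2, 3} are
// configured, remappingCpu is built as [0, 2, 3], so RemappedCpuIndex(1)
// returns physical node 2.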
static void PrintNicToGPUTopo([[maybe_unused]] bool outputToCsv)
{
#ifdef NIC_EXEC_ENABLED
printf(
" NIC | Device Name | Active | PCIe Bus ID | NUMA | Closest GPU(s) | GID Index | GID "
"Descriptor\n");
if (!outputToCsv) {
printf(
"-----+-------------+--------+--------------+------+----------------+-----------+------"
"-------------\n");
}
int numGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
auto const& ibvDeviceList = GetIbvDeviceList();
for (int i = 0; i < static_cast<int>(ibvDeviceList.size()); i++) {
std::string closestGpusStr = "";
for (int j = 0; j < numGpus; j++) {
if (TransferBench::GetClosestNicToGpu(j) == i) {
if (closestGpusStr != "") { closestGpusStr += ","; }
closestGpusStr += std::to_string(j);
}
}
printf(" %-3d | %-11s | %-6s | %-12s | %-4d | %-14s | %-9s | %-20s\n",
i,
ibvDeviceList[i].name.c_str(),
ibvDeviceList[i].hasActivePort ? "Yes" : "No",
ibvDeviceList[i].busId.c_str(),
ibvDeviceList[i].numaNode,
closestGpusStr.c_str(),
ibvDeviceList[i].isRoce && ibvDeviceList[i].hasActivePort
? std::to_string(ibvDeviceList[i].gidIndex).c_str()
: "N/A",
ibvDeviceList[i].isRoce && ibvDeviceList[i].hasActivePort
? ibvDeviceList[i].gidDescriptor.c_str()
: "N/A");
}
printf("\n");
#endif
}
void DisplayTopology(bool outputToCsv)
{
int numCpus = TransferBench::GetNumExecutors(EXE_CPU);
int numGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
int numNics = TransferBench::GetNumExecutors(EXE_NIC);
char sep = (outputToCsv ? ',' : '|');
if (outputToCsv) {
printf("NumCpus,%d\n", numCpus);
printf("NumGpus,%d\n", numGpus);
printf("NumNics,%d\n", numNics);
} else {
printf("\nDetected Topology:\n");
printf("==================\n");
printf(" %d configured CPU NUMA node(s) [%d total]\n", numCpus, numa_max_node() + 1);
printf(" %d GPU device(s)\n", numGpus);
printf(" %d Supported NIC device(s)\n", numNics);
}
// Print out detected CPU topology
printf("\n %c", sep);
for (int j = 0; j < numCpus; j++) { printf("NUMA %02d%c", j, sep); }
printf(" #Cpus %c Closest GPU(s)\n", sep);
if (!outputToCsv) {
printf("------------+");
for (int j = 0; j <= numCpus; j++) { printf("-------+"); }
printf("---------------\n");
}
for (int i = 0; i < numCpus; i++) {
int nodeI = RemappedCpuIndex(i);
printf("NUMA %02d (%02d)%c", i, nodeI, sep);
for (int j = 0; j < numCpus; j++) {
int nodeJ = RemappedCpuIndex(j);
int numaDist = numa_distance(nodeI, nodeJ);
printf(" %5d %c", numaDist, sep);
}
int numCpuCores = 0;
for (int j = 0; j < numa_num_configured_cpus(); j++) {
if (numa_node_of_cpu(j) == nodeI) { numCpuCores++; }
}
printf(" %5d %c", numCpuCores, sep);
for (int j = 0; j < numGpus; j++) {
if (TransferBench::GetClosestCpuNumaToGpu(j) == nodeI) { printf(" %d", j); }
}
printf("\n");
}
printf("\n");
// Print out detected NIC topology
PrintNicToGPUTopo(outputToCsv);
// Print out detected GPU topology
#if defined(__NVCC__)
for (int i = 0; i < numGpus; i++) {
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, i));
printf(" GPU %02d | %s\n", i, prop.name);
}
// No further topology detection done for NVIDIA platforms
return;
#else
// Print headers
if (!outputToCsv) {
printf(" |");
for (int j = 0; j < numGpus; j++) {
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, j));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
printf(" %6s |", archName.c_str());
}
printf("\n");
}
printf(" %c", sep);
for (int j = 0; j < numGpus; j++) { printf(" GPU %02d %c", j, sep); }
printf(" PCIe Bus ID %c #CUs %c NUMA %c #DMA %c #XCC %c NIC\n", sep, sep, sep, sep, sep);
if (!outputToCsv) {
for (int j = 0; j <= numGpus; j++) { printf("--------+"); }
printf("--------------+------+------+------+------+------\n");
}
// Loop over each GPU device
for (int i = 0; i < numGpus; i++) {
printf(" GPU %02d %c", i, sep);
// Print off link information
for (int j = 0; j < numGpus; j++) {
if (i == j) {
printf(" N/A %c", sep);
} else {
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
printf(" %s-%d %c",
linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? " HT"
: linkType == HSA_AMD_LINK_INFO_TYPE_QPI ? " QPI"
: linkType == HSA_AMD_LINK_INFO_TYPE_PCIE ? "PCIE"
: linkType == HSA_AMD_LINK_INFO_TYPE_INFINBAND ? "INFB"
: linkType == HSA_AMD_LINK_INFO_TYPE_XGMI ? "XGMI"
: "????",
hopCount,
sep);
}
}
char pciBusId[20];
HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i));
printf(" %-11s %c %-4d %c %-4d %c %-4d %c %-4d %c %-4d\n",
pciBusId,
sep,
TransferBench::GetNumSubExecutors({EXE_GPU_GFX, i}),
sep,
TransferBench::GetClosestCpuNumaToGpu(i),
sep,
TransferBench::GetNumExecutorSubIndices({EXE_GPU_DMA, i}),
sep,
TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, i}),
sep,
TransferBench::GetClosestNicToGpu(i));
}
#endif
}
/*
Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "Client.hpp"
#include "EnvVars.hpp"
#include "Presets.hpp"
#include "Topology.hpp"
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <map>
#include <set>
#include <sstream>
#include <string>
#include <vector>
int main(int argc, char** argv)
{
// Collect environment variables
EnvVars ev;
// Display usage instructions and detected topology
if (argc <= 1) {
if (!ev.outputToCsv) {
DisplayUsage(argv[0]);
DisplayPresets();
}
DisplayTopology(ev.outputToCsv);
exit(0);
}
// Determine number of bytes to run per Transfer
size_t numBytesPerTransfer = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_TRANSFER;
if (argc > 2) {
// Adjust bytes if unit specified
char units = argv[2][strlen(argv[2]) - 1];
// Intentional fall-through: each case scales by a further factor of 1024
switch (units) {
case 'G':
case 'g': numBytesPerTransfer *= 1024; [[fallthrough]];
case 'M':
case 'm': numBytesPerTransfer *= 1024; [[fallthrough]];
case 'K':
case 'k': numBytesPerTransfer *= 1024;
}
}
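// Example: "64M" parses as 64, then is scaled twice (M, then K), giving
// 64 * 1024 * 1024 bytes; "2G" is scaled three times.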
if (numBytesPerTransfer % 4) {
printf("[ERROR] numBytesPerTransfer (%lu) must be a multiple of 4\n", numBytesPerTransfer);
exit(1);
}
// Run preset benchmark if requested
if (RunPreset(ev, numBytesPerTransfer, argc, argv)) { exit(0); }
// Read input from command line or configuration file
std::vector<std::string> lines;
{
std::string line;
if (!strcmp(argv[1], "cmdline")) {
for (int i = 3; i < argc; i++) { line += std::string(argv[i]) + " "; }
lines.push_back(line);
} else {
std::ifstream cfgFile(argv[1]);
if (!cfgFile.is_open()) {
printf("[ERROR] Unable to open transfer configuration file: [%s]\n", argv[1]);
exit(1);
}
while (std::getline(cfgFile, line)) { lines.push_back(line); }
cfgFile.close();
}
}
// Print environment variables and CSV header
ev.DisplayEnvVars();
if (ev.outputToCsv) {
printf("Test#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),SrcAddr,DstAddr\n");
}
TransferBench::ConfigOptions cfgOptions = ev.ToConfigOptions();
TransferBench::TestResults results;
std::vector<ErrResult> errors;
// Process each line as a Test
int testNum = 0;
for (std::string const& line : lines) {
// Check if line is a comment to be echoed to output (starts with ##)
if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') { printf("%s\n", line.c_str()); }
// Parse set of parallel Transfers to execute
std::vector<Transfer> transfers;
CheckForError(TransferBench::ParseTransfers(line, transfers));
if (transfers.empty()) { continue; }
// Check for variable sub-executors Transfers
auto numVariableTransfers = std::size_t(0);
int maxVarCount = 0;
{
std::map<ExeDevice, int> varTransferCount;
for (auto const& t : transfers) {
if (t.numSubExecs == 0) {
if (t.exeDevice.exeType != EXE_GPU_GFX) {
printf(
"[ERROR] Variable number of subexecutors is only supported on GFX "
"executors\n");
exit(1);
}
numVariableTransfers++;
varTransferCount[t.exeDevice]++;
maxVarCount = std::max(maxVarCount, varTransferCount[t.exeDevice]);
}
}
if (numVariableTransfers > 0 && numVariableTransfers != transfers.size()) {
printf(
"[ERROR] All or none of the Transfers in the Test must use variable number of "
"Subexecutors\n");
exit(1);
}
}
// Track which transfers have already numBytes specified
std::vector<bool> bytesSpecified(transfers.size());
bool hasUnspecified = false;
for (auto i = std::size_t(0); i < transfers.size(); i++) {
bytesSpecified[i] = (transfers[i].numBytes != 0);
if (transfers[i].numBytes == 0) { hasUnspecified = true; }
}
// Run the specified numbers of bytes otherwise generate a range of values
for (size_t bytes = (1 << 10); bytes <= (1 << 29); bytes *= 2) {
size_t deltaBytes = std::max<size_t>(1, bytes / ev.samplingFactor);
size_t currBytes = (numBytesPerTransfer == 0) ? bytes : numBytesPerTransfer;
do {
for (auto i = std::size_t(0); i < transfers.size(); i++) {
if (!bytesSpecified[i]) { transfers[i].numBytes = currBytes; }
}
if (maxVarCount == 0) {
if (TransferBench::RunTransfers(cfgOptions, transfers, results)) {
PrintResults(ev, ++testNum, transfers, results);
}
PrintErrors(results.errResults);
} else {
// Variable subexecutors - Determine how many subexecutors to sweep up to
int maxNumVarSubExec = ev.maxNumVarSubExec;
if (maxNumVarSubExec == 0) {
maxNumVarSubExec = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0}) /
maxVarCount;
}
TransferBench::TestResults bestResults;
std::vector<Transfer> bestTransfers;
for (int numSubExecs = ev.minNumVarSubExec; numSubExecs <= maxNumVarSubExec;
numSubExecs++) {
std::vector<Transfer> tempTransfers = transfers;
for (auto& t : tempTransfers) {
if (t.numSubExecs == 0) { t.numSubExecs = numSubExecs; }
}
TransferBench::TestResults tempResults;
if (!TransferBench::RunTransfers(cfgOptions, tempTransfers, tempResults)) {
PrintErrors(tempResults.errResults);
} else {
if (tempResults.avgTotalBandwidthGbPerSec >
bestResults.avgTotalBandwidthGbPerSec) {
bestResults = tempResults;
bestTransfers = tempTransfers;
}
}
}
PrintResults(ev, ++testNum, bestTransfers, bestResults);
PrintErrors(bestResults.errResults);
}
if (numBytesPerTransfer != 0 || !hasUnspecified) { break; }
currBytes += deltaBytes;
} while (currBytes < bytes * 2);
if (numBytesPerTransfer != 0 || !hasUnspecified) { break; }
}
}
}
void DisplayUsage(char const* cmdName)
{
std::string nicSupport = "";
#if NIC_EXEC_ENABLED
nicSupport = " (with NIC support)";
#endif
printf("TransferBench v%s.(%s)[%s]\n",
TransferBench::GetTransferBenchVersion().c_str(),
GetClientVersion().c_str(),
nicSupport.c_str());
printf("========================================\n");
if (numa_available() == -1) {
printf(
"[ERROR] NUMA library not supported. Check to see if libnuma has been installed on "
"this system\n");
exit(1);
}
printf("Usage: %s config <N>\n", cmdName);
printf(" config: Either:\n");
printf(
" - Filename of configFile containing Transfers to execute (see example.cfg for "
"format)\n");
printf(" - Name of preset config:\n");
printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
DEFAULT_BYTES_PER_TRANSFER);
printf(" If 0 is specified, a range of Ns will be benchmarked\n");
printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n");
printf("\n");
EnvVars::DisplayUsage();
}
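// Example invocations (a sketch; the Transfer syntax itself is documented in
// example.cfg):
//   ./TransferBench example.cfg 64M             Run Transfers from a config file
//   ./TransferBench cmdline 64M <Transfers...>  Parse Transfers from the command line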
std::string MemDevicesToStr(std::vector<MemDevice> const& memDevices)
{
if (memDevices.empty()) { return "N"; }
std::stringstream ss;
for (auto const& m : memDevices) { ss << TransferBench::MemTypeStr[m.memType] << m.memIndex; }
return ss.str();
}
void PrintResults(EnvVars const& ev,
int const testNum,
std::vector<Transfer> const& transfers,
TransferBench::TestResults const& results)
{
char sep = ev.outputToCsv ? ',' : '|';
size_t numTimedIterations = results.numTimedIterations;
if (!ev.outputToCsv) { printf("Test %d:\n", testNum); }
// Loop over each executor
for (auto exeInfoPair : results.exeResults) {
ExeDevice const& exeDevice = exeInfoPair.first;
ExeResult const& exeResult = exeInfoPair.second;
ExeType const exeType = exeDevice.exeType;
int32_t const exeIndex = exeDevice.exeIndex;
printf(
" Executor: %3s %02d %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c %-7.3f GB/s (sum)\n",
ExeTypeName[exeType],
exeIndex,
sep,
exeResult.avgBandwidthGbPerSec,
sep,
exeResult.avgDurationMsec,
sep,
exeResult.numBytes,
sep,
exeResult.sumBandwidthGbPerSec);
// Loop over each Transfer handled by this executor
for (int idx : exeResult.transferIdx) {
Transfer const& t = transfers[idx];
TransferResult const& r = results.tfrResults[idx];
char exeSubIndexStr[32] = "";
if (t.exeSubIndex != -1) { sprintf(exeSubIndexStr, ".%d", t.exeSubIndex); }
printf(
" Transfer %02d %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c %s -> "
"%c%03d%s:%03d -> %s\n",
idx,
sep,
r.avgBandwidthGbPerSec,
sep,
r.avgDurationMsec,
sep,
r.numBytes,
sep,
MemDevicesToStr(t.srcs).c_str(),
TransferBench::ExeTypeStr[t.exeDevice.exeType],
t.exeDevice.exeIndex,
exeSubIndexStr,
t.numSubExecs,
MemDevicesToStr(t.dsts).c_str());
// Show per-iteration timing information
if (ev.showIterations) {
// Check that per-iteration information exists
if (r.perIterMsec.size() != numTimedIterations) {
printf(
"[ERROR] Per iteration timing data unavailable: Expected %lu data points, "
"but have %lu\n",
numTimedIterations,
r.perIterMsec.size());
exit(1);
}
// Compute standard deviation and track iterations by speed
std::set<std::pair<double, int>> times;
double stdDevTime = 0;
double stdDevBw = 0;
for (auto i = std::size_t(0); i < numTimedIterations; i++) {
times.insert(std::make_pair(r.perIterMsec[i], i + 1));
double const varTime = fabs(r.avgDurationMsec - r.perIterMsec[i]);
stdDevTime += varTime * varTime;
double iterBandwidthGbs = (t.numBytes / 1.0E9) / r.perIterMsec[i] * 1000.0f;
double const varBw = fabs(iterBandwidthGbs - r.avgBandwidthGbPerSec);
stdDevBw += varBw * varBw;
}
stdDevTime = sqrt(stdDevTime / numTimedIterations);
stdDevBw = sqrt(stdDevBw / numTimedIterations);
// Loop over iterations (fastest to slowest)
for (auto& time : times) {
double iterDurationMsec = time.first;
double iterBandwidthGbs = (t.numBytes / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d %c %8.3f GB/s %c %8.3f ms %c",
time.second,
sep,
iterBandwidthGbs,
sep,
iterDurationMsec,
sep);
std::set<int> usedXccs;
if ((time.second - 1) < static_cast<std::int32_t>(r.perIterCUs.size())) {
printf(" CUs:");
for (auto x : r.perIterCUs[time.second - 1]) {
printf(" %02d:%02d", x.first, x.second);
usedXccs.insert(x.first);
}
}
printf(" XCCs:");
for (auto x : usedXccs) { printf(" %02d", x); }
printf("\n");
}
printf(" StandardDev %c %8.3f GB/s %c %8.3f ms %c\n",
sep,
stdDevBw,
sep,
stdDevTime,
sep);
}
}
}
printf(" Aggregate (CPU) %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c Overhead: %.3f ms\n",
sep,
results.avgTotalBandwidthGbPerSec,
sep,
results.avgTotalDurationMsec,
sep,
results.totalBytesTransferred,
sep,
results.overheadMsec);
}
void CheckForError(ErrResult const& error)
{
switch (error.errType) {
case ERR_NONE: return;
case ERR_WARN: printf("[WARN] %s\n", error.errMsg.c_str()); return;
case ERR_FATAL: printf("[ERROR] %s\n", error.errMsg.c_str()); exit(1);
default: break;
}
}
void PrintErrors(std::vector<ErrResult> const& errors)
{
bool isFatal = false;
for (auto const& err : errors) {
printf("[%s] %s\n", err.errType == ERR_FATAL ? "ERROR" : "WARN", err.errMsg.c_str());
isFatal |= (err.errType == ERR_FATAL);
}
if (isFatal) { exit(1); }
}
auto GetClientVersion() -> const std::string
{
static constexpr auto TB_UNKNOWN_CLIENT_VERSION = std::string_view("Unknown");
auto tb_client_version = std::string(TRANSFERBENCH_CLIENT_VERSION);
if (tb_client_version.empty()) { tb_client_version = std::string(TB_UNKNOWN_CLIENT_VERSION); }
return tb_client_version;
}
# MIT License
#
# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
#
#
# Required Includes
include(FetchContent)
#
# Note: All function definitions go here
function(get_rocm_install_path rocm_install_base_path)
if(NOT DEFINED ROCM_INSTALL_PATH_FOR_BUILD)
message(STATUS ">> Checking ROCm install path settings...")
set(TMP_ROCM_INSTALL_PATH "")
if(DEFINED ENV{ROCM_PATH} OR DEFINED ROCM_PATH)
if(DEFINED ENV{ROCM_PATH})
message(STATUS " >> Environment variable ROCM_PATH: '$ENV{ROCM_PATH}'")
set(TMP_ROCM_INSTALL_PATH "$ENV{ROCM_PATH}")
endif()
if(DEFINED ROCM_PATH)
message(STATUS " >> CMake variable ROCM_PATH: '${ROCM_PATH}'")
set(TMP_ROCM_INSTALL_PATH "${ROCM_PATH}")
endif()
elseif(DEFINED ENV{ROCM_INSTALL_PATH} OR DEFINED ROCM_INSTALL_PATH)
if(DEFINED ENV{ROCM_INSTALL_PATH})
message(STATUS " >> Environment variable ROCM_INSTALL_PATH: '$ENV{ROCM_INSTALL_PATH}'")
set(TMP_ROCM_INSTALL_PATH "$ENV{ROCM_PATH}")
endif()
if(DEFINED ROCM_INSTALL_PATH)
message(STATUS " >> CMake variable ROCM_INSTALL_PATH: '${ROCM_INSTALL_PATH}'")
set(TMP_ROCM_INSTALL_PATH "${ROCM_INSTALL_PATH}")
endif()
else()
set(TMP_ROCM_INSTALL_PATH "/opt/rocm")
message(STATUS " >> Using default ROCm install path: '${TMP_ROCM_INSTALL_PATH}'")
endif()
set(ROCM_INSTALL_PATH_FOR_BUILD "${TMP_ROCM_INSTALL_PATH}" CACHE STRING "ROCm install directory for build" FORCE)
endif()
# Always hand the cached result back to the caller
set(${rocm_install_base_path} "${ROCM_INSTALL_PATH_FOR_BUILD}" PARENT_SCOPE)
endfunction()
function(setup_build_version version_num version_text)
set(TARGET_VERSION_FILE "${CMAKE_CURRENT_SOURCE_DIR}/VERSION")
if(NOT EXISTS "${TARGET_VERSION_FILE}")
message(FATAL_ERROR " >> VERSION file not found at: '${TARGET_VERSION_FILE}' ...")
endif()
set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${TARGET_VERSION_FILE})
file(READ "${TARGET_VERSION_FILE}" file_version)
string(STRIP "${file_version}" file_version)
string(REPLACE ".wip" "" file_version_text "${file_version}")
string(REPLACE ".WIP" "" file_version_text "${file_version_text}")
set(${version_num} ${file_version} PARENT_SCOPE)
set(${version_text} ${file_version_text} PARENT_SCOPE)
endfunction()
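# Example (sketch): a VERSION file containing "1.60.00.wip" yields
# version_num = "1.60.00.wip" and version_text = "1.60.00".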
function(setup_rocm_requirements)
message(STATUS ">> Checking ROCm environment...")
get_rocm_install_path(ROCM_BASE_PATH)
#
#find_package(ROCM 0.8 REQUIRED PATHS ${ROCM_BASE_PATH})
find_package(ROCmCMakeBuildTools REQUIRED PATHS ${ROCM_BASE_PATH})
find_package(ROCM REQUIRED PATHS ${ROCM_BASE_PATH})
find_package(HSA-RUNTIME64 REQUIRED PATHS ${ROCM_BASE_PATH})
set(ROCM_WARN_TOOLCHAIN_VAR OFF CACHE BOOL "ROCM_WARN_TOOLCHAIN warnings disabled: 'OFF'")
set(ROCMCHECKS_WARN_TOOLCHAIN_VAR OFF CACHE BOOL "ROCMCHECKS_WARN_TOOLCHAIN_VAR warnings disabled: 'OFF'")
endfunction()
function(add_include_from_library target_name library_name)
get_target_property(LIBRARY_INCLUDE_DIRECTORIES ${library_name} INTERFACE_INCLUDE_DIRECTORIES)
target_include_directories(${target_name} PRIVATE ${LIBRARY_INCLUDE_DIRECTORIES})
endfunction()
function(add_source_definitions target_name definition_text)
set_property(SOURCE ${target_name} APPEND PROPERTY COMPILE_DEFINITIONS "${definition_text}")
endfunction()
function(build_transferbench_engine)
include(ROCMInstallTargets)
include(ROCMCreatePackage)
endfunction()
function(has_build_debug_mode debug_mode_result)
if(NOT DEFINED IS_BUILD_DEBUG_MSG_MODE_ENABLED)
if(AMD_APP_DEBUG_BUILD_INFO OR
("$ENV{AMD_APP_DEBUG_BUILD_INFO}" STREQUAL "ON") OR
("$ENV{AMD_APP_DEBUG_BUILD_INFO}" STREQUAL "1") OR
(DEFINED BUILD_DEBUG_MSG_MODE AND (BUILD_DEBUG_MSG_MODE STREQUAL "ON")))
set(IS_BUILD_DEBUG_MSG_MODE_ENABLED TRUE)
set(IS_BUILD_DEBUG_MSG_MODE_ENABLED TRUE PARENT_SCOPE)
set(${debug_mode_result} TRUE PARENT_SCOPE)
else()
set(IS_BUILD_DEBUG_MSG_MODE_ENABLED FALSE)
set(IS_BUILD_DEBUG_MSG_MODE_ENABLED FALSE PARENT_SCOPE)
set(${debug_mode_result} FALSE PARENT_SCOPE)
endif()
else()
if(IS_BUILD_DEBUG_MSG_MODE_ENABLED)
set(${debug_mode_result} TRUE PARENT_SCOPE)
else()
set(${debug_mode_result} FALSE PARENT_SCOPE)
endif()
endif()
endfunction()
function(get_target target_name target_type)
get_target_property(IMPORTED_TARGET ${target_name} IMPORTED)
if(IMPORTED_TARGET)
set(${target_type} INTERFACE PARENT_SCOPE)
else()
set(${target_type} PRIVATE PARENT_SCOPE)
endif()
endfunction()
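# Example (sketch): for an IMPORTED target such as hsa-runtime64::hsa-runtime64,
# get_target() reports INTERFACE, so flags are attached as usage requirements
# rather than as private compile options.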
function(add_c_flag)
if (ARGC EQUAL 1)
add_compile_options($<$<COMPILE_LANGUAGE:C>:${ARGV0}>)
elseif(ARGC EQUAL 2)
get_target(${ARGV1} TYPE)
target_compile_options(${ARGV1} ${TYPE} $<$<COMPILE_LANGUAGE:C>:${ARGV0}>)
endif()
endfunction()
function(add_cxx_flag)
if (ARGC EQUAL 1)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:${ARGV0}>)
elseif(ARGC EQUAL 2)
get_target(${ARGV1} TYPE)
target_compile_options(${ARGV1} ${TYPE} $<$<COMPILE_LANGUAGE:CXX>:${ARGV0}>)
endif()
endfunction()
function(add_linker_flag)
if (ARGC EQUAL 1)
add_link_options(${ARGV0})
elseif(ARGC EQUAL 2)
get_target(${ARGV1} TYPE)
target_link_options(${ARGV1} ${TYPE} ${ARGV0})
endif()
endfunction()
function(add_c_cxx_flag)
add_c_flag(${ARGV0} ${ARGV1})
add_cxx_flag(${ARGV0} ${ARGV1})
endfunction()
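# Note: add_common_flag below is intentionally identical to add_c_cxx_flag;
# both names are used at call sites further down.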
function(add_common_flag)
add_c_flag(${ARGV0} ${ARGV1})
add_cxx_flag(${ARGV0} ${ARGV1})
endfunction()
function(add_cppcheck target_name)
if(NOT TRANSFERBENCH_ENABLE_CPPCHECK_WARNINGS)
return()
endif()
# Not REQUIRED: fall through to the graceful skip below when cppcheck is missing
find_program(CPPCHECK_EXECUTABLE NAMES cppcheck)
if(NOT CPPCHECK_EXECUTABLE)
message(WARNING ">> Skipping 'cppcheck' target for: ${target_name}. Could not find 'Cppcheck' ...")
return()
endif()
set(CPPCHECK_CONFIG_FILE "cppcheck_static_supp.config")
set(CPPCHECK_REPORT_FILE "cppcheck_report.txt")
set(TARGET_BUILD_DIRECTORY $<TARGET_FILE_DIR:${target_name}>)
set(CPPCHECK_OPTION_LIST
--enable=all
--quiet
--std=c++${CMAKE_CXX_STANDARD}
--inline-suppr
--check-level=exhaustive
--error-exitcode=10
--suppressions-list=${CMAKE_SOURCE_DIR}/dist/${CPPCHECK_CONFIG_FILE}
--checkers-report=${TARGET_BUILD_DIRECTORY}/${CPPCHECK_REPORT_FILE}
)
set_target_properties(${target_name}
PROPERTIES
CXX_CPPCHECK "${CPPCHECK_EXECUTABLE};${CPPCHECK_OPTION_LIST}"
)
has_build_debug_mode(HAS_DEBUG_MODE_ENABLED)
if(HAS_DEBUG_MODE_ENABLED)
developer_status_message("DEVEL" ">> CppCheck settings for: '${target_name}' ...")
developer_status_message("DEVEL" " >> Target Build Directory: '${TARGET_BUILD_DIRECTORY}' ")
developer_status_message("DEVEL" " >> Cpp std: 'c++${CMAKE_CXX_STANDARD}' ")
developer_status_message("DEVEL" " >> suppressions-list: '${CMAKE_SOURCE_DIR}/dist/${CPPCHECK_CONFIG_FILE}' ")
developer_status_message("DEVEL" " >> checkers-report: ${TARGET_BUILD_DIRECTORY}/${CPPCHECK_REPORT_FILE}' ")
developer_status_message("DEVEL" " >> CppCheck located at: '${CPPCHECK_EXECUTABLE}' ")
developer_status_message("DEVEL" " >> CppCheck options: '${CPPCHECK_OPTION_LIST}' ")
endif()
endfunction()
function(check_compiler_requirements component_name)
## We need to make sure we have C++ enabled, or we get errors like:
## 'check_compiler_flag: CXX: needs to be enabled before use'
get_property(project_enabled_languages GLOBAL PROPERTY ENABLED_LANGUAGES)
if(NOT project_enabled_languages OR NOT "CXX" IN_LIST project_enabled_languages)
enable_language(CXX)
endif()
## Check if we are able to use Lightning (Clang++) as default compiler
## Note: If this condition is met, we used rocm_clang_toolchain.cmake and the toolchain was already
## checked and set up.
if(NOT IS_LIGHTNING_CLANG_DEFAULT_COMPILER AND NOT ROCM_CLANG_TOOLCHAIN_USED)
message(FATAL_ERROR ">> ROCm 'Lightning Clang++' Toolchain: was not set (rocm_clang_toolchain.cmake) ...")
endif()
## Check if the compiler is compatible with the C++ standard.
## Note: Minimum required is ${CMAKE_CXX_STANDARD} = 20, but we check for 23, 20, and 17.
if(NOT DEFINED IS_COMPILER_SUPPORTS_CXX23_STANDARD OR NOT DEFINED IS_COMPILER_SUPPORTS_CXX20_STANDARD OR NOT DEFINED IS_COMPILER_SUPPORTS_CXX17_STANDARD)
include(CheckCXXCompilerFlag)
message(STATUS ">> Checking Compiler: '${CMAKE_CXX_COMPILER}' for C++ standard ...")
## Just to have independent checks/variables
set(CHECK_CMAKE_CXX_STANDARD 23)
if(NOT DEFINED IS_COMPILER_SUPPORTS_CXX23_STANDARD)
set(IS_COMPILER_SUPPORTS_CHECK "IS_COMPILER_SUPPORTS_CXX${CHECK_CMAKE_CXX_STANDARD}_STANDARD")
check_cxx_compiler_flag("-std=c++${CHECK_CMAKE_CXX_STANDARD}" COMPILER_SUPPORTS_CXX23_STANDARD)
if(COMPILER_SUPPORTS_CXX23_STANDARD)
set(${IS_COMPILER_SUPPORTS_CHECK} TRUE)
set(${IS_COMPILER_SUPPORTS_CHECK} TRUE PARENT_SCOPE)
developer_status_message("DEVEL" " >> Compiler: ${CMAKE_CXX_COMPILER} supports CXX Standard '${CHECK_CMAKE_CXX_STANDARD}' ...")
else()
set(${IS_COMPILER_SUPPORTS_CHECK} FALSE)
set(${IS_COMPILER_SUPPORTS_CHECK} FALSE PARENT_SCOPE)
endif()
endif()
set(CHECK_CMAKE_CXX_STANDARD 20)
if(NOT DEFINED IS_COMPILER_SUPPORTS_CXX20_STANDARD)
set(IS_COMPILER_SUPPORTS_CHECK "IS_COMPILER_SUPPORTS_CXX${CHECK_CMAKE_CXX_STANDARD}_STANDARD")
check_cxx_compiler_flag("-std=c++${CHECK_CMAKE_CXX_STANDARD}" COMPILER_SUPPORTS_CXX20_STANDARD)
if(COMPILER_SUPPORTS_CXX20_STANDARD)
set(${IS_COMPILER_SUPPORTS_CHECK} TRUE)
set(${IS_COMPILER_SUPPORTS_CHECK} TRUE PARENT_SCOPE)
developer_status_message("DEVEL" " >> Compiler: ${CMAKE_CXX_COMPILER} supports CXX Standard '${CHECK_CMAKE_CXX_STANDARD}' ...")
else()
set(${IS_COMPILER_SUPPORTS_CHECK} FALSE)
set(${IS_COMPILER_SUPPORTS_CHECK} FALSE PARENT_SCOPE)
endif()
endif()
set(CHECK_CMAKE_CXX_STANDARD 17)
if(NOT DEFINED IS_COMPILER_SUPPORTS_CXX17_STANDARD)
set(IS_COMPILER_SUPPORTS_CHECK "IS_COMPILER_SUPPORTS_CXX${CHECK_CMAKE_CXX_STANDARD}_STANDARD")
check_cxx_compiler_flag("-std=c++${CHECK_CMAKE_CXX_STANDARD}" COMPILER_SUPPORTS_CXX17_STANDARD)
if(COMPILER_SUPPORTS_CXX17_STANDARD)
set(${IS_COMPILER_SUPPORTS_CHECK} TRUE)
set(${IS_COMPILER_SUPPORTS_CHECK} TRUE PARENT_SCOPE)
developer_status_message("DEVEL" " >> Compiler: ${CMAKE_CXX_COMPILER} supports CXX Standard '${CHECK_CMAKE_CXX_STANDARD}' ...")
else()
set(${IS_COMPILER_SUPPORTS_CHECK} FALSE)
set(${IS_COMPILER_SUPPORTS_CHECK} FALSE PARENT_SCOPE)
endif()
endif()
endif()
## Does it support the project C++ standard, ${CMAKE_CXX_STANDARD} = 20?
set(IS_COMPILER_SUPPORTS_MIN_STANDARD "${IS_COMPILER_SUPPORTS_CXX${CMAKE_CXX_STANDARD}_STANDARD}")
if(NOT IS_COMPILER_SUPPORTS_MIN_STANDARD)
message(FATAL_ERROR ">> Compiler: '${CMAKE_CXX_COMPILER}' v'${CMAKE_CXX_COMPILER_VERSION}' doesn't support CXX Standard '${CMAKE_CXX_STANDARD}'! \n"
" >> Project: '${${component_name}}' can't be built ...")
else()
message(STATUS ">> Compiler: '${CMAKE_CXX_COMPILER}' v'${CMAKE_CXX_COMPILER_VERSION}' supports the required CXX Standard '${CMAKE_CXX_STANDARD}' ...")
endif()
endfunction()
#
# Note: All macro definitions here
macro(set_variable_in_parent variable value)
get_directory_property(has_parent PARENT_DIRECTORY)
if(has_parent)
set(${variable} "${value}" PARENT_SCOPE)
else()
set(${variable} "${value}")
endif()
endmacro()
macro(setup_cmake target_name target_version)
message(STATUS ">> Building ${${target_name}} v${${target_version}} ...")
# If building shared libraries or linking static libraries into shared ones
if(TRANSFERBENCH_ENGINE_SHARED)
set(CMAKE_POSITION_INDEPENDENT_CODE ON CACHE BOOL "Set position independent code for all targets ..." FORCE)
endif()
message(STATUS ">> Configuring CMake to use the following build tools...")
check_compiler_requirements(${target_name})
#
find_program(CCACHE_PATH ccache)
find_program(NINJA_PATH ninja)
find_program(LD_LLD_PATH ld.lld)
find_program(LD_MOLD_PATH ld.mold)
if(NOT IS_LIGHTNING_CLANG_DEFAULT_COMPILER)
if(CCACHE_PATH)
set(CMAKE_C_COMPILER_LAUNCHER ${CCACHE_PATH})
set(CMAKE_CXX_COMPILER_LAUNCHER ${CCACHE_PATH})
else()
message(WARNING ">> CCache was not found!")
endif()
endif()
if(NINJA_PATH)
set(CMAKE_GENERATOR Ninja)
else()
message(WARNING ">> Ninja was not found! Using default generator.")
endif()
# Let's give priority to the MOLD linker
set(AMD_PROJECT_LINKER_OPTION "")
if(LD_MOLD_PATH AND TRANSFERBENCH_LINKER_TRY_MOLD)
set(CMAKE_LINKER ${LD_MOLD_PATH} CACHE STRING "Linker to use: ${LD_MOLD_PATH}")
set(AMD_PROJECT_LINKER_OPTION "-fuse-ld=mold")
# Then LLD linker
elseif(LD_LLD_PATH)
set(CMAKE_LINKER ${LD_LLD_PATH} CACHE STRING "Linker to use: ${LD_LLD_PATH}")
set(AMD_PROJECT_LINKER_OPTION "-fuse-ld=lld")
else()
message(WARNING ">> Neither mold nor lld was found! Using the system default linker.")
endif()
if((LD_MOLD_PATH OR LD_LLD_PATH) AND AMD_PROJECT_LINKER_OPTION)
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${AMD_PROJECT_LINKER_OPTION}")
message(STATUS ">> Using linker: '${CMAKE_LINKER}' with options: '${AMD_PROJECT_LINKER_OPTION}'")
endif()
# CMake policies for the project
foreach(_policy
CMP0028 CMP0046 CMP0048 CMP0051 CMP0054
CMP0056 CMP0063 CMP0065 CMP0074 CMP0075
CMP0077 CMP0082 CMP0093 CMP0127 CMP0135)
if(POLICY ${_policy})
cmake_policy(SET ${_policy} NEW)
endif()
endforeach()
set(CMAKE_WARN_DEPRECATED OFF CACHE BOOL "Disable deprecated warning messages" FORCE)
endmacro()
macro(add_build_definitions)
if(NOT PROJECT_TARGET_VERSION)
message(FATAL_ERROR ">> Project: 'PROJECT_TARGET_VERSION' was not defined!")
endif()
message(STATUS ">> Project: '${PROJECT_NAME}' v${${PROJECT_NAME}_VERSION} ...")
# Note: assumes AMD_PROJECT_VERSION_PATCH is defined alongside MAJOR/MINOR
set(CMAKE_RC_FLAGS "${CMAKE_RC_FLAGS} -DAMD_PROJECT_VERSION_MAJOR=${AMD_PROJECT_VERSION_MAJOR}
-DAMD_PROJECT_VERSION_MINOR=${AMD_PROJECT_VERSION_MINOR}
-DAMD_PROJECT_VERSION_PATCH=${AMD_PROJECT_VERSION_PATCH}")
if (TRANSFERBENCH_ENGINE_HEADER_ONLY)
add_compile_definitions(AMD_TRANSFERBENCH_ENGINE_HEADER_ONLY)
endif()
if (TRANSFERBENCH_ENGINE_STATIC)
add_compile_definitions(AMD_TRANSFERBENCH_ENGINE_STATIC)
endif()
if (TRANSFERBENCH_ENGINE_SHARED)
add_compile_definitions(AMD_TRANSFERBENCH_ENGINE_SHARED)
endif()
endmacro()
macro(setup_compiler_init_flags)
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag(-ftrivial-auto-var-init=zero HAS_TRIVIAL_AUTO_VAR_INIT_COMPILER)
if(NOT COMPILER_INIT_FLAG)
if(HAS_TRIVIAL_AUTO_VAR_INIT_COMPILER)
message(STATUS ">> Compiler supports -ftrivial-auto-var-init")
set(COMPILER_INIT_FLAG "-ftrivial-auto-var-init=zero" CACHE STRING "Using cache trivially-copyable automatic variable initialization.")
else()
message(STATUS ">> Compiler does not support -ftrivial-auto-var-init")
set(COMPILER_INIT_FLAG " " CACHE STRING "Using cache trivially-copyable automatic variable initialization.")
endif()
endif()
## Initialize automatic variables with either a pattern or with zeroes to increase program security by preventing
## uninitialized memory disclosure and use. '-ftrivial-auto-var-init=[uninitialized|pattern|zero]' where
## 'uninitialized' is the default, 'pattern' initializes variables with a pattern, and 'zero' initializes variables
## with zeroes.
set(AMD_WORK_BENCH_COMMON_FLAGS "${AMD_WORK_BENCH_COMMON_FLAGS} ${COMPILER_INIT_FLAG}")
endmacro()
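# Example of the effect (illustrative): with '-ftrivial-auto-var-init=zero',
# a local 'int x;' that is read before assignment observes 0 instead of
# indeterminate memory.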
macro(setup_compression_flags)
include(CheckCXXCompilerFlag)
include(CheckLinkerFlag)
check_cxx_compiler_flag(-gz=zstd ZSTD_AVAILABLE_COMPILER)
check_linker_flag(CXX -gz=zstd ZSTD_AVAILABLE_LINKER)
check_cxx_compiler_flag(-gz COMPRESS_AVAILABLE_COMPILER)
check_linker_flag(CXX -gz COMPRESS_AVAILABLE_LINKER)
# From cache
if(NOT DEBUG_COMPRESSION_FLAG)
if(ZSTD_AVAILABLE_COMPILER AND ZSTD_AVAILABLE_LINKER)
message(STATUS ">> Compiler and Linker support ZSTD... using it.")
set(DEBUG_COMPRESSION_FLAG "-gz=zstd" CACHE STRING "Using cache for debug info compression.")
elseif(COMPRESS_AVAILABLE_COMPILER AND COMPRESS_AVAILABLE_LINKER)
message(STATUS ">> Compiler and Linker support default compression... using it.")
set(DEBUG_COMPRESSION_FLAG "-gz" CACHE STRING "Using cache for debug info compression.")
endif()
endif()
set(AMD_WORK_BENCH_COMMON_FLAGS "${AMD_WORK_BENCH_COMMON_FLAGS} ${DEBUG_COMPRESSION_FLAG}")
endmacro()
macro(setup_default_compiler_flags target_name)
# Compiler specific flags
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
add_common_flag("-Wall" ${target_name})
add_common_flag("-Wextra" ${target_name})
add_common_flag("-Wno-unused-function" ${target_name})
add_common_flag("-Wno-unused-variable" ${target_name})
add_common_flag("-Wpedantic" ${target_name})
if(TRANSFERBENCH_TREAT_WARNINGS_AS_ERRORS)
add_common_flag("-Werror" ${target_name})
endif()
if(CMAKE_SYSTEM_NAME MATCHES "Linux" AND CMAKE_CXX_COMPILER_ID MATCHES "GNU")
add_common_flag("-rdynamic" ${target_name})
endif()
##
## -fno-omit-frame-pointer -fno-strict-aliasing -fvisibility=hidden -fvisibility-inlines-hidden
## -fno-exceptions -fno-rtti
add_cxx_flag("-fexceptions" ${target_name})
add_cxx_flag("-frtti" ${target_name})
add_cxx_flag("-fno-omit-frame-pointer" ${target_name})
add_c_cxx_flag("-Wno-array-bounds" ${target_name})
add_c_cxx_flag("-Wno-deprecated-declarations" ${target_name})
add_c_cxx_flag("-Wno-unknown-pragmas" ${target_name})
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
add_c_cxx_flag("-Wno-restrict" ${target_name})
add_c_cxx_flag("-Wno-stringop-overread" ${target_name})
add_c_cxx_flag("-Wno-stringop-overflow" ${target_name})
add_c_cxx_flag("-Wno-dangling-reference" ${target_name})
elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
add_c_cxx_flag("-Wno-unknown-warning-option" ${target_name})
endif()
if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
add_common_flag("-O1" ${target_name})
if(TRANSFERBENCH_HARDENING_ENABLED)
## Building with _FORTIFY_SOURCE=3 may impact the size and performance of the code. Since _FORTIFY_SOURCE=2
## generated only constant sizes, its overhead was negligible. However, _FORTIFY_SOURCE=3 may generate
## additional code to compute object sizes. These additions may also cause secondary effects, such as register
## pressure during code generation. Code size tends to increase the size of resultant binaries for the same reason.
##
## _FORTIFY_SOURCE=3 has led to significant gains in security mitigation, but it may not be suitable for all
## applications. A proper study of performance and code size is needed to understand the magnitude of the impact
## of the additional runtime code generation, though the security benefits may well justify that cost.
## _FORTIFY_SOURCE requires compiling with optimization (-O).
##
add_common_flag("-U_FORTIFY_SOURCE" ${target_name})
add_common_flag("-D_FORTIFY_SOURCE=2" ${target_name})
## Stack canary check for buffer overflows on the stack.
## Emit extra code to check for buffer overflows, such as stack smashing attacks. This is done by adding a guard
## variable to functions with vulnerable objects. This includes functions that call alloca, and functions with
## buffers larger than or equal to 8 bytes.
## 'stack-protector-strong' is a stronger version of 'stack-protector' that protects additional functions: those
## that have local array definitions, or have references to local frame addresses. Only variables that are
## actually allocated on the stack are considered; optimized-away variables or variables allocated in registers
## don't count.
##
add_common_flag("-fstack-protector-strong" ${target_name})
endif()
endif()
if(TRANSFERBENCH_COMPRESS_DEBUG_INFO)
setup_compression_flags()
endif()
## Compiler initialization flags
setup_compiler_init_flags()
## Full debug info for all build types except RelWithDebInfo (which keeps the default debug level)
if (NOT CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
add_c_cxx_flag("-g3" ${target_name})
endif()
## Inline-function debug info (GCC only)
if(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
add_c_cxx_flag("-ginline-points" ${target_name})
add_c_cxx_flag("-gstatement-frontiers" ${target_name})
endif()
endif()
endif()
## TODO: Check if RPATH settings are needed
endmacro()
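# Usage sketch (hypothetical target name):
#   add_executable(TransferBenchClient src/Client.cpp)
#   setup_default_compiler_flags(TransferBenchClient)
#   add_cppcheck(TransferBenchClient)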
macro(developer_status_message message_mode message)
# Note: This macro is used to print developer messages.
has_build_debug_mode(HAS_DEBUG_MODE_ENABLED)
if(HAS_DEBUG_MODE_ENABLED)
# Check for a valid message mode
# Note: 'DEVEL' is emitted as a 'STATUS' message; unsupported modes trigger a warning instead.
if(NOT "${message_mode}" MATCHES "^(STATUS|WARNING|ERROR|DEBUG|FATAL_ERROR|DEVEL)$")
message(WARNING "[DEVELOPER]: The '${message_mode}' message mode is not supported for message: '${message}' .")
else()
# message(${message_mode} ...) cannot be used directly: CMake would treat the expanded mode as part of the message string, so map each supported mode explicitly.
if("${message_mode}" STREQUAL "STATUS")
message(STATUS "[DEVELOPER]: ${message}")
elseif("${message_mode}" STREQUAL "WARNING")
message(WARNING "[DEVELOPER]: ${message}")
elseif("${message_mode}" STREQUAL "ERROR")
message(SEND_ERROR "[DEVELOPER]: ${message}")
elseif("${message_mode}" STREQUAL "DEBUG")
message(DEBUG "[DEVELOPER]: ${message}")
elseif("${message_mode}" STREQUAL "FATAL_ERROR")
message(FATAL_ERROR "[DEVELOPER]: ${message}")
elseif("${message_mode}" STREQUAL "DEVEL")
message(STATUS "[DEVELOPER]: ${message}")
else()
message(WARNING "[DEVELOPER]: ${message}, with invalid message mode: '${message_mode}'")
endif()
endif()
endif()
endmacro()
# MIT License
#
# Copyright (c) 2023-25 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
#
# Test dependencies
#==================================================================================================
include(FetchContent)
set(ROCM_WARN_TOOLCHAIN_VAR OFF CACHE BOOL "")
# Find or download/install rocm-cmake project
#==================================================================================================
find_package(ROCmCMakeBuildTools 0.11.0 CONFIG QUIET PATHS "${ROCM_PATH}")
if((NOT ROCmCMakeBuildTools_FOUND) OR INSTALL_DEPENDENCIES)
message(STATUS "ROCmCMakeBuildTools not found. Checking for ROCM (deprecated)")
find_package(ROCM 0.7.3 CONFIG QUIET PATHS "${ROCM_PATH}") # deprecated fallback
if((NOT ROCM_FOUND) OR INSTALL_DEPENDENCIES)
message(STATUS "ROCM (deprecated) not found. Downloading and building ROCmCMakeBuildTools")
set(PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern)
set(rocm_cmake_tag "rocm-6.4.0" CACHE STRING "rocm-cmake tag to download")
FetchContent_Declare(
rocm-cmake
GIT_REPOSITORY https://github.com/ROCm/rocm-cmake.git
GIT_TAG ${rocm_cmake_tag}
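# Deliberately invalid SOURCE_SUBDIR: keeps FetchContent_MakeAvailable() from
# calling add_subdirectory() on rocm-cmake (only the download step is wanted)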
SOURCE_SUBDIR "DISABLE ADDING TO BUILD"
)
FetchContent_MakeAvailable(rocm-cmake)
message(STATUS "rocm-cmake_SOURCE_DIR: ${rocm-cmake_SOURCE_DIR}")
find_package(ROCmCMakeBuildTools CONFIG REQUIRED NO_DEFAULT_PATH PATHS "${rocm-cmake_SOURCE_DIR}")
message(STATUS "Found ROCmCmakeBuildTools version: ${ROCmCMakeBuildTools_VERSION}")
endif()
elseif(ROCmCMakeBuildTools_FOUND)
message(STATUS "Found ROCmCmakeBuildTools version: ${ROCmCMakeBuildTools_VERSION}")
endif()
# Find available local ROCM targets
# NOTE: This will eventually be part of ROCm-CMake and should be removed at that time
#==================================================================================================
function(rocm_local_targets VARIABLE)
set(${VARIABLE} "NOTFOUND" PARENT_SCOPE)
find_program(_rocm_agent_enumerator rocm_agent_enumerator HINTS /opt/rocm/bin ENV ROCM_PATH)
if(NOT _rocm_agent_enumerator STREQUAL "_rocm_agent_enumerator-NOTFOUND")
execute_process(
COMMAND "${_rocm_agent_enumerator}"
RESULT_VARIABLE _found_agents
OUTPUT_VARIABLE _rocm_agents
ERROR_QUIET
)
if (_found_agents EQUAL 0)
string(REPLACE "\n" ";" _rocm_agents "${_rocm_agents}")
unset(result)
foreach (agent IN LISTS _rocm_agents)
if (NOT agent STREQUAL "gfx000")
list(APPEND result "${agent}")
endif()
endforeach()
if(result)
list(REMOVE_DUPLICATES result)
set(${VARIABLE} "${result}" PARENT_SCOPE)
endif()
endif()
endif()
endfunction()
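# Usage sketch (variable names are placeholders for whatever target-list
# variable the project consumes):
#   rocm_local_targets(DETECTED_GFX_TARGETS)
#   if(DETECTED_GFX_TARGETS)
#     set(GPU_TARGETS "${DETECTED_GFX_TARGETS}" CACHE STRING "GPU targets to compile for")
#   endif()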
include(ROCMSetupVersion)
include(ROCMCreatePackage)
include(ROCMInstallTargets)
include(ROCMPackageConfigHelpers)
include(ROCMInstallSymlinks)
include(ROCMCheckTargetIds)
include(ROCMClients)
include(ROCMHeaderWrapper)
# Copyright (c) 2020 - 2021 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
find_path(NUMA_INCLUDE_DIR numa.h)
find_library(NUMA_LIBRARIES numa)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NUMA
DEFAULT_MSG
NUMA_LIBRARIES NUMA_INCLUDE_DIR)
mark_as_advanced(NUMA_LIBRARIES NUMA_INCLUDE_DIR)
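# Usage sketch for this find module (target name is a placeholder):
#   find_package(NUMA REQUIRED)
#   target_include_directories(TransferBenchClient PRIVATE ${NUMA_INCLUDE_DIR})
#   target_link_libraries(TransferBenchClient PRIVATE ${NUMA_LIBRARIES})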
# Distributed under the OSI-approved BSD 3-Clause License. See accompanying
# file Copyright.txt or https://cmake.org/licensing for details.
#[=======================================================================[.rst:
FindPackageHandleStandardArgs
-----------------------------
This module provides functions intended to be used in :ref:`Find Modules`
implementing :command:`find_package(<PackageName>)` calls.
.. command:: find_package_handle_standard_args
This command handles the ``REQUIRED``, ``QUIET`` and version-related
arguments of :command:`find_package`. It also sets the
``<PackageName>_FOUND`` variable. The package is considered found if all
variables listed contain valid results, e.g. valid filepaths.
There are two signatures:
.. code-block:: cmake
find_package_handle_standard_args(<PackageName>
(DEFAULT_MSG|<custom-failure-message>)
<required-var>...
)
find_package_handle_standard_args(<PackageName>
[FOUND_VAR <result-var>]
[REQUIRED_VARS <required-var>...]
[VERSION_VAR <version-var>]
[HANDLE_VERSION_RANGE]
[HANDLE_COMPONENTS]
[CONFIG_MODE]
[NAME_MISMATCHED]
[REASON_FAILURE_MESSAGE <reason-failure-message>]
[FAIL_MESSAGE <custom-failure-message>]
)
The ``<PackageName>_FOUND`` variable will be set to ``TRUE`` if all
the variables ``<required-var>...`` are valid and any optional
constraints are satisfied, and ``FALSE`` otherwise. A success or
failure message may be displayed based on the results and on
whether the ``REQUIRED`` and/or ``QUIET`` option was given to
the :command:`find_package` call.
The options are:
``(DEFAULT_MSG|<custom-failure-message>)``
In the simple signature this specifies the failure message.
Use ``DEFAULT_MSG`` to ask for a default message to be computed
(recommended). Not valid in the full signature.
``FOUND_VAR <result-var>``
.. deprecated:: 3.3
Specifies either ``<PackageName>_FOUND`` or
``<PACKAGENAME>_FOUND`` as the result variable. This exists only
for compatibility with older versions of CMake and is now ignored.
Result variables of both names are always set for compatibility.
``REQUIRED_VARS <required-var>...``
Specify the variables which are required for this package.
These may be named in the generated failure message asking the
user to set the missing variable values. Therefore these should
typically be cache entries such as ``FOO_LIBRARY`` and not output
variables like ``FOO_LIBRARIES``.
.. versionchanged:: 3.18
If ``HANDLE_COMPONENTS`` is specified, this option can be omitted.
``VERSION_VAR <version-var>``
Specify the name of a variable that holds the version of the package
that has been found. This version will be checked against the
(potentially) specified required version given to the
:command:`find_package` call, including its ``EXACT`` option.
The default messages include information about the required
version and the version which has been actually found, both
if the version is ok or not.
``HANDLE_VERSION_RANGE``
.. versionadded:: 3.19
Enable handling of a version range, if one is specified. Without this
option, a developer warning will be displayed if a version range is
specified.
``HANDLE_COMPONENTS``
Enable handling of package components. In this case, the command
will report which components have been found and which are missing,
and the ``<PackageName>_FOUND`` variable will be set to ``FALSE``
if any of the required components (i.e. not the ones listed after
the ``OPTIONAL_COMPONENTS`` option of :command:`find_package`) are
missing.
``CONFIG_MODE``
Specify that the calling find module is a wrapper around a
call to ``find_package(<PackageName> NO_MODULE)``. This implies
a ``VERSION_VAR`` value of ``<PackageName>_VERSION``. The command
will automatically check whether the package configuration file
was found.
``REASON_FAILURE_MESSAGE <reason-failure-message>``
.. versionadded:: 3.16
Specify a custom message of the reason for the failure which will be
appended to the default generated message.
``FAIL_MESSAGE <custom-failure-message>``
Specify a custom failure message instead of using the default
generated message. Not recommended.
``NAME_MISMATCHED``
.. versionadded:: 3.17
Indicate that the ``<PackageName>`` does not match
``${CMAKE_FIND_PACKAGE_NAME}``. This is usually a mistake and raises a
warning, but it may be intentional for usage of the command for components
of a larger package.
Example for the simple signature:
.. code-block:: cmake
find_package_handle_standard_args(LibXml2 DEFAULT_MSG
LIBXML2_LIBRARY LIBXML2_INCLUDE_DIR)
The ``LibXml2`` package is considered to be found if both
``LIBXML2_LIBRARY`` and ``LIBXML2_INCLUDE_DIR`` are valid.
Then also ``LibXml2_FOUND`` is set to ``TRUE``. If it is not found
and ``REQUIRED`` was used, it fails with a
:command:`message(FATAL_ERROR)`, independent whether ``QUIET`` was
used or not. If it is found, success will be reported, including
the content of the first ``<required-var>``. On repeated CMake runs,
the same message will not be printed again.
.. note::
If ``<PackageName>`` does not match ``CMAKE_FIND_PACKAGE_NAME`` for the
calling module, a warning that there is a mismatch is given. The
``FPHSA_NAME_MISMATCHED`` variable may be set to bypass the warning if using
the old signature and the ``NAME_MISMATCHED`` argument using the new
signature. To avoid forcing the caller to require newer versions of CMake for
usage, the variable's value will be used if defined when the
``NAME_MISMATCHED`` argument is not passed for the new signature (but using
both is an error).
Example for the full signature:
.. code-block:: cmake
find_package_handle_standard_args(LibArchive
REQUIRED_VARS LibArchive_LIBRARY LibArchive_INCLUDE_DIR
VERSION_VAR LibArchive_VERSION)
In this case, the ``LibArchive`` package is considered to be found if
both ``LibArchive_LIBRARY`` and ``LibArchive_INCLUDE_DIR`` are valid.
Also the version of ``LibArchive`` will be checked by using the version
contained in ``LibArchive_VERSION``. Since no ``FAIL_MESSAGE`` is given,
the default messages will be printed.
Another example for the full signature:
.. code-block:: cmake
find_package(Automoc4 QUIET NO_MODULE HINTS /opt/automoc4)
find_package_handle_standard_args(Automoc4 CONFIG_MODE)
In this case, a ``FindAutomoc4.cmake`` module wraps a call to
``find_package(Automoc4 NO_MODULE)`` and adds an additional search
directory for ``automoc4``. Then the call to
``find_package_handle_standard_args`` produces a proper success/failure
message.
.. command:: find_package_check_version
.. versionadded:: 3.19
Helper function which can be used to check if a ``<version>`` is valid
against version-related arguments of :command:`find_package`.
.. code-block:: cmake
find_package_check_version(<version> <result-var>
[HANDLE_VERSION_RANGE]
[RESULT_MESSAGE_VARIABLE <message-var>]
)
The ``<result-var>`` will hold a boolean value giving the result of the check.
The options are:
``HANDLE_VERSION_RANGE``
Enable handling of a version range, if one is specified. Without this
option, a developer warning will be displayed if a version range is
specified.
``RESULT_MESSAGE_VARIABLE <message-var>``
Specify a variable to get back a message describing the result of the check.
Example for the usage:
.. code-block:: cmake
find_package_check_version(1.2.3 result HANDLE_VERSION_RANGE
RESULT_MESSAGE_VARIABLE reason)
if (result)
message (STATUS "${reason}")
else()
message (FATAL_ERROR "${reason}")
endif()
#]=======================================================================]
include(${CMAKE_CURRENT_LIST_DIR}/FindPackageMessage.cmake)
cmake_policy(PUSH)
# numbers and boolean constants
cmake_policy (SET CMP0012 NEW)
# IN_LIST operator
cmake_policy (SET CMP0057 NEW)
# internal helper macro
macro(_FPHSA_FAILURE_MESSAGE _msg)
set (__msg "${_msg}")
if (FPHSA_REASON_FAILURE_MESSAGE)
string(APPEND __msg "\n Reason given by package: ${FPHSA_REASON_FAILURE_MESSAGE}\n")
endif()
if (${_NAME}_FIND_REQUIRED)
message(FATAL_ERROR "${__msg}")
else ()
if (NOT ${_NAME}_FIND_QUIETLY)
message(STATUS "${__msg}")
endif ()
endif ()
endmacro()
# internal helper macro to generate the failure message when used in CONFIG_MODE:
macro(_FPHSA_HANDLE_FAILURE_CONFIG_MODE)
# <PackageName>_CONFIG is set, but FOUND is false, this means that some other of the REQUIRED_VARS was not found:
if(${_NAME}_CONFIG)
_FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE}: missing:${MISSING_VARS} (found ${${_NAME}_CONFIG} ${VERSION_MSG})")
else()
# If _CONSIDERED_CONFIGS is set, the config-file has been found, but no suitable version.
# List them all in the error message:
if(${_NAME}_CONSIDERED_CONFIGS)
set(configsText "")
list(LENGTH ${_NAME}_CONSIDERED_CONFIGS configsCount)
math(EXPR configsCount "${configsCount} - 1")
foreach(currentConfigIndex RANGE ${configsCount})
list(GET ${_NAME}_CONSIDERED_CONFIGS ${currentConfigIndex} filename)
list(GET ${_NAME}_CONSIDERED_VERSIONS ${currentConfigIndex} version)
string(APPEND configsText "\n ${filename} (version ${version})")
endforeach()
if (${_NAME}_NOT_FOUND_MESSAGE)
if (FPHSA_REASON_FAILURE_MESSAGE)
string(PREPEND FPHSA_REASON_FAILURE_MESSAGE "${${_NAME}_NOT_FOUND_MESSAGE}\n ")
else()
set(FPHSA_REASON_FAILURE_MESSAGE "${${_NAME}_NOT_FOUND_MESSAGE}")
endif()
else()
string(APPEND configsText "\n")
endif()
_FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE} ${VERSION_MSG}, checked the following files:${configsText}")
else()
# Simple case: No Config-file was found at all:
_FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE}: found neither ${_NAME}Config.cmake nor ${_NAME_LOWER}-config.cmake ${VERSION_MSG}")
endif()
endif()
endmacro()
function(FIND_PACKAGE_CHECK_VERSION version result)
cmake_parse_arguments (PARSE_ARGV 2 FPCV "HANDLE_VERSION_RANGE;NO_AUTHOR_WARNING_VERSION_RANGE" "RESULT_MESSAGE_VARIABLE" "")
if (FPCV_UNPARSED_ARGUMENTS)
message (FATAL_ERROR "find_package_check_version(): ${FPCV_UNPARSED_ARGUMENTS}: unexpected arguments")
endif()
if ("RESULT_MESSAGE_VARIABLE" IN_LIST FPCV_KEYWORDS_MISSING_VALUES)
message (FATAL_ERROR "find_package_check_version(): RESULT_MESSAGE_VARIABLE expects an argument")
endif()
set (${result} FALSE PARENT_SCOPE)
if (FPCV_RESULT_MESSAGE_VARIABLE)
unset (${FPCV_RESULT_MESSAGE_VARIABLE} PARENT_SCOPE)
endif()
if (_CMAKE_FPHSA_PACKAGE_NAME)
set (package "${_CMAKE_FPHSA_PACKAGE_NAME}")
elseif (CMAKE_FIND_PACKAGE_NAME)
set (package "${CMAKE_FIND_PACKAGE_NAME}")
else()
message (FATAL_ERROR "find_package_check_version(): Cannot be used outside a 'Find Module'")
endif()
if (NOT FPCV_NO_AUTHOR_WARNING_VERSION_RANGE
AND ${package}_FIND_VERSION_RANGE AND NOT FPCV_HANDLE_VERSION_RANGE)
message(AUTHOR_WARNING
"`find_package()` specify a version range but the option "
"HANDLE_VERSION_RANGE` is not passed to `find_package_check_version()`. "
"Only the lower endpoint of the range will be used.")
endif()
set (version_ok FALSE)
unset (version_msg)
if (FPCV_HANDLE_VERSION_RANGE AND ${package}_FIND_VERSION_RANGE)
if ((${package}_FIND_VERSION_RANGE_MIN STREQUAL "INCLUDE"
AND version VERSION_GREATER_EQUAL ${package}_FIND_VERSION_MIN)
AND ((${package}_FIND_VERSION_RANGE_MAX STREQUAL "INCLUDE"
AND version VERSION_LESS_EQUAL ${package}_FIND_VERSION_MAX)
OR (${package}_FIND_VERSION_RANGE_MAX STREQUAL "EXCLUDE"
AND version VERSION_LESS ${package}_FIND_VERSION_MAX)))
set (version_ok TRUE)
set(version_msg "(found suitable version \"${version}\", required range is \"${${package}_FIND_VERSION_RANGE}\")")
else()
set(version_msg "Found unsuitable version \"${version}\", required range is \"${${package}_FIND_VERSION_RANGE}\"")
endif()
elseif (DEFINED ${package}_FIND_VERSION)
if(${package}_FIND_VERSION_EXACT) # exact version required
# count the dots in the version string
string(REGEX REPLACE "[^.]" "" version_dots "${version}")
# append one dot so the dot count equals the number of components
string(LENGTH "${version_dots}." version_dots)
if (version_dots GREATER ${package}_FIND_VERSION_COUNT)
# Because of the C++ implementation of find_package() ${package}_FIND_VERSION_COUNT
# is at most 4 here. Therefore a simple lookup table is used.
if (${package}_FIND_VERSION_COUNT EQUAL 1)
set(version_regex "[^.]*")
elseif (${package}_FIND_VERSION_COUNT EQUAL 2)
set(version_regex "[^.]*\\.[^.]*")
elseif (${package}_FIND_VERSION_COUNT EQUAL 3)
set(version_regex "[^.]*\\.[^.]*\\.[^.]*")
else()
set(version_regex "[^.]*\\.[^.]*\\.[^.]*\\.[^.]*")
endif()
string(REGEX REPLACE "^(${version_regex})\\..*" "\\1" version_head "${version}")
if (NOT ${package}_FIND_VERSION VERSION_EQUAL version_head)
set(version_msg "Found unsuitable version \"${version}\", but required is exact version \"${${package}_FIND_VERSION}\"")
else ()
set(version_ok TRUE)
set(version_msg "(found suitable exact version \"${version}\")")
endif ()
else ()
if (NOT ${package}_FIND_VERSION VERSION_EQUAL version)
set(version_msg "Found unsuitable version \"${version}\", but required is exact version \"${${package}_FIND_VERSION}\"")
else ()
set(version_ok TRUE)
set(version_msg "(found suitable exact version \"${version}\")")
endif ()
endif ()
else() # minimum version
if (${package}_FIND_VERSION VERSION_GREATER version)
set(version_msg "Found unsuitable version \"${version}\", but required is at least \"${${package}_FIND_VERSION}\"")
else()
set(version_ok TRUE)
set(version_msg "(found suitable version \"${version}\", minimum required is \"${${package}_FIND_VERSION}\")")
endif()
endif()
else ()
set(version_ok TRUE)
set(version_msg "(found version \"${version}\")")
endif()
set (${result} ${version_ok} PARENT_SCOPE)
if (FPCV_RESULT_MESSAGE_VARIABLE)
set (${FPCV_RESULT_MESSAGE_VARIABLE} "${version_msg}" PARENT_SCOPE)
endif()
endfunction()
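# Example (illustrative sketch, not part of this module): a Find module that has
# stored a detected version in `Foo_VERSION` (`Foo` is a hypothetical package)
# could validate it against the version requested by `find_package()` like this:
#
#   find_package_check_version("${Foo_VERSION}" Foo_VERSION_OK
#     HANDLE_VERSION_RANGE
#     RESULT_MESSAGE_VARIABLE Foo_VERSION_MSG)
#   if(NOT Foo_VERSION_OK)
#     message(STATUS "${Foo_VERSION_MSG}")
#   endif()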
function(FIND_PACKAGE_HANDLE_STANDARD_ARGS _NAME _FIRST_ARG)
# Set up the arguments for `cmake_parse_arguments`.
set(options CONFIG_MODE HANDLE_COMPONENTS NAME_MISMATCHED HANDLE_VERSION_RANGE)
set(oneValueArgs FAIL_MESSAGE REASON_FAILURE_MESSAGE VERSION_VAR FOUND_VAR)
set(multiValueArgs REQUIRED_VARS)
# Check whether we are in 'simple' or 'extended' mode:
set(_KEYWORDS_FOR_EXTENDED_MODE ${options} ${oneValueArgs} ${multiValueArgs} )
list(FIND _KEYWORDS_FOR_EXTENDED_MODE "${_FIRST_ARG}" INDEX)
unset(FPHSA_NAME_MISMATCHED_override)
if (DEFINED FPHSA_NAME_MISMATCHED)
# If the NAME_MISMATCHED variable is set, error if it is also passed as an
# argument. The former is for old signatures, the latter is for new
# signatures.
list(FIND ARGN "NAME_MISMATCHED" name_mismatched_idx)
if (NOT name_mismatched_idx EQUAL "-1")
message(FATAL_ERROR
"The `NAME_MISMATCHED` argument may only be specified by the argument or "
"the variable, not both.")
endif ()
# But use the variable if it is not an argument to avoid forcing minimum
# CMake version bumps for calling modules.
set(FPHSA_NAME_MISMATCHED_override "${FPHSA_NAME_MISMATCHED}")
endif ()
if(${INDEX} EQUAL -1)
set(FPHSA_FAIL_MESSAGE ${_FIRST_ARG})
set(FPHSA_REQUIRED_VARS ${ARGN})
set(FPHSA_VERSION_VAR)
else()
cmake_parse_arguments(FPHSA "${options}" "${oneValueArgs}" "${multiValueArgs}" ${_FIRST_ARG} ${ARGN})
if(FPHSA_UNPARSED_ARGUMENTS)
message(FATAL_ERROR "Unknown keywords given to FIND_PACKAGE_HANDLE_STANDARD_ARGS(): \"${FPHSA_UNPARSED_ARGUMENTS}\"")
endif()
if(NOT FPHSA_FAIL_MESSAGE)
set(FPHSA_FAIL_MESSAGE "DEFAULT_MSG")
endif()
# In config-mode, we rely on the variable <PackageName>_CONFIG, which is set by find_package()
# when it successfully found the config-file, including version checking:
if(FPHSA_CONFIG_MODE)
list(INSERT FPHSA_REQUIRED_VARS 0 ${_NAME}_CONFIG)
list(REMOVE_DUPLICATES FPHSA_REQUIRED_VARS)
set(FPHSA_VERSION_VAR ${_NAME}_VERSION)
endif()
if(NOT FPHSA_REQUIRED_VARS AND NOT FPHSA_HANDLE_COMPONENTS)
message(FATAL_ERROR "No REQUIRED_VARS specified for FIND_PACKAGE_HANDLE_STANDARD_ARGS()")
endif()
endif()
if (DEFINED FPHSA_NAME_MISMATCHED_override)
set(FPHSA_NAME_MISMATCHED "${FPHSA_NAME_MISMATCHED_override}")
endif ()
if (DEFINED CMAKE_FIND_PACKAGE_NAME
AND NOT FPHSA_NAME_MISMATCHED
AND NOT _NAME STREQUAL CMAKE_FIND_PACKAGE_NAME)
message(AUTHOR_WARNING
"The package name passed to `find_package_handle_standard_args` "
"(${_NAME}) does not match the name of the calling package "
"(${CMAKE_FIND_PACKAGE_NAME}). This can lead to problems in calling "
"code that expects `find_package` result variables (e.g., `_FOUND`) "
"to follow a certain pattern.")
endif ()
if (${_NAME}_FIND_VERSION_RANGE AND NOT FPHSA_HANDLE_VERSION_RANGE)
message(AUTHOR_WARNING
"`find_package()` specifies a version range but the module ${_NAME} does "
"not support this capability. Only the lower endpoint of the range "
"will be used.")
endif()
# to propagate package name to FIND_PACKAGE_CHECK_VERSION
set(_CMAKE_FPHSA_PACKAGE_NAME "${_NAME}")
# now that we collected all arguments, process them
if("x${FPHSA_FAIL_MESSAGE}" STREQUAL "xDEFAULT_MSG")
set(FPHSA_FAIL_MESSAGE "Could NOT find ${_NAME}")
endif()
if (FPHSA_REQUIRED_VARS)
list(GET FPHSA_REQUIRED_VARS 0 _FIRST_REQUIRED_VAR)
endif()
string(TOUPPER ${_NAME} _NAME_UPPER)
string(TOLOWER ${_NAME} _NAME_LOWER)
if(FPHSA_FOUND_VAR)
set(_FOUND_VAR_UPPER ${_NAME_UPPER}_FOUND)
set(_FOUND_VAR_MIXED ${_NAME}_FOUND)
if(FPHSA_FOUND_VAR STREQUAL _FOUND_VAR_MIXED OR FPHSA_FOUND_VAR STREQUAL _FOUND_VAR_UPPER)
set(_FOUND_VAR ${FPHSA_FOUND_VAR})
else()
message(FATAL_ERROR "The argument for FOUND_VAR is \"${FPHSA_FOUND_VAR}\", but only \"${_FOUND_VAR_MIXED}\" and \"${_FOUND_VAR_UPPER}\" are valid names.")
endif()
else()
set(_FOUND_VAR ${_NAME_UPPER}_FOUND)
endif()
# collect all variables which were not found, so they can be printed, so the
# user knows better what went wrong (#6375)
set(MISSING_VARS "")
set(DETAILS "")
# check if all passed variables are valid
set(FPHSA_FOUND_${_NAME} TRUE)
foreach(_CURRENT_VAR ${FPHSA_REQUIRED_VARS})
if(NOT ${_CURRENT_VAR})
set(FPHSA_FOUND_${_NAME} FALSE)
string(APPEND MISSING_VARS " ${_CURRENT_VAR}")
else()
string(APPEND DETAILS "[${${_CURRENT_VAR}}]")
endif()
endforeach()
if(FPHSA_FOUND_${_NAME})
set(${_NAME}_FOUND TRUE)
set(${_NAME_UPPER}_FOUND TRUE)
else()
set(${_NAME}_FOUND FALSE)
set(${_NAME_UPPER}_FOUND FALSE)
endif()
# component handling
unset(FOUND_COMPONENTS_MSG)
unset(MISSING_COMPONENTS_MSG)
if(FPHSA_HANDLE_COMPONENTS)
foreach(comp ${${_NAME}_FIND_COMPONENTS})
if(${_NAME}_${comp}_FOUND)
if(NOT DEFINED FOUND_COMPONENTS_MSG)
set(FOUND_COMPONENTS_MSG "found components:")
endif()
string(APPEND FOUND_COMPONENTS_MSG " ${comp}")
else()
if(NOT DEFINED MISSING_COMPONENTS_MSG)
set(MISSING_COMPONENTS_MSG "missing components:")
endif()
string(APPEND MISSING_COMPONENTS_MSG " ${comp}")
if(${_NAME}_FIND_REQUIRED_${comp})
set(${_NAME}_FOUND FALSE)
string(APPEND MISSING_VARS " ${comp}")
endif()
endif()
endforeach()
set(COMPONENT_MSG "${FOUND_COMPONENTS_MSG} ${MISSING_COMPONENTS_MSG}")
string(APPEND DETAILS "[c${COMPONENT_MSG}]")
endif()
# version handling:
set(VERSION_MSG "")
set(VERSION_OK TRUE)
# check that the version variable is not empty to avoid emitting a misleading
# message (i.e. `Found unsuitable version ""`)
if (DEFINED ${_NAME}_FIND_VERSION)
if(DEFINED ${FPHSA_VERSION_VAR})
if(NOT "${${FPHSA_VERSION_VAR}}" STREQUAL "")
set(_FOUND_VERSION ${${FPHSA_VERSION_VAR}})
if (FPHSA_HANDLE_VERSION_RANGE)
set (FPCV_HANDLE_VERSION_RANGE HANDLE_VERSION_RANGE)
else()
set(FPCV_HANDLE_VERSION_RANGE NO_AUTHOR_WARNING_VERSION_RANGE)
endif()
find_package_check_version ("${_FOUND_VERSION}" VERSION_OK RESULT_MESSAGE_VARIABLE VERSION_MSG
${FPCV_HANDLE_VERSION_RANGE})
else()
set(VERSION_OK FALSE)
endif()
endif()
if("${${FPHSA_VERSION_VAR}}" STREQUAL "")
# if the package was not found, but a version was given, add that to the output:
if(${_NAME}_FIND_VERSION_EXACT)
set(VERSION_MSG "(Required is exact version \"${${_NAME}_FIND_VERSION}\")")
elseif (FPHSA_HANDLE_VERSION_RANGE AND ${_NAME}_FIND_VERSION_RANGE)
set(VERSION_MSG "(Required is version range \"${${_NAME}_FIND_VERSION_RANGE}\")")
else()
set(VERSION_MSG "(Required is at least version \"${${_NAME}_FIND_VERSION}\")")
endif()
endif()
else ()
# Check with DEFINED as the found version may be 0.
if(DEFINED ${FPHSA_VERSION_VAR})
set(VERSION_MSG "(found version \"${${FPHSA_VERSION_VAR}}\")")
endif()
endif ()
if(VERSION_OK)
string(APPEND DETAILS "[v${${FPHSA_VERSION_VAR}}(${${_NAME}_FIND_VERSION})]")
else()
set(${_NAME}_FOUND FALSE)
endif()
# print the result:
if (${_NAME}_FOUND)
FIND_PACKAGE_MESSAGE(${_NAME} "Found ${_NAME}: ${${_FIRST_REQUIRED_VAR}} ${VERSION_MSG} ${COMPONENT_MSG}" "${DETAILS}")
else ()
if(FPHSA_CONFIG_MODE)
_FPHSA_HANDLE_FAILURE_CONFIG_MODE()
else()
if(NOT VERSION_OK)
set(RESULT_MSG)
if (_FIRST_REQUIRED_VAR)
string (APPEND RESULT_MSG "found ${${_FIRST_REQUIRED_VAR}}")
endif()
if (COMPONENT_MSG)
if (RESULT_MSG)
string (APPEND RESULT_MSG ", ")
endif()
string (APPEND RESULT_MSG "${FOUND_COMPONENTS_MSG}")
endif()
_FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE}: ${VERSION_MSG} (${RESULT_MSG})")
else()
_FPHSA_FAILURE_MESSAGE("${FPHSA_FAIL_MESSAGE} (missing:${MISSING_VARS}) ${VERSION_MSG}")
endif()
endif()
endif ()
set(${_NAME}_FOUND ${${_NAME}_FOUND} PARENT_SCOPE)
set(${_NAME_UPPER}_FOUND ${${_NAME}_FOUND} PARENT_SCOPE)
endfunction()
cmake_policy(POP)
# Distributed under the OSI-approved BSD 3-Clause License. See accompanying
# file Copyright.txt or https://cmake.org/licensing for details.
#[=======================================================================[.rst:
FindPackageMessage
------------------
.. code-block:: cmake
find_package_message(<name> "message for user" "find result details")
This function is intended to be used in FindXXX.cmake modules files.
It will print a message once for each unique find result. This is
useful for telling the user where a package was found. The first
argument specifies the name (XXX) of the package. The second argument
specifies the message to display. The third argument lists details
about the find result so that if they change the message will be
displayed again. The macro also obeys the QUIET argument to the
find_package command.
Example:
.. code-block:: cmake
if(X11_FOUND)
find_package_message(X11 "Found X11: ${X11_X11_LIB}"
"[${X11_X11_LIB}][${X11_INCLUDE_DIR}]")
else()
...
endif()
#]=======================================================================]
function(find_package_message pkg msg details)
# Avoid printing a message repeatedly for the same find result.
if(NOT ${pkg}_FIND_QUIETLY)
string(REPLACE "\n" "" details "${details}")
set(DETAILS_VAR FIND_PACKAGE_MESSAGE_DETAILS_${pkg})
if(NOT "${details}" STREQUAL "${${DETAILS_VAR}}")
# The message has not yet been printed.
message(STATUS "${msg}")
# Save the find details in the cache to avoid printing the same
# message again.
set("${DETAILS_VAR}" "${details}"
CACHE INTERNAL "Details about finding ${pkg}")
endif()
endif()
endfunction()
# MIT License
#
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
#
#
# --- CMake Toolchain File for using Clang from ROCm build environment ---
#
# To use this, invoke CMake like this:
#   export ROCM_INSTALL_PATH=/opt/rocm[-<rocm_version>]  # or reuse an existing ROCM_PATH
#   examples:
#     export ROCM_INSTALL_PATH=/opt/rocm-6.5.0
#     export ROCM_INSTALL_PATH=/opt/rocm
#     export ROCM_INSTALL_PATH=$ROCM_PATH
#
# cmake -DCMAKE_TOOLCHAIN_FILE=./src/cmake/rocm-clang-toolchain.cmake ...
#
# This toolchain file assumes you are building for the host system (e.g., Linux x86_64)
# but specifically using the Clang toolchain provided with ROCm.
#
#
cmake_minimum_required(VERSION 3.25)
#
# --- CMake OS version checkpoint ---
#
# On some distros/versions, the default compiler and standard library do not meet
# the minimum C++20 (or newer) requirement. As those cannot be updated, and as our
# compiler 'Lightning/Clang++' is *built without* its 'libc++' component, some
# much-needed features used by the source code are unavailable.
# Here we check for those distros/versions so we can skip the build gracefully,
# with no build failures.
#
# For now, we are checking for:
# NAME="Red Hat Enterprise Linux"
# VERSION_ID="8.8"
# ||
# NAME="Debian GNU/Linux"
# VERSION_ID="10"
#
# Note: CMake regex does not support multiline mode by default, so '^' and '$' only
#       match the beginning and end of the entire string, not the start and end of
#       individual lines.
# string(REGEX MATCH "NAME=\"?([^\n\"]+)\"?" _ "${OS_RELEASE_FILE_INFO}") would match
#       the wrong line when the file contains:
#       PRETTY_NAME="Debian GNU/Linux 10 (buster)"
#       NAME="Debian GNU/Linux"
# We work around this by prepending a newline manually, simulating line-by-line matching:
# string(REGEX MATCH "\nNAME=\"([^\"]+)\"" _name_match "\n${OS_RELEASE_FILE_INFO}")
#
# --- ROCm default compiler/toolchain ---
# If already set, skip further processing.
if(IS_LIGHTNING_CLANG_DEFAULT_COMPILER AND ROCM_CLANG_TOOLCHAIN_USED)
message(STATUS ">> ROCm 'Lightning Clang++' toolchain is already set as default compiler.")
return()
endif()
set(SKIP_BUILD_PROCESS OFF)
set(OS_RELEASE_FILE "/etc/os-release")
if(EXISTS ${OS_RELEASE_FILE})
file(READ "${OS_RELEASE_FILE}" OS_RELEASE_FILE_INFO)
string(REGEX MATCH "\nNAME=\"([^\"]+)\"" _name_match "\n${OS_RELEASE_FILE_INFO}")
set(DISTRO_NAME "${CMAKE_MATCH_1}")
string(REGEX MATCH "\nVERSION_ID=\"([^\"]+)\"" _version_match "\n${OS_RELEASE_FILE_INFO}")
set(DISTRO_VERSION_ID "${CMAKE_MATCH_1}")
message(STATUS ">> ROCm Clang Toolchain Environment Detected: '${DISTRO_NAME}', v'${DISTRO_VERSION_ID}'")
## Check for unsupported distros/versions
## That is, distros/versions with compilers and std libraries not supporting C++20 fully.
if((DISTRO_NAME STREQUAL "Red Hat Enterprise Linux" AND DISTRO_VERSION_ID VERSION_EQUAL "8.8")
OR (DISTRO_NAME STREQUAL "Debian GNU/Linux" AND DISTRO_VERSION_ID VERSION_LESS_EQUAL "10"))
# CACHE INTERNAL makes sure the SKIP_BUILD_PROCESS variable survives into the main CMake context
set(SKIP_BUILD_PROCESS ON CACHE INTERNAL "Skip build process for this OS version")
file(WRITE "${CMAKE_BINARY_DIR}/rbt_skip_build_process.flag" "1")
message(WARNING ">> Build not supported: '${DISTRO_NAME}', v'${DISTRO_VERSION_ID}'")
endif()
else()
set(SKIP_BUILD_PROCESS ON)
message(WARNING ">> Unable to read OS release file: '${OS_RELEASE_FILE}'")
endif()
#
# --- ROCm Build Path Setup ---
if(DEFINED ENV{ROCM_INSTALL_PATH})
set(ROCM_BASE_PATH "$ENV{ROCM_INSTALL_PATH}")
elseif(DEFINED ENV{ROCM_PATH})
set(ROCM_BASE_PATH "$ENV{ROCM_PATH}")
else()
message(FATAL_ERROR ">> No ROCM_INSTALL_PATH or ROCM_PATH environment variable is set. "
" One of them is required to locate 'Lightning Clang++'")
endif()
#
# --- Path to Clang/LLVM root directory, (ie: /opt/rocm/lib/llvm/) ---
if(DEFINED ENV{ROCM_LLVM_PATH})
set(ROCM_LLVM_BIN_DIR "$ENV{ROCM_LLVM_PATH}/bin")
else()
set(ROCM_LLVM_BIN_DIR "${ROCM_BASE_PATH}/lib/llvm/bin")
endif()
set(ROCM_BIN_DIR "${ROCM_BASE_PATH}/bin")
message(STATUS ">> ROCM_INSTALL_PATH detected: '${ROCM_BASE_PATH}'")
message(STATUS ">> Expecting Clang/LLVM tools in: '${ROCM_LLVM_BIN_DIR}'")
if(NOT IS_DIRECTORY "${ROCM_LLVM_BIN_DIR}")
message(FATAL_ERROR ">> ROCM_LLVM_BIN_DIR is not a valid directory: '${ROCM_LLVM_BIN_DIR}'\n"
" Check ROCM_INSTALL_PATH and the LLVM binary path structure.")
endif()
#
# --- Compilers and Tools ---
# Find Clang C and C++ compilers within the ROCm LLVM binary directory
# NO_DEFAULT_PATH ensures CMake only looks in the HINTS path first for these specific finds.
# REQUIRED will cause CMake to stop with an error if the compiler is not found there.
find_program(CMAKE_C_COMPILER
NAMES clang
HINTS "${ROCM_LLVM_BIN_DIR}"
NO_DEFAULT_PATH
REQUIRED
)
find_program(CMAKE_CXX_COMPILER
NAMES clang++
HINTS "${ROCM_LLVM_BIN_DIR}"
NO_DEFAULT_PATH
REQUIRED
)
find_program(AMD_CLANG_CXX_COMPILER
NAMES amdclang++
HINTS "${ROCM_LLVM_BIN_DIR}"
NO_DEFAULT_PATH
REQUIRED
)
find_program(AMD_HIP_CXX_COMPILER
NAMES hipcc
HINTS "${ROCM_BIN_DIR}"
NO_DEFAULT_PATH
REQUIRED
)
# --- Verify hipcc/Clang compiler version ---
set(CMAKE_C_COMPILER ${AMD_HIP_CXX_COMPILER})
set(CMAKE_CXX_COMPILER ${AMD_HIP_CXX_COMPILER})
# Minimum required version of Clang
if(CMAKE_CXX_COMPILER)
set(CLANG_COMPILER_MAJOR_VERSION_REQUIRED "19")
set(CLANG_COMPILER_MINOR_VERSION_REQUIRED "0")
set(CLANG_COMPILER_REVISION_VERSION_REQUIRED "0")
set(CLANG_COMPILER_MINIMUM_VERSION_REQUIRED "${CLANG_COMPILER_MAJOR_VERSION_REQUIRED}.${CLANG_COMPILER_MINOR_VERSION_REQUIRED}.${CLANG_COMPILER_REVISION_VERSION_REQUIRED}")
execute_process(
COMMAND ${CMAKE_CXX_COMPILER} -dumpversion
OUTPUT_VARIABLE CLANG_COMPILER_VERSION
OUTPUT_STRIP_TRAILING_WHITESPACE
)
# Check if the version is valid
string(REGEX MATCHALL "[0-9]+" CLANG_COMPILER_VERSION_COMPONENTS "${CLANG_COMPILER_VERSION}")
if(CLANG_COMPILER_VERSION_COMPONENTS)
list(GET CLANG_COMPILER_VERSION_COMPONENTS 0 CLANG_COMPILER_VERSION_MAJOR)
list(GET CLANG_COMPILER_VERSION_COMPONENTS 1 CLANG_COMPILER_VERSION_MINOR)
list(GET CLANG_COMPILER_VERSION_COMPONENTS 2 CLANG_COMPILER_VERSION_REVISION)
set(CLANG_COMPILER_FULL_VERSION "${CLANG_COMPILER_VERSION_MAJOR}.${CLANG_COMPILER_VERSION_MINOR}.${CLANG_COMPILER_VERSION_REVISION}")
##
if(CLANG_COMPILER_FULL_VERSION VERSION_GREATER_EQUAL CLANG_COMPILER_MINIMUM_VERSION_REQUIRED)
set(CLANG_COMPILER_VERSION_RESULT TRUE)
else()
set(CLANG_COMPILER_VERSION_RESULT FALSE)
endif()
if(NOT CLANG_COMPILER_VERSION_RESULT)
message(FATAL_ERROR ">> 'Clang++' compiler v'${CLANG_COMPILER_VERSION}' does not meet the minimum required version 'v${CLANG_COMPILER_MINIMUM_VERSION_REQUIRED}'")
endif()
endif()
else()
message(FATAL_ERROR ">> 'Clang++' compiler not found in ROCM_INSTALL_PATH: '${ROCM_BASE_PATH}'")
endif()
#
# --- Search Behavior ---
# For ROCm, the ROCM_PATH itself is a root for its specific components (headers, libs).
# We add it to CMAKE_FIND_ROOT_PATH so find_package, find_library etc., look there.
# We use list(PREPEND ...) to ensure ROCM_PATH is searched before system paths for relevant items.
list(PREPEND CMAKE_FIND_ROOT_PATH "${ROCM_BASE_PATH}")
list(REMOVE_DUPLICATES CMAKE_FIND_ROOT_PATH)
# Adjust find behavior.
# 'BOTH' allows searching in CMAKE_FIND_ROOT_PATH (ROCm paths) and then system paths.
# This is often suitable for ROCm which overlays on a standard system.
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) # Don't look for host programs in ROCM_PATH
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY BOTH)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE BOTH)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE BOTH)
#
# --- Confirmation Message ---
# Note: CMAKE_C_COMPILER_VERSION and CMAKE_CXX_COMPILER_VERSION are populated, and
# the needed compiler flags are defined by 'build_utils.cmake', only *after* the
# 'project()' command and language enablement, so they won't be available here.
#
# Set a cached variable to indicate this toolchain is used
set(ROCM_CLANG_TOOLCHAIN_USED TRUE CACHE BOOL "Indicates that the ROCm 'Lightning Clang++' toolchain is in use")
set(IS_LIGHTNING_CLANG_DEFAULT_COMPILER TRUE CACHE BOOL "build_utils.cmake: Indicates that 'Lightning Clang++' is the default compiler")
set(CMAKE_C_COMPILER "${CMAKE_C_COMPILER}" CACHE PATH "C compiler")
set(CMAKE_CXX_COMPILER "${CMAKE_CXX_COMPILER}" CACHE PATH "C++ compiler")
message(STATUS ">> Using ROCm 'Lightning Clang++' Toolchain: ${CMAKE_CURRENT_LIST_FILE}")
message(STATUS " >> C Compiler: ${CMAKE_C_COMPILER}")
message(STATUS " >> C++ Compiler: ${CMAKE_CXX_COMPILER}")
/*
MIT License
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
/*
CMake auto-generated file: Do not edit it.
*/
#define TRANSFERBENCH_CLIENT_VERSION "@TRANSFERBENCH_CLIENT_TARGET_VERSION@"
/*
MIT License
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
/*
CMake auto-generated file: Do not edit it.
*/
#define TRANSFERBENCH_GIT_BRANCH "@GIT_BRANCH@"
#define TRANSFERBENCH_GIT_COMMIT "@GIT_COMMIT_HASH_LONG@"
#define TRANSFERBENCH_HEADER_VERSION "@TRANSFERBENCH_HEADER_VERSION@"
if (DEFINED ENV{ROCM_PATH})
set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE PATH "Path to the ROCm installation.")
set(rocm_bin "$ENV{ROCM_PATH}/bin")
else()
set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to the ROCm installation.")
set(rocm_bin "/opt/rocm/bin")
endif()
if (NOT DEFINED ENV{CXX})
if(EXISTS "${rocm_bin}/amdclang++")
set(CMAKE_CXX_COMPILER "${rocm_bin}/amdclang++" CACHE PATH "Path to the C++ compiler")
else()
if(EXISTS "${ROCM_PATH}/llvm/bin/amdclang++")
set(rocm_bin "${ROCM_PATH}/llvm/bin")
set(CMAKE_CXX_COMPILER "${rocm_bin}/amdclang++" CACHE PATH "Path to the C++ compiler")
elseif(EXISTS "${ROCM_PATH}/llvm/bin/clang++")
set(rocm_bin "${ROCM_PATH}/llvm/bin")
set(CMAKE_CXX_COMPILER "${rocm_bin}/clang++" CACHE PATH "Path to the C++ compiler")
endif()
endif()
else()
set(CMAKE_CXX_COMPILER "$ENV{CXX}" CACHE PATH "Path to the C++ compiler")
endif()
if (NOT DEFINED ENV{CXXFLAGS})
set(CMAKE_CXX_FLAGS_DEBUG "-g -O1")
set(CMAKE_CXX_FLAGS_RELEASE "-O3")
endif()
if(NOT CMAKE_BUILD_TYPE)
message(STATUS "Setting build type to 'Release' as none was specified.")
set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE)
endif()
# MIT License
#
# Copyright (c) 2023-25 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
#
#
# --- Header-Only vs Regular Compiled Library ---
#
## Header-Only Library:
# A header-only library is a library that consists entirely of header files (.h or .hpp)
# - It does not require separate compilation into binary files (.lib, .a, .so, etc)
#
# - Simplicity and Ease of Distribution:
# - No need for separate compilation or linking steps
# - Just include the headers
# - Great for header-only utilities or template-heavy libraries
#
# - Heavy Use of Templates or Inline Functions:
#    - Templates must be defined in headers, so template libraries are often header-only (e.g., Eigen, Catch2)
# - Inline functions benefit from this as well for potential compiler optimizations
#
# - Small to Medium Size Libraries:
# - Ideal when the codebase is not too large, avoiding long compile times
#
# - Performance-Critical Components:
# - Enables the compiler to inline aggressively across translation units
#
# - Cross-Platform or Header-Only Dependencies:
# - Avoids needing to build for multiple platforms or compilers
#
## Regular Compiled Library:
# A regular compiled library is a library that is compiled into binary files (.lib, .a, .so, etc)
# - It requires separate compilation and linking steps
#
# - Large Codebase / Long Compile Times
# - Avoid recompiling all code that includes the library headers
#
# - Improved Encapsulation
# - Hides implementation details, reduces header bloat, and maintains a clean API
# - Binary distribution keeps proprietary code hidden
#
# - ABI Stability & Compatibility
# - Enables decoupling user code from library internals
# - Users don’t need to recompile their code when internals of the library change (if ABI remains stable)
#
# - Reduced Binary Size:
# - Prevents code bloat due to duplication in each translation unit
#
# - Dynamic Loading / Plugin Systems
#    - Necessary if you want runtime dynamic linking (e.g., via dlopen)
#
# - Separate Build and Test Pipelines
# - Easier to build and test the library independently from the application
#
## Hybrid Approach:
# - Public API in headers, and compiled internals:
# - Templates or inline functions stay in headers
#   - Logic-heavy or stable parts go into .so or .a files (see the commented sketch below)
#
#
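# A minimal sketch of the hybrid approach (illustrative only; the target and
# file names below are hypothetical, not the ones used by this project):
#
#   add_library(mylib_impl STATIC src/mylib_impl.cpp)   # compiled internals
#   target_include_directories(mylib_impl PUBLIC include)
#
#   add_library(mylib INTERFACE)                        # header-only public API
#   target_include_directories(mylib INTERFACE include)
#   target_link_libraries(mylib INTERFACE mylib_impl)   # consumers get both
#
#   # A consumer then links only the umbrella target:
#   # target_link_libraries(app PRIVATE mylib)
#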
cmake_minimum_required(VERSION 3.25 FATAL_ERROR)
project(${AMD_PROJECT_LIBRARY_NAME}
VERSION ${PROJECT_TARGET_VERSION_TEXT}
DESCRIPTION "TransferBench Engine Library"
LANGUAGES CXX HIP
)
# Load CMake modules
#==================================================================================================
set(AMD_PROJECT_CMAKE_DIRECTORY ${AMD_PROJECT_BASE_DIRECTORY}/cmake)
set(AMD_PROJECT_CMAKE_MODULES_DIRECTORY ${AMD_PROJECT_BASE_DIRECTORY}/cmake/modules)
list(APPEND CMAKE_MODULE_PATH "${AMD_PROJECT_CMAKE_MODULES_DIRECTORY}")
# CMake Toolchain file to define compilers and path to ROCm
#==================================================================================================
if (NOT CMAKE_TOOLCHAIN_FILE)
set(CMAKE_TOOLCHAIN_FILE "${AMD_PROJECT_CMAKE_DIRECTORY}/rocm_clang_toolchain.cmake")
message(STATUS ">> CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}")
endif()
#
include(CheckIncludeFiles)
include(CheckSymbolExists)
include(${AMD_PROJECT_CMAKE_DIRECTORY}/build_utils.cmake) # setup_default_compiler_flags
include(${AMD_PROJECT_CMAKE_MODULES_DIRECTORY}/Dependencies.cmake) # rocm-cmake, rocm_local_targets
#
set (TRANSFERBENCH_CLIENT_DIRECTORY ${AMD_PROJECT_BASE_DIRECTORY}/client)
set (TRANSFERBENCH_TBENGINE_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
set (TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY ${TRANSFERBENCH_TBENGINE_DIRECTORY}/include)
set (TRANSFERBENCH_TBENGINE_SRC_DIRECTORY ${TRANSFERBENCH_TBENGINE_DIRECTORY}/src)
#
# Default GPU architectures to build
set(BUILD_TRANSFERBENCH_DEFAULT_GPUS_LIST
gfx906
gfx908
gfx90a
gfx942
gfx950
gfx1030
gfx1100
gfx1101
gfx1102
gfx1150
gfx1151
gfx1200
gfx1201
)
#
# Build only for local GPU architecture
set(ROCMCHECKS_WARN_TOOLCHAIN_VAR OFF)
if (TRANSFERBENCH_LOCAL_GPU_TARGET_ONLY)
message(STATUS " >> Building only for local GPU target")
if (COMMAND rocm_local_targets)
rocm_local_targets(BUILD_TRANSFERBENCH_DEFAULT_GPUS_LIST)
else()
message(WARNING " >> Unable to determine local GPU targets. Falling back to default GPUs.")
endif()
endif()
#
# Determine which GPU architectures to build for
set(TRANSFERBENCH_GPU_TARGETS "${BUILD_TRANSFERBENCH_DEFAULT_GPUS_LIST}" CACHE STRING "GPU targets to build for (defaults to the built-in GPU list when not specified).")
#
# Check if clang compiler can offload to GPU_TARGETS
if (COMMAND rocm_check_target_ids)
message(STATUS ">> Checking for ROCm support for GPU targets: " "${TRANSFERBENCH_GPU_TARGETS}")
rocm_check_target_ids(TRANSFERBENCH_SUPPORTED_GPUS TARGETS ${TRANSFERBENCH_GPU_TARGETS})
else()
message(WARNING ">> Unable to check for supported GPU targets. Falling back to default GPUs.")
set(TRANSFERBENCH_SUPPORTED_GPUS ${BUILD_TRANSFERBENCH_DEFAULT_GPUS_LIST})
endif()
set(TRANSFERBENCH_COMPILING_TARGETS "${TRANSFERBENCH_SUPPORTED_GPUS}" CACHE STRING "GPU targets to compile for.")
message(STATUS ">> Building for: ${TRANSFERBENCH_COMPILING_TARGETS}")
foreach(target ${TRANSFERBENCH_COMPILING_TARGETS})
list(APPEND STATIC_LINK_FLAGS --offload-arch=${target})
endforeach()
list(JOIN STATIC_LINK_FLAGS " " FLAGS_STR)
#
# NOTE: Reload rocm-cmake so it picks up the desired GPU_TARGETS instead of the defaults
include(${AMD_PROJECT_CMAKE_MODULES_DIRECTORY}/Dependencies.cmake)
#
get_rocm_install_path(ROCM_PATH)
#
# Set CMAKE flags
if (NOT DEFINED CMAKE_CXX_STANDARD)
set(CMAKE_CXX_STANDARD 20)
endif()
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
# --- HIP Package ---
# Check for HIP
#
# Add ROCM_BASE_PATH to CMake search paths for finding HIP / HSA
list(APPEND CMAKE_PREFIX_PATH
${ROCM_PATH}
${ROCM_PATH}/llvm
${ROCM_PATH}/hip
${ROCM_PATH}/hsa
/opt/rocm
/opt/rocm/llvm
/opt/rocm/hip
/opt/rocm/hsa
)
#
# Check for HIP
find_package(hip REQUIRED CONFIG PATHS ${CMAKE_PREFIX_PATH})
developer_status_message("DEVEL" " >> HIP Include Dirs: ${hip_INCLUDE_DIRS} ...")
developer_status_message("DEVEL" " >> HIP Libraries: ${hip_LIBRARIES} ...")
#
# Ensuring that CXX compiler meets expectations
if(NOT (("${CMAKE_CXX_COMPILER}" MATCHES ".*hipcc") OR ("${CMAKE_CXX_COMPILER}" MATCHES ".*clang\\+\\+")))
message(FATAL_ERROR ">> On ROCm platform CMAKE_CXX_COMPILER must be 'hipcc' or 'HIP-aware Clang'.")
endif()
#
# Check for Threads
find_package(Threads REQUIRED)
set(THREADS_PREFER_PTHREAD_FLAG ON)
#
# Check for numa support
set(WAS_NUMA_FOUND OFF)
set(NUMA_LIBRARY_NAME "numa")
find_library(NUMA_LIBRARY ${NUMA_LIBRARY_NAME})
find_path(NUMA_INCLUDE_DIR numa.h)
if(NUMA_LIBRARY AND NUMA_INCLUDE_DIR)
set(WAS_NUMA_FOUND ON)
add_library(${NUMA_LIBRARY_NAME} SHARED IMPORTED)
set_target_properties(${NUMA_LIBRARY_NAME}
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${NUMA_INCLUDE_DIR}"
IMPORTED_LOCATION "${NUMA_LIBRARY}"
INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${NUMA_INCLUDE_DIR}"
)
endif()
developer_status_message("DEVEL" "NUMA_INCLUDE_DIR: ${NUMA_INCLUDE_DIR} ...")
developer_status_message("DEVEL" "NUMA_LIBRARY_NAME: ${NUMA_LIBRARY_NAME} ...")
developer_status_message("DEVEL" "NUMA_LIBRARY: ${NUMA_LIBRARY} ...")
#
# Check for hsa support: 'libhsa-runtime64.so' (libhsa-runtime-dev package)
# ${ROCM_PATH}/include/hsa/hsa.h
find_path(HIP_ROOT_DIR
NAMES
"include/hip/hip_runtime.h"
HINTS
${ROCM_PATH}
/opt/rocm/
)
if(NOT HIP_ROOT_DIR)
message(FATAL_ERROR ">> HIP_ROOT_DIR 'hip_runtime.h' not found. Ensure ROCm is properly set up ...")
endif()
set(HIP_INCLUDE_ROOT_DIR "${HIP_ROOT_DIR}/include")
set(HIP_LIBRARY_ROOT_DIR "${HIP_ROOT_DIR}/lib")
developer_status_message("DEVEL" "HIP_ROOT_DIR: ${HIP_ROOT_DIR} ...")
developer_status_message("DEVEL" "HIP_INCLUDE_ROOT_DIR: ${HIP_INCLUDE_ROOT_DIR} ...")
developer_status_message("DEVEL" "HIP_LIBRARY_ROOT_DIR: ${HIP_LIBRARY_ROOT_DIR} ...")
set(WAS_HSA_FOUND OFF)
set(HSA_LIBRARY_NAME "hsa-runtime64")
find_library(HSA_LIBRARY ${HSA_LIBRARY_NAME} PATHS ${HIP_LIBRARY_ROOT_DIR} ${ROCM_PATH})
find_path(HSA_INCLUDE_DIR "hsa/hsa.h" PATHS ${HIP_INCLUDE_ROOT_DIR} NO_DEFAULT_PATH)
if(HSA_LIBRARY AND HSA_INCLUDE_DIR)
set(WAS_HSA_FOUND ON)
add_library(${HSA_LIBRARY_NAME} SHARED IMPORTED)
set_target_properties(${HSA_LIBRARY_NAME}
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${HSA_INCLUDE_DIR}"
IMPORTED_LOCATION "${HSA_LIBRARY}"
INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HSA_INCLUDE_DIR}"
)
endif()
developer_status_message("DEVEL" "HSA_INCLUDE_DIR: ${HSA_INCLUDE_DIR} ...")
developer_status_message("DEVEL" "HSA_LIBRARY_NAME: ${HSA_LIBRARY_NAME} ...")
developer_status_message("DEVEL" "HSA_LIBRARY: ${HSA_LIBRARY} ...")
#
# Check for hip support: 'libamdhip64.so' (libamdhip64-dev package)
# HIP_LIBRARY will be set by find_library(); the "hip::host;hip::device" targets come from find_package(hip)
# ${ROCM_PATH}/include/hip/hip_ext.h
set(WAS_HIP_FOUND OFF)
set(HIP_LIBRARY_NAME "amdhip64")
find_library(HIP_LIBRARY ${HIP_LIBRARY_NAME} PATHS ${HIP_LIBRARY_ROOT_DIR} ${ROCM_PATH})
find_path(HIP_INCLUDE_DIR "hip/hip_ext.h" PATHS ${HIP_INCLUDE_ROOT_DIR} NO_DEFAULT_PATH)
if(NOT HIP_INCLUDE_DIR)
message(FATAL_ERROR ">> HIP_INCLUDE_DIR 'hip_ext.h' not found. Ensure ROCm is properly set up ...")
endif()
if(HIP_LIBRARY AND HIP_INCLUDE_DIR)
set(WAS_HIP_FOUND ON)
add_library(${HIP_LIBRARY_NAME} SHARED IMPORTED)
set_target_properties(${HIP_LIBRARY_NAME}
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${HIP_INCLUDE_DIR}"
IMPORTED_LOCATION "${HIP_LIBRARY}"
INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_INCLUDE_DIR}"
)
endif()
developer_status_message("DEVEL" "HIP_INCLUDE_DIR: ${HIP_INCLUDE_DIR} ...")
developer_status_message("DEVEL" "HIP_LIBRARY_NAME: ${HIP_LIBRARY_NAME} ...")
developer_status_message("DEVEL" "HIP_LIBRARY: ${HIP_LIBRARY} ...")
#
# Library/interface names
set(AMD_PROJECT_HEADER_ONLY_LIBRARY_NAME "${AMD_PROJECT_LIBRARY_NAME}_header")
set(TRANSFERBENCH_INTERFACE_TARGET_NAME "${AMD_PROJECT_PACKAGE_NAME}_engine")
set(TRANSFERBENCH_INTERFACE_TARGET_NAME_ALIAS "${AMD_PROJECT_PACKAGE_NAME}::engine")
set(AMD_PROJECT_STATIC_LIBRARY_NAME "${AMD_PROJECT_LIBRARY_NAME}_static")
set(AMD_PROJECT_SHARED_LIBRARY_NAME "${AMD_PROJECT_LIBRARY_NAME}_shared")
set(AMD_PROJECT_OBJECT_LIBRARY_NAME "${AMD_PROJECT_LIBRARY_NAME}_object_library")
set(AMD_PROJECT_HIP_OBJECT_LIBRARY_NAME "${AMD_PROJECT_LIBRARY_NAME}_hip_object_library")
set(AMD_PROJECT_CLIENT_NAME "${AMD_PROJECT_NAME}")
#
# Check for infiniband verbs support
set(WAS_IBVERBS_FOUND OFF)
if(DEFINED ENV{DISABLE_NIC_EXEC} AND "$ENV{DISABLE_NIC_EXEC}" STREQUAL "1")
message(STATUS ">> Disabling 'NIC Executor' support. 'DISABLE_NIC_EXEC' was enabled ...")
elseif(NOT TRANSFERBENCH_ENABLE_NIC_EXEC)
message(STATUS ">> For CMake builds, the NIC executor requires explicit opt-in via the CMake flag '-DTRANSFERBENCH_ENABLE_NIC_EXEC=1|ON' ...")
message(STATUS ">> Disabling 'NIC Executor' support ...")
else()
set(IBVERBS_LIBRARY_NAME "ibverbs")
find_library(IBVERBS_LIBRARY ${IBVERBS_LIBRARY_NAME})
find_path(IBVERBS_INCLUDE_DIR "infiniband/verbs.h")
if(IBVERBS_LIBRARY AND IBVERBS_INCLUDE_DIR)
set(WAS_IBVERBS_FOUND ON)
add_library(${IBVERBS_LIBRARY_NAME} SHARED IMPORTED)
set_target_properties(${IBVERBS_LIBRARY_NAME}
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${IBVERBS_INCLUDE_DIR}"
IMPORTED_LOCATION "${IBVERBS_LIBRARY}"
INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${IBVERBS_INCLUDE_DIR}"
)
message(STATUS ">> Building with 'NIC executor' support. Set 'DISABLE_NIC_EXEC=1' to disable")
else()
if (NOT IBVERBS_LIBRARY)
message(WARNING ">> 'IBVerbs' library not found ...")
endif()
if (NOT IBVERBS_INCLUDE_DIR)
message(WARNING ">> 'infiniband/verbs.h' not found ...")
endif()
message(WARNING "Building without 'NIC executor' support. To use the TransferBench RDMA executor, \n"
" verify that your system has NICs, that the NIC drivers are installed, and that 'libibverbs-dev' is installed")
endif()
endif()
# --- Get TB commit and branch ---
# That's useful for tracking which version of the code was used to build the library
if(DEFINED TRANSFERBENCH_COMMIT_HASH_LONG AND DEFINED TRANSFERBENCH_COMMIT_BRANCH)
set(GIT_COMMIT_HASH_LONG "${TRANSFERBENCH_COMMIT_HASH_LONG}")
set(GIT_BRANCH "${TRANSFERBENCH_COMMIT_BRANCH}")
else()
# Get info about the current branch
execute_process(
COMMAND git rev-parse --abbrev-ref HEAD
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE GIT_BRANCH
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE RESULT_BRANCH
ERROR_QUIET
)
# Get hash log info for the current branch
execute_process(
COMMAND git log -1 --format=%H
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE GIT_COMMIT_HASH_LONG
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE RESULT_HASH_LONG
ERROR_QUIET
)
endif()
if(GIT_COMMIT_HASH_LONG STREQUAL "" OR GIT_BRANCH STREQUAL "")
message(WARNING "[[ No commit hash/branch were found. ]]")
else()
set(TRANSFERBENCH_HEADER_VERSION ${PROJECT_TARGET_VERSION_TEXT})
developer_status_message("DEVEL" ">> Setting TransferBench commit/branch info in '${TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY}/TransferBench.hpp' ...")
developer_status_message("DEVEL" " >> GIT_BRANCH=\"${GIT_BRANCH}\"")
developer_status_message("DEVEL" " >> GIT_COMMIT_HASH_LONG=\"${GIT_COMMIT_HASH_LONG}\"")
developer_status_message("DEVEL" " >> TRANSFERBENCH_HEADER_VERSION=\"${TRANSFERBENCH_HEADER_VERSION}\"")
file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/include/)
configure_file(
${AMD_PROJECT_CMAKE_DIRECTORY}/tbengine_version.hpp.in
${CMAKE_BINARY_DIR}/include/tbengine_version.hpp
@ONLY
)
endif()
# --- End of Get TB commit and branch ---
#
# Header/Source files
set(TRANSFERBENCH_ENGINE_HEADER_SOURCES
${TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY}/TransferBench.hpp
)
set(TRANSFERBENCH_ENGINE_HEADER_IMPL_SOURCES
${TRANSFERBENCH_TBENGINE_SRC_DIRECTORY}/TransferBench.cpp
)
set(TRANSFERBENCH_ENGINE_ALL_SOURCES
${TRANSFERBENCH_ENGINE_HEADER_SOURCES}
${TRANSFERBENCH_ENGINE_HEADER_IMPL_SOURCES}
)
#
# --- Object libraries (shared across targets to avoid recompilation) ---
# Common public interface target
add_library(${TRANSFERBENCH_INTERFACE_TARGET_NAME} INTERFACE)
add_library(${TRANSFERBENCH_INTERFACE_TARGET_NAME_ALIAS} ALIAS ${TRANSFERBENCH_INTERFACE_TARGET_NAME})
target_include_directories(${TRANSFERBENCH_INTERFACE_TARGET_NAME}
INTERFACE
$<BUILD_INTERFACE:${TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY}>
$<INSTALL_INTERFACE:include>
)
#
# For dynamic linking: HIP object library (for use in other targets)
add_library(${AMD_PROJECT_HIP_OBJECT_LIBRARY_NAME}
OBJECT
${TRANSFERBENCH_ENGINE_HEADER_IMPL_SOURCES}
)
set_target_properties(${AMD_PROJECT_HIP_OBJECT_LIBRARY_NAME}
PROPERTIES
POSITION_INDEPENDENT_CODE ON
)
target_include_directories(${AMD_PROJECT_HIP_OBJECT_LIBRARY_NAME}
PUBLIC
$<BUILD_INTERFACE:${CMAKE_BINARY_DIR}/include/>
$<BUILD_INTERFACE:${TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY}>
$<INSTALL_INTERFACE:include>
)
setup_default_compiler_flags(${AMD_PROJECT_HIP_OBJECT_LIBRARY_NAME})
#
# For static linking: Standard object library (for use in other targets)
add_library(${AMD_PROJECT_OBJECT_LIBRARY_NAME}
OBJECT
${TRANSFERBENCH_ENGINE_HEADER_IMPL_SOURCES}
)
set_target_properties(${AMD_PROJECT_OBJECT_LIBRARY_NAME}
PROPERTIES
POSITION_INDEPENDENT_CODE ON
)
target_include_directories(${AMD_PROJECT_OBJECT_LIBRARY_NAME}
PUBLIC
$<BUILD_INTERFACE:${NUMA_INCLUDE_DIR}>
$<BUILD_INTERFACE:${HIP_INCLUDE_ROOT_DIR}>
$<BUILD_INTERFACE:${HIP_INCLUDE_DIR}>
$<BUILD_INTERFACE:${HSA_INCLUDE_DIR}>
$<BUILD_INTERFACE:${CMAKE_BINARY_DIR}/include/>
$<BUILD_INTERFACE:${TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY}>
$<INSTALL_INTERFACE:include>
)
setup_default_compiler_flags(${AMD_PROJECT_OBJECT_LIBRARY_NAME})
target_link_libraries(${AMD_PROJECT_OBJECT_LIBRARY_NAME}
PUBLIC
${NUMA_LIBRARY}
${HIP_LIBRARY}
${HSA_LIBRARY}
hip::host
Threads::Threads
dl
)
if(WAS_IBVERBS_FOUND)
target_include_directories(${AMD_PROJECT_OBJECT_LIBRARY_NAME}
PRIVATE
$<BUILD_INTERFACE:${IBVERBS_INCLUDE_DIR}>
)
target_link_libraries(${AMD_PROJECT_OBJECT_LIBRARY_NAME}
PRIVATE
${IBVERBS_LIBRARY}
)
target_compile_definitions(${AMD_PROJECT_OBJECT_LIBRARY_NAME}
PRIVATE
NIC_EXEC_ENABLED
)
endif()
set_target_properties(${AMD_PROJECT_OBJECT_LIBRARY_NAME}
PROPERTIES
POSITION_INDEPENDENT_CODE ON
EXCLUDE_FROM_ALL ON
EXCLUDE_FROM_DEFAULT_BUILD ON
)
# ---
#
# --- Different build types ---
if(TRANSFERBENCH_ENGINE_SHARED)
message(STATUS ">> Building TransferBench 'shared' library ...")
add_library(${AMD_PROJECT_SHARED_LIBRARY_NAME} SHARED
$<TARGET_OBJECTS:${AMD_PROJECT_HIP_OBJECT_LIBRARY_NAME}>
)
developer_status_message("DEVEL" " >> PROJECT_TARGET_BINARY_VERSION: '${PROJECT_TARGET_BINARY_VERSION}' ")
developer_status_message("DEVEL" " >> PROJECT_TARGET_VERSION_TEXT: '${PROJECT_TARGET_VERSION_TEXT}' ")
developer_status_message("DEVEL" " >> PROJECT_TARGET_VERSION: '${PROJECT_TARGET_VERSION}' ")
developer_status_message("DEVEL" " >> PROJECT_MAJOR.MINOR_VERSION: '${AMD_PROJECT_VERSION_MAJOR}'.'${AMD_PROJECT_VERSION_MINOR}' ")
set_target_properties(${AMD_PROJECT_SHARED_LIBRARY_NAME}
PROPERTIES
OUTPUT_NAME ${AMD_PROJECT_LIBRARY_NAME}
VERSION ${PROJECT_TARGET_VERSION}
SOVERSION ${AMD_PROJECT_VERSION_MAJOR}
LINKER_LANGUAGE CXX
CUDA_RESOLVE_DEVICE_SYMBOLS ON
POSITION_INDEPENDENT_CODE ON
)
target_include_directories(${AMD_PROJECT_SHARED_LIBRARY_NAME}
PUBLIC
$<BUILD_INTERFACE:${TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY}>
$<INSTALL_INTERFACE:include>
)
target_link_libraries(${AMD_PROJECT_SHARED_LIBRARY_NAME}
PUBLIC
${NUMA_LIBRARY}
${HSA_LIBRARY}
Threads::Threads
dl
PRIVATE
hip::device
)
target_compile_definitions(${AMD_PROJECT_SHARED_LIBRARY_NAME}
PUBLIC
TRANSFERBENCH_SHARED
)
# Shared library specific compile options
setup_default_compiler_flags(${AMD_PROJECT_SHARED_LIBRARY_NAME})
add_common_flag("-fgpu-rdc" ${AMD_PROJECT_SHARED_LIBRARY_NAME})
# Install shared library
install(TARGETS ${AMD_PROJECT_SHARED_LIBRARY_NAME}
EXPORT ${AMD_PROJECT_NAME}Targets
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
)
endif()
if(TRANSFERBENCH_ENGINE_STATIC)
message(STATUS ">> Building TransferBench 'static' library ...")
add_library(${AMD_PROJECT_STATIC_LIBRARY_NAME} STATIC
$<TARGET_OBJECTS:${AMD_PROJECT_OBJECT_LIBRARY_NAME}>
)
set_target_properties(${AMD_PROJECT_STATIC_LIBRARY_NAME}
PROPERTIES
OUTPUT_NAME ${AMD_PROJECT_LIBRARY_NAME}
POSITION_INDEPENDENT_CODE ON
)
target_include_directories(${AMD_PROJECT_STATIC_LIBRARY_NAME}
PUBLIC
$<BUILD_INTERFACE:${TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY}>
$<INSTALL_INTERFACE:include>
)
target_link_libraries(${AMD_PROJECT_STATIC_LIBRARY_NAME}
PUBLIC
${NUMA_LIBRARY}
${HSA_LIBRARY}
Threads::Threads
hip::host
dl
)
target_compile_definitions(${AMD_PROJECT_STATIC_LIBRARY_NAME}
PUBLIC
TRANSFERBENCH_STATIC
)
# Static library specific compile options
setup_default_compiler_flags(${AMD_PROJECT_STATIC_LIBRARY_NAME})
add_common_flag("-fgpu-rdc" ${AMD_PROJECT_STATIC_LIBRARY_NAME})
# Install static library
install(TARGETS ${AMD_PROJECT_STATIC_LIBRARY_NAME}
EXPORT ${AMD_PROJECT_NAME}Targets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
)
endif()
if(TRANSFERBENCH_ENGINE_HEADER_ONLY)
message(STATUS ">> Building TransferBench 'header-only' library ...")
add_library(${AMD_PROJECT_HEADER_ONLY_LIBRARY_NAME} INTERFACE)
set_target_properties(${AMD_PROJECT_HEADER_ONLY_LIBRARY_NAME}
PROPERTIES
INTERFACE_CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
target_include_directories(${AMD_PROJECT_HEADER_ONLY_LIBRARY_NAME}
INTERFACE
$<BUILD_INTERFACE:${CMAKE_BINARY_DIR}/include>
$<BUILD_INTERFACE:${TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY}>
$<INSTALL_INTERFACE:include>
)
target_link_libraries(${AMD_PROJECT_HEADER_ONLY_LIBRARY_NAME}
INTERFACE
${AMD_PROJECT_HIP_OBJECT_LIBRARY_NAME}
${NUMA_LIBRARY}
${HIP_LIBRARY}
${HSA_LIBRARY}
hip::device
Threads::Threads
dl
)
# Install header-only library
install(
DIRECTORY ${TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY}
DESTINATION include
FILES_MATCHING
PATTERN "*.hpp"
)
endif()
#
# Common install
install(
DIRECTORY ${TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY}
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
)
if((NOT TRANSFERBENCH_ENGINE_HEADER_ONLY) AND (TRANSFERBENCH_ENGINE_STATIC OR TRANSFERBENCH_ENGINE_SHARED))
install(EXPORT ${AMD_PROJECT_NAME}Targets
FILE ${AMD_PROJECT_NAME}Targets.cmake
NAMESPACE "${AMD_PROJECT_PACKAGE_NAME}::"
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${AMD_PROJECT_PACKAGE_NAME}
)
endif()
## End of CMakeLists.txt
/*
MIT License
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/// @cond
#pragma once
#include <tbengine_version.hpp>
#include <numa.h> // If not found, try installing libnuma-dev (e.g., apt-get install libnuma-dev)
#include <numaif.h>
#include <stdarg.h>
#include <unistd.h>
#include <algorithm>
#include <cstring>
#include <future>
#include <map>
#include <random>
#include <set>
#include <sstream>
#include <string>
#include <string_view>
#include <thread>
#include <vector>
#ifdef NIC_EXEC_ENABLED
#include <arpa/inet.h>
#include <fcntl.h>
#include <infiniband/verbs.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <filesystem>
#include <fstream>
#endif
#if defined(__NVCC__)
#include <cuda_runtime.h>
#else
#include <hip/hip_ext.h>
#include <hip/hip_runtime.h>
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#endif
/// @endcond
/*
* Note: If for any reason we have something that needs a header-only implementation, it can
* be added here.
*/
#if defined(TRANSFERBENCH_HEADER_IMPLEMENTATION_DETAILS)
#endif //-- TRANSFERBENCH_HEADER_IMPLEMENTATION_DETAILS
namespace TransferBench
{
using std::map;
using std::pair;
using std::set;
using std::vector;
// constexpr char VERSION[] = "1.64";
static const auto TB_GIT_BRANCH = std::string_view(TRANSFERBENCH_GIT_BRANCH);
static const auto TB_GIT_COMMIT = std::string_view(TRANSFERBENCH_GIT_COMMIT);
static constexpr auto TB_HEADER_VERSION = std::string_view(TRANSFERBENCH_HEADER_VERSION);
static constexpr auto TB_UNKNOWN_VERSION = std::string_view("Unknown");
/**
* Enumeration of supported Executor types
*
* @note The Executor is the device used to perform a Transfer
*/
enum ExeType
{
EXE_CPU = 0, ///< CPU executor (subExecutor = CPU thread)
EXE_GPU_GFX = 1, ///< GPU kernel-based executor (subExecutor = threadblock/CU)
EXE_GPU_DMA = 2, ///< GPU SDMA executor (subExecutor = not supported)
EXE_NIC = 3, ///< NIC RDMA executor (subExecutor = queue pair)
EXE_NIC_NEAREST = 4 ///< NIC RDMA nearest executor (subExecutor = queue pair)
};
char const ExeTypeStr[6] = "CGDIN";
inline bool IsCpuExeType(ExeType e) { return e == EXE_CPU; }
inline bool IsGpuExeType(ExeType e) { return e == EXE_GPU_GFX || e == EXE_GPU_DMA; }
inline bool IsNicExeType(ExeType e) { return e == EXE_NIC || e == EXE_NIC_NEAREST; }
/**
* A ExeDevice defines a specific Executor
*/
struct ExeDevice
{
ExeType exeType; ///< Executor type
int32_t exeIndex; ///< Executor index
bool operator<(ExeDevice const& other) const
{
return (exeType < other.exeType) ||
(exeType == other.exeType && exeIndex < other.exeIndex);
}
};
/**
* Enumeration of supported memory types
*
* @note These are possible types of memory to be used as sources/destinations
*/
enum MemType
{
MEM_CPU = 0, ///< Coarse-grained pinned CPU memory
MEM_GPU = 1, ///< Coarse-grained global GPU memory
MEM_CPU_FINE = 2, ///< Fine-grained pinned CPU memory
MEM_GPU_FINE = 3, ///< Fine-grained global GPU memory
MEM_CPU_UNPINNED = 4, ///< Unpinned CPU memory
MEM_NULL = 5, ///< NULL memory - used for empty
MEM_MANAGED = 6, ///< Managed memory
MEM_CPU_CLOSEST = 7, ///< Coarse-grained pinned CPU memory indexed by closest GPU
};
char const MemTypeStr[9] = "CGBFUNMP";
inline bool IsCpuMemType(MemType m)
{
return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED || m == MEM_CPU_CLOSEST);
}
inline bool IsGpuMemType(MemType m)
{
return (m == MEM_GPU || m == MEM_GPU_FINE || m == MEM_MANAGED);
}
/**
* A MemDevice indicates a memory type on a specific device
*/
struct MemDevice
{
MemType memType; ///< Memory type
int32_t memIndex; ///< Device index
bool operator<(MemDevice const& other) const
{
return (memType < other.memType) ||
(memType == other.memType && memIndex < other.memIndex);
}
};
/**
* A Transfer adds together data from zero or more sources then writes the sum to zero or more
* destinations
*/
struct Transfer
{
size_t numBytes = 0; ///< Number of bytes to Transfer
vector<MemDevice> srcs = {}; ///< List of source memory devices
vector<MemDevice> dsts = {}; ///< List of destination memory devices
ExeDevice exeDevice = {}; ///< Executor to use
int32_t exeSubIndex = -1; ///< Executor subindex
int numSubExecs = 0; ///< Number of subExecutors to use for this Transfer
};
/**
* General options
*/
struct GeneralOptions
{
int numIterations = 10; ///< Number of timed iterations to perform. If negative, run for
///< -numIterations seconds instead
int numSubIterations = 1; ///< Number of sub-iterations per iteration
int numWarmups = 3; ///< Number of un-timed warmup iterations to perform
int recordPerIteration = 0; ///< Record per-iteration timing information
int useInteractive = 0; ///< Pause for user-input before starting transfer loop
};
/**
* Data options
*/
struct DataOptions
{
int alwaysValidate = 0; ///< Validate after each iteration instead of once at end
int blockBytes = 256; ///< Each subexecutor works on a multiple of this many bytes
int byteOffset = 0; ///< Byte-offset for memory allocations
vector<float> fillPattern = {}; ///< Pattern of floats used to fill source data
///< Customized data patterns (overrides fillPattern if non-empty)
vector<int> fillCompress = {};
int validateDirect = 0; ///< Validate GPU results directly instead of copying to host
int validateSource = 0; ///< Validate src GPU memory immediately after preparation
};
/**
* GFX Executor options
*/
struct GfxOptions
{
int blockOrder = 0; ///< Determines how threadblocks are ordered (0=sequential,
///< 1=interleaved, 2=random)
int blockSize = 256; ///< Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask = {}; ///< Bit-vector representing the CU mask
///< 2D table with preferred XCD to use for a specific [src][dst] GPU device
vector<vector<int>> prefXccTable = {};
int temporalMode = 0; ///< Non-temporal load/store mode 0=none, 1=load, 2=store, 3=both
int unrollFactor = 4; ///< GFX-kernel unroll factor
int useHipEvents = 1; ///< Use HIP events for timing GFX Executor
int useMultiStream = 0; ///< Use multiple streams for GFX
int useSingleTeam = 0; ///< Team all subExecutors across the data array
int waveOrder = 0; ///< GFX-kernel wavefront ordering
int wordSize = 4; ///< GFX-kernel packed data size (4=dwordx4, 2=dwordx2, 1=dwordx1)
};
/**
* DMA Executor options
*/
struct DmaOptions
{
int useHipEvents = 1; ///< Use HIP events for timing DMA Executor
int useHsaCopy = 0; ///< Use HSA copy instead of HIP copy to perform DMA
};
/**
* NIC Executor options
*/
struct NicOptions
{
vector<int> closestNics = {}; ///< Overrides the auto-detected closest NIC per GPU
int ibGidIndex = -1; ///< GID Index for RoCE NICs (-1 is auto)
uint8_t ibPort = 1; ///< NIC port number to be used
int ipAddressFamily = 4; ///< 4=IPv4, 6=IPv6 (used for auto GID detection)
int maxRecvWorkReq = 16; ///< Maximum number of recv work requests per queue pair
int maxSendWorkReq = 16; ///< Maximum number of send work requests per queue pair
int queueSize = 100; ///< Completion queue size
int roceVersion = 2; ///< RoCE version (used for auto GID detection)
int useRelaxedOrder = 1; ///< Use relaxed ordering
int useNuma = 0; ///< Switch to closest numa thread for execution
};
/**
* Configuration options for performing Transfers
*/
struct ConfigOptions
{
GeneralOptions general; ///< General options
DataOptions data; ///< Data options
GfxOptions gfx; ///< GFX executor options
DmaOptions dma; ///< DMA executor options
NicOptions nic; ///< NIC executor options
};
/**
* Enumeration of possible error types
*/
enum ErrType
{
ERR_NONE = 0, ///< No errors
ERR_WARN = 1, ///< Warning - results may not be accurate
ERR_FATAL = 2, ///< Fatal error - results are invalid
};
/**
* Enumeration of GID priority
*
* @note These are the GID types ordered in priority from lowest (0) to highest
*/
enum GidPriority
{
UNKNOWN = -1, ///< Default
ROCEV1_LINK_LOCAL = 0, ///< RoCEv1 Link-local
ROCEV2_LINK_LOCAL = 1, ///< RoCEv2 Link-local fe80::/10
ROCEV1_IPV6 = 2, ///< RoCEv1 IPv6
ROCEV2_IPV6 = 3, ///< RoCEv2 IPv6
ROCEV1_IPV4 = 4, ///< RoCEv1 IPv4-mapped IPv6
ROCEV2_IPV4 = 5, ///< RoCEv2 IPv4-mapped IPv6 ::ffff:192.168.x.x
};
const char* GidPriorityStr[] = {"RoCEv1 Link-local",
"RoCEv2 Link-local",
"RoCEv1 IPv6",
"RoCEv2 IPv6",
"RoCEv1 IPv4-mapped IPv6",
"RoCEv2 IPv4-mapped IPv6"};
/**
* ErrResult consists of error type and error message
*/
struct ErrResult
{
ErrType errType; ///< Error type
std::string errMsg; ///< Error details
ErrResult() = default;
// clang-format off
#if defined(__NVCC__)
ErrResult(cudaError_t err);
#else
ErrResult(hipError_t err);
ErrResult(hsa_status_t err);
#endif
ErrResult(ErrType err);
ErrResult(ErrType errType, const char* format, ...);
// clang-format on
};
/**
* Results for a single Executor
*/
struct ExeResult
{
size_t numBytes; ///< Total bytes transferred by this Executor
double avgDurationMsec; ///< Averaged duration for all the Transfers for this Executor
double avgBandwidthGbPerSec; ///< Average bandwidth for this Executor
double sumBandwidthGbPerSec; ///< Naive sum of individual Transfer average bandwidths
vector<int> transferIdx; ///< Indices of Transfers this Executor executed
};
/**
* Results for a single Transfer
*/
struct TransferResult
{
size_t numBytes; ///< Number of bytes transferred by this Transfer
///< Duration for this Transfer, averaged over all timed iterations
double avgDurationMsec;
double avgBandwidthGbPerSec; ///< Bandwidth for this Transfer based on averaged duration
// Only filled in if recordPerIteration = 1
vector<double> perIterMsec; ///< Duration for each individual iteration
vector<set<pair<int, int>>> perIterCUs; ///< GFX-Executor only. XCC:CU used per iteration
ExeDevice exeDevice; ///< Tracks which executor performed this Transfer (e.g. for
///< EXE_NIC_NEAREST)
ExeDevice exeDstDevice; ///< Tracks actual destination executor (only valid for
///< EXE_NIC/EXE_NIC_NEAREST)
};
/**
* TestResults contain timing results for a set of Transfers as a group as well as per Executor and
* per Transfer timing information
*/
struct TestResults
{
int numTimedIterations; ///< Number of iterations executed
size_t totalBytesTransferred; ///< Total bytes transferred per iteration
double avgTotalDurationMsec; ///< Wall-time (msec) to finish all Transfers (averaged
///< across all timed iterations)
double avgTotalBandwidthGbPerSec; ///< Bandwidth based on all Transfers and average wall
///< time
double overheadMsec; ///< Difference between total wall time and slowest executor
map<ExeDevice, ExeResult> exeResults; ///< Per Executor results
vector<TransferResult> tfrResults; ///< Per Transfer results
vector<ErrResult> errResults; ///< List of any errors/warnings that occurred
};
/**
* Run a set of Transfers
*
* @param[in] config Configuration options
* @param[in] transfers Set of Transfers to execute
* @param[out] results Timing results
* @returns true if and only if Transfers were run successfully without any fatal errors
*/
bool RunTransfers(ConfigOptions const& config,
vector<Transfer> const& transfers,
TestResults& results);
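/*
Minimal usage sketch (illustrative only; the 64 MiB size, device indices, and
CU count below are assumptions chosen for the example, not library defaults):

ConfigOptions cfg; // default configuration

Transfer t;
t.numBytes = 64 * 1024 * 1024; // 64 MiB per iteration
t.srcs = {{MEM_GPU, 0}}; // read from GPU 0 device memory
t.dsts = {{MEM_GPU, 1}}; // write to GPU 1 device memory
t.exeDevice = {EXE_GPU_GFX, 0}; // GFX kernel launched on GPU 0
t.exeSubIndex = -1; // no specific XCC requested
t.numSubExecs = 8; // use 8 CUs

TestResults results;
if (TransferBench::RunTransfers(cfg, {t}, results)) {
printf("%.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
}
*/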
/**
* Enumeration of implementation attributes
*/
enum IntAttribute
{
ATR_GFX_MAX_BLOCKSIZE, ///< Maximum blocksize for GFX executor
ATR_GFX_MAX_UNROLL, ///< Maximum unroll factor for GFX executor
};
enum StrAttribute
{
ATR_SRC_PREP_DESCRIPTION ///< Description of how source memory is prepared
};
/**
* Query attributes (integer)
*
* @note This allows querying of implementation information such as limits
*
* @param[in] attribute Attribute to query
* @returns Value of the attribute
*/
int GetIntAttribute(IntAttribute attribute);
/**
* Query attributes (string)
*
* @note This allows querying of implementation details such as limits
*
* @param[in] attribute Attribute to query
* @returns Value of the attribute
*/
std::string GetStrAttribute(StrAttribute attribute);
/**
* Returns the number of available Executors
*
* @param[in] exeType Executor type to query
* @returns Number of detected Executors of exeType
*/
int GetNumExecutors(ExeType exeType);
/**
* Returns the number of possible Executor subindices
*
* @note For CPU, this is 0
* @note For GFX, this refers to the number of XCDs
* @note For DMA, this refers to the number of DMA engines
*
* @param[in] exeDevice The specific Executor to query
* @returns Number of detected executor subindices
*/
int GetNumExecutorSubIndices(ExeDevice exeDevice);
/**
* Returns number of subExecutors for a given ExeDevice
*
* @param[in] exeDevice The specific Executor to query
* @returns Number of detected subExecutors for the given ExeDevice
*/
int GetNumSubExecutors(ExeDevice exeDevice);
/**
* Returns the index of the NUMA node closest to the given GPU
*
* @param[in] gpuIndex Index of the GPU to query
* @returns NUMA node index closest to GPU gpuIndex, or -1 if unable to detect
*/
int GetClosestCpuNumaToGpu(int gpuIndex);
/**
* Returns the index of the NUMA node closest to the given NIC
*
* @param[in] nicIndex Index of the NIC to query
* @returns NUMA node index closest to the NIC nicIndex, or -1 if unable to detect
*/
int GetClosestCpuNumaToNic(int nicIndex);
/**
* Returns the index of the NIC closest to the given GPU
*
* @param[in] gpuIndex Index of the GPU to query
* @note This function is applicable when the IBV/RDMA executor is available
* @returns IB Verbs capable NIC index closest to GPU gpuIndex, or -1 if unable to detect
*/
int GetClosestNicToGpu(int gpuIndex);
/**
* Helper function to parse a line containing Transfers into a vector of Transfers
*
* @param[in] str String containing description of Transfers
* @param[out] transfers List of Transfers described by 'str'
* @returns Information about any error that may have occurred
*/
ErrResult ParseTransfers(std::string str, std::vector<Transfer>& transfers);
/**
* Helper function to get the built-in version
*
* @returns The built-in TransferBench version as a string
*/
auto GetTransferBenchVersion() -> const std::string;
/**
* Helper function to get branch information
*
* @returns The TransferBench branch information as a string
*/
auto GetTransferBenchBranch() -> const std::string;
/**
* Helper function to get the built-in git commit
*
* @returns The built-in TransferBench git commit hash as a string
*/
auto GetTransferBenchCommitHash([[maybe_unused]] bool is_long_commit = true) -> const std::string;
}; // namespace TransferBench
//==========================================================================================
// End of TransferBench API
//==========================================================================================
// Redefinitions for CUDA compatibility
//==========================================================================================
// clang-format off
#if defined(__NVCC__)
// ROCm specific
#define wall_clock64 clock64
#define gcnArchName name
// Datatypes
#define hipDeviceProp_t cudaDeviceProp
#define hipError_t cudaError_t
#define hipEvent_t cudaEvent_t
#define hipStream_t cudaStream_t
// Enumerations
#define hipDeviceAttributeClockRate cudaDevAttrClockRate
#define hipDeviceAttributeMultiprocessorCount cudaDevAttrMultiProcessorCount
#define hipErrorPeerAccessAlreadyEnabled cudaErrorPeerAccessAlreadyEnabled
#define hipFuncCachePreferShared cudaFuncCachePreferShared
#define hipMemcpyDefault cudaMemcpyDefault
#define hipMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define hipMemcpyHostToDevice cudaMemcpyHostToDevice
#define hipSuccess cudaSuccess
// Functions
#define hipDeviceCanAccessPeer cudaDeviceCanAccessPeer
#define hipDeviceEnablePeerAccess cudaDeviceEnablePeerAccess
#define hipDeviceGetAttribute cudaDeviceGetAttribute
#define hipDeviceGetPCIBusId cudaDeviceGetPCIBusId
#define hipDeviceSetCacheConfig cudaDeviceSetCacheConfig
#define hipDeviceSynchronize cudaDeviceSynchronize
#define hipEventCreate cudaEventCreate
#define hipEventDestroy cudaEventDestroy
#define hipEventElapsedTime cudaEventElapsedTime
#define hipEventRecord cudaEventRecord
#define hipFree cudaFree
#define hipGetDeviceCount cudaGetDeviceCount
#define hipGetDeviceProperties cudaGetDeviceProperties
#define hipGetErrorString cudaGetErrorString
#define hipHostFree cudaFreeHost
#define hipHostMalloc cudaMallocHost
#define hipMalloc cudaMalloc
#define hipMallocManaged cudaMallocManaged
#define hipMemcpy cudaMemcpy
#define hipMemcpyAsync cudaMemcpyAsync
#define hipMemset cudaMemset
#define hipMemsetAsync cudaMemsetAsync
#define hipSetDevice cudaSetDevice
#define hipStreamCreate cudaStreamCreate
#define hipStreamDestroy cudaStreamDestroy
#define hipStreamSynchronize cudaStreamSynchronize
// clang-format on
// Define float2 addition operator for NVIDIA platform
__device__ inline float2& operator+=(float2& a, const float2& b)
{
a.x += b.x;
a.y += b.y;
return a;
}
// Define float4 addition operator for NVIDIA platform
__device__ inline float4& operator+=(float4& a, const float4& b)
{
a.x += b.x;
a.y += b.y;
a.z += b.z;
a.w += b.w;
return a;
}
#endif
// Helper macro functions
//==========================================================================================
// Macro for collecting the CU/SM that a GFX kernel is running on
// clang-format off
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1150__) || defined(__gfx1151__) || \
defined(__gfx1200__) || defined(__gfx1201__)
#define GetHwId(hwId) hwId = 0
#elif defined(__NVCC__)
#define GetHwId(hwId) asm("mov.u32 %0, %smid;" : "=r"(hwId))
#else
#define GetHwId(hwId) asm volatile("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s"(hwId));
#endif
// Macro for collecting the XCC that a GFX kernel is running on
#if defined(__gfx942__) || defined(__gfx950__)
#define GetXccId(val) asm volatile("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s"(val));
#else
#define GetXccId(val) val = 0
#endif
// Error check macro (NOTE: This will return even for ERR_WARN)
#define ERR_CHECK(cmd) \
do { \
ErrResult err = (cmd); \
if (err.errType != ERR_NONE) { \
return err; \
} \
} while (0)
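// Usage sketch (illustrative): within any function returning ErrResult,
// ERR_CHECK propagates the first non-ERR_NONE result (including warnings);
// hipError_t converts implicitly through the ErrResult constructor:
//   ErrResult SelectDevice(int id)
//   {
//     ERR_CHECK(hipSetDevice(id)); // returns early on failure
//     return ERR_NONE;
//   }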
// Appends warn/fatal errors to a list, returns false if fatal
#define ERR_APPEND(cmd, list) \
do { \
ErrResult err = (cmd); \
if (err.errType != ERR_NONE) { \
list.push_back(err); \
} \
if (err.errType == ERR_FATAL) { \
return false; \
} \
} while (0)
// Helper macros for calling RDMA functions and reporting errors
#ifdef VERBS_DEBUG
#define IBV_CALL(func_, ...) \
do { \
int error = func_(__VA_ARGS__); \
if (error != 0) { \
return {ERR_FATAL, \
"Encountered IbVerbs error (%d) at line (%d) " \
"and function (%s)", \
(error), \
__LINE__, \
#func_}; \
} \
} while (0)
#define IBV_PTR_CALL(ptr_, func_, ...) \
do { \
ptr_ = func_(__VA_ARGS__); \
if (ptr_ == nullptr) { \
return {ERR_FATAL, \
"Encountered IbVerbs nullptr error at line (%d) " \
"and function (%s)", \
__LINE__, \
#func_}; \
} \
} while (0)
#else
#define IBV_CALL(func_, ...) \
do { \
int error = func_(__VA_ARGS__); \
if (error != 0) { \
return {ERR_FATAL, "Encountered IbVerbs error (%d) in func (%s) ", error, #func_}; \
} \
} while (0)
#define IBV_PTR_CALL(ptr_, func_, ...) \
do { \
ptr_ = func_(__VA_ARGS__); \
if (ptr_ == nullptr) { \
return {ERR_FATAL, "Encountered IbVerbs nullptr error in func (%s) ", #func_}; \
} \
} while (0)
#endif
// clang-format on
namespace TransferBench
{
/// @cond
// Helper functions ('hidden' in anonymous namespace)
//========================================================================================
namespace
{
// Constants
//========================================================================================
int constexpr MAX_BLOCKSIZE = 1024; // Max threadblock size
int constexpr MAX_WAVEGROUPS = MAX_BLOCKSIZE / 64; // Max wavegroups/warps
int constexpr MAX_UNROLL = 8; // Max unroll factor
int constexpr MAX_SRCS = 8; // Max srcs per Transfer
int constexpr MAX_DSTS = 8; // Max dsts per Transfer
int constexpr MEMSET_CHAR = 75; // Value to memset (char)
float constexpr MEMSET_VAL = 13323083.0f; // Value to memset (float)
// Parsing-related functions
//========================================================================================
static ErrResult CharToMemType(char const c, MemType& memType)
{
char const* val = strchr(MemTypeStr, toupper(c));
if (val) {
memType = (MemType)(val - MemTypeStr);
return ERR_NONE;
}
return {ERR_FATAL, "Unexpected memory type (%c)", c};
}
static ErrResult CharToExeType(char const c, ExeType& exeType)
{
char const* val = strchr(ExeTypeStr, toupper(c));
if (val) {
exeType = (ExeType)(val - ExeTypeStr);
return ERR_NONE;
}
return {ERR_FATAL, "Unexpected executor type (%c)", c};
}
static ErrResult ParseMemType(std::string const& token, std::vector<MemDevice>& memDevices)
{
char memTypeChar;
int offset = 0, memIndex, inc;
MemType memType;
bool found = false;
memDevices.clear();
while (sscanf(token.c_str() + offset, " %c %d%n", &memTypeChar, &memIndex, &inc) == 2) {
offset += inc;
ErrResult err = CharToMemType(memTypeChar, memType);
if (err.errType != ERR_NONE) { return err; }
if (memType != MEM_NULL) { memDevices.push_back({memType, memIndex}); }
found = true;
}
if (found) { return ERR_NONE; }
return {ERR_FATAL,
"Unable to parse memory type token %s. Expected one of %s followed by an index",
token.c_str(),
MemTypeStr};
}
static ErrResult ParseExeType(std::string const& token, ExeDevice& exeDevice, int& exeSubIndex)
{
char exeTypeChar;
exeSubIndex = -1;
int numTokensParsed = sscanf(
token.c_str(), " %c%d.%d", &exeTypeChar, &exeDevice.exeIndex, &exeSubIndex);
if (numTokensParsed < 2) {
return {ERR_FATAL,
"Unable to parse valid executor token (%s)."
"Expected one of %s followed by an index",
token.c_str(),
ExeTypeStr};
}
return CharToExeType(exeTypeChar, exeDevice.exeType);
}
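// Token examples (illustrative; assumes the conventional ExeTypeStr mapping
// where 'C' denotes CPU, 'G' denotes GFX, and 'D' denotes DMA):
//   "G2"   -> exeDevice = {EXE_GPU_GFX, 2}, exeSubIndex = -1
//   "D0.1" -> exeDevice = {EXE_GPU_DMA, 0}, exeSubIndex = 1 (DMA engine 1)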
// Memory-related functions
//========================================================================================
// Enable peer access between two GPUs
static ErrResult EnablePeerAccess(int const deviceId, int const peerDeviceId)
{
int canAccess;
ERR_CHECK(hipDeviceCanAccessPeer(&canAccess, deviceId, peerDeviceId));
if (!canAccess) {
return {ERR_FATAL,
"Peer access is unavailable between GPU devices %d to %d."
"For AMD hardware, check IOMMU configuration",
peerDeviceId,
deviceId};
}
ERR_CHECK(hipSetDevice(deviceId));
hipError_t error = hipDeviceEnablePeerAccess(peerDeviceId, 0);
if (error != hipSuccess && error != hipErrorPeerAccessAlreadyEnabled) {
return {ERR_FATAL,
"Unable to enable peer to peer access from %d to %d (%s)",
deviceId,
peerDeviceId,
hipGetErrorString(error)};
}
return ERR_NONE;
}
// Check that CPU memory array of numBytes has been allocated on targetId NUMA node
static ErrResult CheckPages(char* array, size_t numBytes, int targetId)
{
size_t const pageSize = getpagesize();
size_t const numPages = (numBytes + pageSize - 1) / pageSize;
std::vector<void*> pages(numPages);
std::vector<int> status(numPages);
pages[0] = array;
for (auto i = std::size_t(1); i < numPages; i++) { pages[i] = (char*)pages[i - 1] + pageSize; }
long const retCode = move_pages(0, numPages, pages.data(), NULL, status.data(), 0);
if (retCode) {
return {ERR_FATAL,
"Unable to collect page table information for allocated memory. "
"Ensure NUMA library is installed properly"};
}
size_t mistakeCount = 0;
for (size_t i = 0; i < numPages; i++) {
if (status[i] < 0) {
return {ERR_FATAL, "Unexpected page status (%d) for page %llu", status[i], i};
}
if (status[i] != targetId) { mistakeCount++; }
}
if (mistakeCount > 0) {
return {ERR_FATAL,
"%lu out of %lu pages for memory allocation were not on NUMA node %d."
" This could be due to hardware memory issues, or the use of numa-rebalancing "
"daemons such as numad",
mistakeCount,
numPages,
targetId};
}
return ERR_NONE;
}
// Allocate memory
static ErrResult AllocateMemory(MemDevice memDevice, size_t numBytes, void** memPtr)
{
if (numBytes == 0) { return {ERR_FATAL, "Unable to allocate 0 bytes"}; }
*memPtr = nullptr;
MemType const& memType = memDevice.memType;
if (IsCpuMemType(memType)) {
// Determine which NUMA device to use
int numaIdx = memDevice.memIndex;
if (memType == MEM_CPU_CLOSEST) { numaIdx = GetClosestCpuNumaToGpu(memDevice.memIndex); }
// Set NUMA policy prior to call to hipHostMalloc
numa_set_preferred(numaIdx);
// Allocate host-pinned memory (should respect NUMA mem policy)
if (memType == MEM_CPU_FINE) {
// clang-format off
#if defined(__NVCC__)
return {ERR_FATAL, "Fine-grained CPU memory not supported on NVIDIA platform"};
#else
ERR_CHECK(hipHostMalloc((void**)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocCoherent));
#endif
} else if (memType == MEM_CPU || memType == MEM_CPU_CLOSEST) {
#if defined(__NVCC__)
ERR_CHECK(hipHostMalloc((void**)memPtr, numBytes, 0));
#else
ERR_CHECK(hipHostMalloc((void**)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent));
#endif
// clang-format on
} else if (memType == MEM_CPU_UNPINNED) {
*memPtr = numa_alloc_onnode(numBytes, numaIdx);
if (*memPtr == nullptr) {
return {ERR_FATAL, "Unable to allocate %lu bytes on NUMA node %d", numBytes, numaIdx};
}
}
// Check that the allocated pages are actually on the correct NUMA node
memset(*memPtr, 0, numBytes);
ERR_CHECK(CheckPages((char*)*memPtr, numBytes, numaIdx));
// Reset to default numa mem policy
numa_set_preferred(-1);
} else if (IsGpuMemType(memType)) {
// Switch to the appropriate GPU
ERR_CHECK(hipSetDevice(memDevice.memIndex));
if (memType == MEM_GPU) {
// Allocate GPU memory on appropriate device
ERR_CHECK(hipMalloc((void**)memPtr, numBytes));
} else if (memType == MEM_GPU_FINE) {
// clang-format off
#if defined(__NVCC__)
return {ERR_FATAL, "Fine-grained GPU memory not supported on NVIDIA platform"};
#else
int flag = hipDeviceMallocUncached;
ERR_CHECK(hipExtMallocWithFlags((void**)memPtr, numBytes, flag));
#endif
// clang-format on
} else if (memType == MEM_MANAGED) {
ERR_CHECK(hipMallocManaged((void**)memPtr, numBytes));
}
// Clear the memory
ERR_CHECK(hipMemset(*memPtr, 0, numBytes));
ERR_CHECK(hipDeviceSynchronize());
} else {
return {ERR_FATAL, "Unsupported memory type (%d)", memType};
}
return ERR_NONE;
}
// Deallocate memory
static ErrResult DeallocateMemory(MemType memType, void* memPtr, size_t const bytes)
{
// Avoid deallocating nullptr
if (memPtr == nullptr) {
return {ERR_FATAL, "Attempted to free null pointer for %lu bytes", bytes};
}
switch (memType) {
case MEM_CPU:
case MEM_CPU_FINE:
case MEM_CPU_CLOSEST: {
ERR_CHECK(hipHostFree(memPtr));
break;
}
case MEM_CPU_UNPINNED: {
numa_free(memPtr, bytes);
break;
}
case MEM_GPU:
case MEM_GPU_FINE:
case MEM_MANAGED: {
ERR_CHECK(hipFree(memPtr));
break;
}
default:
return {ERR_FATAL, "Attempting to deallocate unrecognized memory type (%d)", memType};
}
return ERR_NONE;
}
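// Usage sketch (illustrative): allocate 1 MiB of pinned host memory on NUMA
// node 0 and release it again; GetHsaAgent() below uses the same pattern with
// temporary allocations to probe agent ownership:
//   float* buf = nullptr;
//   ERR_CHECK(AllocateMemory({MEM_CPU, 0}, 1 << 20, (void**)&buf));
//   // ... use buf ...
//   ERR_CHECK(DeallocateMemory(MEM_CPU, buf, 1 << 20));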
// HSA-related functions
//========================================================================================
#if !defined(__NVCC__)
// Get the hsa_agent_t associated with a ExeDevice
static ErrResult GetHsaAgent(ExeDevice const& exeDevice, hsa_agent_t& agent)
{
static bool isInitialized = false;
static std::vector<hsa_agent_t> cpuAgents;
static std::vector<hsa_agent_t> gpuAgents;
int const& exeIndex = exeDevice.exeIndex;
int const numCpus = GetNumExecutors(EXE_CPU);
int const numGpus = GetNumExecutors(EXE_GPU_GFX);
// Initialize results on first use
if (!isInitialized) {
hsa_amd_pointer_info_t info;
info.size = sizeof(info);
ErrResult err;
int32_t* tempBuffer;
// Index CPU agents
cpuAgents.clear();
for (int i = 0; i < numCpus; i++) {
ERR_CHECK(AllocateMemory({MEM_CPU, i}, 1024, (void**)&tempBuffer));
ERR_CHECK(hsa_amd_pointer_info(tempBuffer, &info, NULL, NULL, NULL));
cpuAgents.push_back(info.agentOwner);
ERR_CHECK(DeallocateMemory(MEM_CPU, tempBuffer, 1024));
}
// Index GPU agents
gpuAgents.clear();
for (int i = 0; i < numGpus; i++) {
ERR_CHECK(AllocateMemory({MEM_GPU, i}, 1024, (void**)&tempBuffer));
ERR_CHECK(hsa_amd_pointer_info(tempBuffer, &info, NULL, NULL, NULL));
gpuAgents.push_back(info.agentOwner);
ERR_CHECK(DeallocateMemory(MEM_GPU, tempBuffer, 1024));
}
isInitialized = true;
}
switch (exeDevice.exeType) {
case EXE_CPU:
if (exeIndex < 0 || exeIndex >= numCpus) {
return {ERR_FATAL, "CPU index must be between 0 and %d inclusively", numCpus - 1};
}
agent = cpuAgents[exeDevice.exeIndex];
break;
case EXE_GPU_GFX:
case EXE_GPU_DMA:
if (exeIndex < 0 || exeIndex >= numGpus) {
return {ERR_FATAL, "GPU index must be between 0 and %d inclusively", numGpus - 1};
}
agent = gpuAgents[exeIndex];
break;
default:
return {ERR_FATAL,
"Attempting to get HSA agent of unknown or unsupported executor type (%d)",
exeDevice.exeType};
}
return ERR_NONE;
}
// Get the hsa_agent_t associated with a MemDevice
static ErrResult GetHsaAgent(MemDevice const& memDevice, hsa_agent_t& agent)
{
if (memDevice.memType == MEM_CPU_CLOSEST) {
return GetHsaAgent({EXE_CPU, GetClosestCpuNumaToGpu(memDevice.memIndex)}, agent);
}
if (IsCpuMemType(memDevice.memType)) {
return GetHsaAgent({EXE_CPU, memDevice.memIndex}, agent);
}
if (IsGpuMemType(memDevice.memType)) {
return GetHsaAgent({EXE_GPU_GFX, memDevice.memIndex}, agent);
}
return {ERR_FATAL,
"Unable to get HSA agent for memDevice (%d,%d)",
memDevice.memType,
memDevice.memIndex};
}
#endif
// Setup validation-related functions
//========================================================================================
static ErrResult GetActualExecutor(ConfigOptions const& cfg,
ExeDevice const& origExeDevice,
ExeDevice& actualExeDevice)
{
// By default, nothing needs to change
actualExeDevice = origExeDevice;
// When using NIC_NEAREST, remap to the closest NIC to the GPU
if (origExeDevice.exeType == EXE_NIC_NEAREST) {
actualExeDevice.exeType = EXE_NIC;
if (cfg.nic.closestNics.size() > 0) {
if (origExeDevice.exeIndex < 0 ||
static_cast<std::size_t>(origExeDevice.exeIndex) >= cfg.nic.closestNics.size()) {
return {ERR_FATAL, "NIC index is out of range (%d)", origExeDevice.exeIndex};
}
actualExeDevice.exeIndex = cfg.nic.closestNics[origExeDevice.exeIndex];
} else {
actualExeDevice.exeIndex = GetClosestNicToGpu(origExeDevice.exeIndex);
}
}
return ERR_NONE;
}
// Validate that MemDevice exists
static ErrResult CheckMemDevice(MemDevice const& memDevice)
{
if (memDevice.memType == MEM_NULL) { return ERR_NONE; }
if (IsCpuMemType(memDevice.memType) && memDevice.memType != MEM_CPU_CLOSEST) {
int numCpus = GetNumExecutors(EXE_CPU);
if (memDevice.memIndex < 0 || memDevice.memIndex >= numCpus) {
return {ERR_FATAL,
"CPU index must be between 0 and %d (instead of %d)",
numCpus - 1,
memDevice.memIndex};
}
return ERR_NONE;
}
if (IsGpuMemType(memDevice.memType) || memDevice.memType == MEM_CPU_CLOSEST) {
int numGpus = GetNumExecutors(EXE_GPU_GFX);
if (memDevice.memIndex < 0 || memDevice.memIndex >= numGpus) {
return {ERR_FATAL,
"GPU index must be between 0 and %d (instead of %d)",
numGpus - 1,
memDevice.memIndex};
}
if (memDevice.memType == MEM_CPU_CLOSEST) {
if (GetClosestCpuNumaToGpu(memDevice.memIndex) == -1) {
return {ERR_FATAL,
"Unable to determine closest NUMA node for GPU %d",
memDevice.memIndex};
}
}
return ERR_NONE;
}
return {ERR_FATAL, "Unsupported memory type (%d)", memDevice.memType};
}
// Validate configuration options - returns true if and only if a fatal error is detected
static bool ConfigOptionsHaveErrors(ConfigOptions const& cfg, std::vector<ErrResult>& errors)
{
// Check general options
if (cfg.general.numWarmups < 0) {
errors.push_back({ERR_FATAL, "[general.numWarmups] must be a non-negative number"});
}
// Check data options
if (cfg.data.blockBytes == 0 || cfg.data.blockBytes % 4) {
errors.push_back(
{ERR_FATAL, "[data.blockBytes] must be positive multiple of %lu", sizeof(float)});
}
if (cfg.data.byteOffset < 0 || cfg.data.byteOffset % sizeof(float)) {
errors.push_back(
{ERR_FATAL, "[data.byteOffset] must be positive multiple of %lu", sizeof(float)});
}
if (cfg.data.fillCompress.size() > 0 && cfg.data.fillPattern.size() > 0) {
errors.push_back(
{ERR_WARN,
"[data.fillCompress] will override [data.fillPattern] when both are specified"});
}
if (cfg.data.fillCompress.size() > 0) {
int sum = 0;
for (int bin : cfg.data.fillCompress) { sum += bin; }
if (sum != 100) {
errors.push_back({ERR_FATAL, "[data.fillCompress] values must add up to 100"});
}
}
if (cfg.data.fillCompress.size() > 5) {
errors.push_back({ERR_FATAL, "[data.fillCompress] may only have up to 5 values"});
}
// Check GFX options
if (cfg.gfx.blockOrder < 0 || cfg.gfx.blockOrder > 2) {
errors.push_back(
{ERR_FATAL,
"[gfx.blockOrder] must be 0 for sequential, 1 for interleaved, or 2 for random"});
}
if (cfg.gfx.useMultiStream && cfg.gfx.blockOrder > 0) {
errors.push_back(
{ERR_WARN, "[gfx.blockOrder] will be ignored when running in multi-stream mode"});
}
int gfxMaxBlockSize = GetIntAttribute(ATR_GFX_MAX_BLOCKSIZE);
if (cfg.gfx.blockSize < 0 || cfg.gfx.blockSize % 64 || cfg.gfx.blockSize > gfxMaxBlockSize) {
errors.push_back(
{ERR_FATAL,
"[gfx.blockSize] must be positive multiple of 64 less than or equal to %d",
gfxMaxBlockSize});
}
if (cfg.gfx.temporalMode < 0 || cfg.gfx.temporalMode > 3) {
errors.push_back(
{ERR_FATAL, "[gfx.temporalMode] must be non-negative and less than or equal to 3"});
}
// clang-format off
#if defined(__NVCC__)
if (cfg.gfx.temporalMode > 0) {
errors.push_back({ERR_FATAL, "[gfx.temporalMode] is not supported on NVIDIA hardware"});
}
#endif
// clang-format on
int gfxMaxUnroll = GetIntAttribute(ATR_GFX_MAX_UNROLL);
if (cfg.gfx.unrollFactor < 0 || cfg.gfx.unrollFactor > gfxMaxUnroll) {
errors.push_back({ERR_FATAL,
"[gfx.unrollFactor] must be non-negative and less than or equal to %d",
gfxMaxUnroll});
}
if (cfg.gfx.waveOrder < 0 || cfg.gfx.waveOrder >= 6) {
errors.push_back({ERR_FATAL, "[gfx.waveOrder] must be non-negative and less than 6"});
}
if (!(cfg.gfx.wordSize == 1 || cfg.gfx.wordSize == 2 || cfg.gfx.wordSize == 4)) {
errors.push_back({ERR_FATAL, "[gfx.wordSize] must be either 1, 2 or 4"});
}
int numGpus = GetNumExecutors(EXE_GPU_GFX);
int numXccs = GetNumExecutorSubIndices({EXE_GPU_GFX, 0});
vector<vector<int>> const& table = cfg.gfx.prefXccTable;
if (!table.empty()) {
if (table.size() != static_cast<std::size_t>(numGpus)) {
errors.push_back(
{ERR_FATAL, "[gfx.prefXccTable] must be have size %dx%d", numGpus, numGpus});
} else {
for (auto i = std::size_t(0); i < table.size(); i++) {
if (table[i].size() != static_cast<std::size_t>(numGpus)) {
errors.push_back({ERR_FATAL,
"[gfx.prefXccTable] must be have size %dx%d",
numGpus,
numGpus});
break;
} else {
for (auto x : table[i]) {
if (x < 0 || x >= numXccs) {
errors.push_back(
{ERR_FATAL,
"[gfx.prefXccTable] must contain values between 0 and %d",
numXccs - 1});
break;
}
}
}
}
}
}
// clang-format off
// Check NIC options
#ifdef NIC_EXEC_ENABLED
int numNics = GetNumExecutors(EXE_NIC);
for (auto const& nic : cfg.nic.closestNics) {
if (nic < 0 || nic >= numNics) {
errors.push_back(
{ERR_FATAL, "NIC index (%d) in user-specified closest NIC list must be between 0 and %d", nic, numNics - 1});
}
}
size_t closestNicsSize = cfg.nic.closestNics.size();
if (closestNicsSize > 0 && closestNicsSize < static_cast<std::size_t>(numGpus)) {
errors.push_back({ERR_FATAL, "User-specified closest NIC list must match GPU count of %d", numGpus});
}
#endif
// clang-format on
// NVIDIA specific
#if defined(__NVCC__)
if (cfg.data.validateDirect) {
errors.push_back({ERR_FATAL, "[data.validateDirect] is not supported on NVIDIA hardware"});
}
#else
// AMD specific
// Check for largeBar enablement on GPUs
for (int i = 0; i < numGpus; i++) {
int isLargeBar = 0;
hipError_t err = hipDeviceGetAttribute(&isLargeBar, hipDeviceAttributeIsLargeBar, i);
if (err != hipSuccess) {
errors.push_back({ERR_FATAL, "Unable to query if GPU %d has largeBAR enabled", i});
} else if (!isLargeBar) {
errors.push_back({ERR_WARN,
"Large BAR is not enabled for GPU %d in BIOS. "
"Large BAR is required to enable multi-gpu data access",
i});
}
}
#endif
// Check for fatal errors
for (auto const& err : errors) {
if (err.errType == ERR_FATAL) {
return true;
}
}
return false;
}
// Validate Transfers to execute - returns true if and only if fatal error detected
static bool TransfersHaveErrors(ConfigOptions const& cfg, std::vector<Transfer> const& transfers, std::vector<ErrResult>& errors)
{
int numCpus = GetNumExecutors(EXE_CPU);
int numGpus = GetNumExecutors(EXE_GPU_GFX);
int numNics = GetNumExecutors(EXE_NIC);
std::set<ExeDevice> executors;
std::map<ExeDevice, int> transferCount;
std::map<ExeDevice, int> useSubIndexCount;
std::map<ExeDevice, int> totalSubExecs;
// Per-Transfer checks
for (int i = 0; i < static_cast<int>(transfers.size()); i++) {
Transfer const& t = transfers[i];
if (t.numBytes == 0) {
errors.push_back({ERR_FATAL, "Transfer %d: Cannot perform 0-byte transfers", i});
}
if (t.exeDevice.exeType == EXE_GPU_GFX || t.exeDevice.exeType == EXE_CPU) {
size_t const N = t.numBytes / sizeof(float);
int const targetMultiple = cfg.data.blockBytes / sizeof(float);
int const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple, (size_t)t.numSubExecs);
if (maxSubExecToUse < t.numSubExecs) {
errors.push_back({ERR_WARN,
"Transfer %d data size is too small - will only use %d of %d subexecutors",
i,
maxSubExecToUse,
t.numSubExecs});
}
}
// Check sources and destinations
if (t.srcs.empty() && t.dsts.empty()) {
errors.push_back({ERR_FATAL, "Transfer %d: Must have at least one source or destination", i});
}
for (int j = 0; j < static_cast<int>(t.srcs.size()); j++) {
ErrResult err = CheckMemDevice(t.srcs[j]);
if (err.errType != ERR_NONE) {
errors.push_back({ERR_FATAL, "Transfer %d: SRC %d: %s", i, j, err.errMsg.c_str()});
}
}
for (int j = 0; j < static_cast<int>(t.dsts.size()); j++) {
ErrResult err = CheckMemDevice(t.dsts[j]);
if (err.errType != ERR_NONE) {
errors.push_back({ERR_FATAL, "Transfer %d: DST %d: %s", i, j, err.errMsg.c_str()});
}
}
// Check executor
executors.insert(t.exeDevice);
transferCount[t.exeDevice]++;
switch (t.exeDevice.exeType) {
case EXE_CPU:
if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numCpus) {
errors.push_back({ERR_FATAL,
"Transfer %d: CPU index must be between 0 and %d (instead of %d)",
i,
numCpus - 1,
t.exeDevice.exeIndex});
}
break;
case EXE_GPU_GFX:
if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numGpus) {
errors.push_back({ERR_FATAL,
"Transfer %d: GFX index must be between 0 and %d (instead of %d)",
i,
numGpus - 1,
t.exeDevice.exeIndex});
} else {
if (t.exeSubIndex != -1) {
// clang-format off
#if defined(__NVCC__)
errors.push_back({ERR_FATAL, "Transfer %d: GFX executor subindex not supported on NVIDIA hardware", i});
#else
useSubIndexCount[t.exeDevice]++;
int numSubIndices = GetNumExecutorSubIndices(t.exeDevice);
if (t.exeSubIndex >= numSubIndices) {
errors.push_back(
{ERR_FATAL, "Transfer %d: GFX subIndex (XCC) must be between 0 and %d", i, numSubIndices - 1});
}
#endif
// clang-format on
}
}
break;
case EXE_GPU_DMA:
if (t.srcs.size() != 1 || t.dsts.size() != 1) {
errors.push_back(
{ERR_FATAL,
"Transfer %d: DMA executor must have exactly 1 source and 1 destination",
i});
// Cannot safely perform the remaining DMA checks without exactly 1 src/dst
continue;
}
if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numGpus) {
errors.push_back(
{ERR_FATAL,
"Transfer %d: DMA index must be between 0 and %d (instead of %d)",
i,
numGpus - 1,
t.exeDevice.exeIndex});
// Cannot proceed with any further checks
continue;
}
if (t.exeSubIndex != -1) {
// clang-format off
#if defined(__NVCC__)
errors.push_back({ERR_FATAL, "Transfer %d: DMA executor subindex not supported on NVIDIA hardware", i});
#else
useSubIndexCount[t.exeDevice]++;
int numSubIndices = GetNumExecutorSubIndices(t.exeDevice);
if (t.exeSubIndex >= numSubIndices) {
errors.push_back(
{ERR_FATAL, "Transfer %d: DMA subIndex (engine) must be between 0 and %d", i, numSubIndices - 1});
}
// Check that engine Id exists between agents
hsa_agent_t srcAgent, dstAgent;
ErrResult err;
err = GetHsaAgent(t.srcs[0], srcAgent);
if (err.errType != ERR_NONE) {
errors.push_back(err);
if (err.errType == ERR_FATAL) {
break;
}
}
err = GetHsaAgent(t.dsts[0], dstAgent);
if (err.errType != ERR_NONE) {
errors.push_back(err);
if (err.errType == ERR_FATAL) {
break;
}
}
// Skip check of engine Id mask for self copies
if (srcAgent.handle != dstAgent.handle) {
uint32_t engineIdMask = 0;
err = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &engineIdMask);
if (err.errType != ERR_NONE) {
errors.push_back(err);
if (err.errType == ERR_FATAL) {
break;
}
}
hsa_amd_sdma_engine_id_t sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << t.exeSubIndex);
if (!(sdmaEngineId & engineIdMask)) {
errors.push_back({ERR_FATAL,
"Transfer %d: DMA %d.%d does not exist or cannot copy between src/dst",
i,
t.exeDevice.exeIndex,
t.exeSubIndex});
}
}
#endif
// clang-format on
}
if (!IsGpuMemType(t.srcs[0].memType) && !IsGpuMemType(t.dsts[0].memType)) {
errors.push_back({ERR_WARN,
"Transfer %d: No GPU memory for source or destination. Copy "
"might not execute on DMA %d",
i,
t.exeDevice.exeIndex});
} else {
// Currently HIP will use src agent if source memory is GPU, otherwise dst agent
if (IsGpuMemType(t.srcs[0].memType)) {
if (t.srcs[0].memIndex != t.exeDevice.exeIndex) {
errors.push_back({ERR_WARN,
"Transfer %d: DMA executor will automatically switch "
"to using the source "
"memory device (%d) not (%d)",
i,
t.srcs[0].memIndex,
t.exeDevice.exeIndex});
}
} else if (t.dsts[0].memIndex != t.exeDevice.exeIndex) {
errors.push_back(
{ERR_WARN,
"Transfer %d: DMA executor will automatically switch to using the "
"destination memory device (%d) not (%d)",
i,
t.dsts[0].memIndex,
t.exeDevice.exeIndex});
}
}
break;
case EXE_NIC:
// clang-format off
#ifdef NIC_EXEC_ENABLED
{
int srcIndex = t.exeDevice.exeIndex;
int dstIndex = t.exeSubIndex;
if (srcIndex < 0 || srcIndex >= numNics) {
errors.push_back(
{ERR_FATAL, "Transfer %d: src NIC executor indexes an out-of-range NIC (%d)", i, srcIndex});
}
if (dstIndex < 0 || dstIndex >= numNics) {
errors.push_back(
{ERR_FATAL, "Transfer %d: dst NIC executor indexes an out-of-range NIC (%d)", i, dstIndex});
}
}
#else
errors.push_back({ERR_FATAL, "Transfer %d: NIC executor is requested but is not available", i});
#endif
break;
case EXE_NIC_NEAREST:
#ifdef NIC_EXEC_ENABLED
{
ExeDevice srcExeDevice;
ErrResult errSrc = GetActualExecutor(cfg, t.exeDevice, srcExeDevice);
if (errSrc.errType != ERR_NONE) {
errors.push_back(errSrc);
}
ExeDevice dstExeDevice;
ErrResult errDst = GetActualExecutor(cfg, {t.exeDevice.exeType, t.exeSubIndex}, dstExeDevice);
if (errDst.errType != ERR_NONE) {
errors.push_back(errDst);
}
}
#else
errors.push_back({ERR_FATAL, "Transfer %d: NIC executor is requested but is not available", i});
#endif
// clang-format on
break;
}
// Check subexecutors
if (t.numSubExecs <= 0) {
errors.push_back({ERR_FATAL, "Transfer %d: # of subexecutors must be positive", i});
} else {
totalSubExecs[t.exeDevice] += t.numSubExecs;
}
}
int gpuMaxHwQueues = 4;
if (getenv("GPU_MAX_HW_QUEUES")) { gpuMaxHwQueues = atoi(getenv("GPU_MAX_HW_QUEUES")); }
// Aggregate checks
for (auto const& exeDevice : executors) {
switch (exeDevice.exeType) {
case EXE_CPU: {
// Check total number of subexecutors requested
int numCpuSubExec = GetNumSubExecutors(exeDevice);
if (totalSubExecs[exeDevice] > numCpuSubExec) {
errors.push_back({ERR_WARN,
"CPU %d requests %d total cores however only %d available. "
"Serialization will occur",
exeDevice.exeIndex,
totalSubExecs[exeDevice],
numCpuSubExec});
}
break;
}
case EXE_GPU_GFX: {
// Check total number of subexecutors requested
int numGpuSubExec = GetNumSubExecutors(exeDevice);
if (totalSubExecs[exeDevice] > numGpuSubExec) {
errors.push_back({ERR_WARN,
"GPU %d requests %d total CUs however only %d available. "
"Serialization will occur",
exeDevice.exeIndex,
totalSubExecs[exeDevice],
numGpuSubExec});
}
// Check that if executor subindices are used, all Transfers specify executor
// subindices
if (useSubIndexCount[exeDevice] > 0 &&
useSubIndexCount[exeDevice] != transferCount[exeDevice]) {
errors.push_back({ERR_FATAL,
"GPU %d specifies XCC on only %d of %d Transfers. "
"Must either specific none or all",
exeDevice.exeIndex,
useSubIndexCount[exeDevice],
transferCount[exeDevice]});
}
if (cfg.gfx.useMultiStream && transferCount[exeDevice] > gpuMaxHwQueues) {
errors.push_back({ERR_WARN,
"GPU %d attempting %d parallel transfers, however "
"GPU_MAX_HW_QUEUES only set to %d",
exeDevice.exeIndex,
transferCount[exeDevice],
gpuMaxHwQueues});
}
break;
}
case EXE_GPU_DMA: {
// Check that if executor subindices are used, all Transfers specify executor
// subindices
if (useSubIndexCount[exeDevice] > 0 &&
useSubIndexCount[exeDevice] != transferCount[exeDevice]) {
errors.push_back({ERR_FATAL,
"DMA %d specifies engine on only %d of %d Transfers. "
"Must either specific none or all",
exeDevice.exeIndex,
useSubIndexCount[exeDevice],
transferCount[exeDevice]});
}
if (transferCount[exeDevice] > gpuMaxHwQueues) {
errors.push_back({ERR_WARN,
"DMA %d attempting %d parallel transfers, however "
"GPU_MAX_HW_QUEUES only set to %d",
exeDevice.exeIndex,
transferCount[exeDevice],
gpuMaxHwQueues});
}
char* enableSdma = getenv("HSA_ENABLE_SDMA");
if (enableSdma && !strcmp(enableSdma, "0")) {
errors.push_back({ERR_WARN,
"DMA functionality disabled due to environment variable "
"HSA_ENABLE_SDMA=0. "
"DMA %d copies will fallback to blit (GFX) kernels",
exeDevice.exeIndex});
}
break;
}
default: break;
}
}
// Check for fatal errors
for (auto const& err : errors) {
if (err.errType == ERR_FATAL) { return true; }
}
return false;
}
// Internal data structures
//========================================================================================
// Parameters for each SubExecutor
struct SubExecParam
{
// Inputs
size_t N; ///< Number of floats this subExecutor works on
int numSrcs; ///< Number of source arrays
int numDsts; ///< Number of destination arrays
float* src[MAX_SRCS]; ///< Source array pointers
float* dst[MAX_DSTS]; ///< Destination array pointers
int32_t preferredXccId; ///< XCC ID to execute on (GFX only)
// Prepared
int teamSize; ///< Size of the team this sub executor is part of
int teamIdx; ///< Index of this sub executor within its team
// Outputs
long long startCycle; ///< Start timestamp for in-kernel timing (GPU-GFX executor)
long long stopCycle; ///< Stop timestamp for in-kernel timing (GPU-GFX executor)
uint32_t hwId; ///< Hardware ID
uint32_t xccId; ///< XCC ID
};
// Internal resources allocated per Transfer
struct TransferResources
{
int transferIdx; ///< The associated Transfer
size_t numBytes; ///< Number of bytes to Transfer
vector<float*> srcMem; ///< Source memory
vector<float*> dstMem; ///< Destination memory
vector<SubExecParam> subExecParamCpu; ///< Defines subarrays for each subexecutor
vector<int> subExecIdx; ///< Indices into subExecParamGpu
int numaNode; ///< NUMA node to use for this Transfer
// For GFX executor
SubExecParam* subExecParamGpuPtr;
// For targeted-SDMA
// clang-format off
#if !defined(__NVCC__)
hsa_agent_t dstAgent; ///< DMA destination memory agent
hsa_agent_t srcAgent; ///< DMA source memory agent
hsa_signal_t signal; ///< HSA signal for completion
hsa_amd_sdma_engine_id_t sdmaEngineId; ///< DMA engine ID
#endif
// clang-format on
// For IBV executor
// clang-format off
#ifdef NIC_EXEC_ENABLED
int srcNicIndex; ///< SRC NIC index
int dstNicIndex; ///< DST NIC index
ibv_context* srcContext; ///< Device context for SRC NIC
ibv_context* dstContext; ///< Device context for DST NIC
ibv_pd* srcProtect; ///< Protection domain for SRC NIC
ibv_pd* dstProtect; ///< Protection domain for DST NIC
ibv_cq* srcCompQueue; ///< Completion queue for SRC NIC
ibv_cq* dstCompQueue; ///< Completion queue for DST NIC
ibv_port_attr srcPortAttr; ///< Port attributes for SRC NIC
ibv_port_attr dstPortAttr; ///< Port attributes for DST NIC
ibv_gid srcGid; ///< GID handle for SRC NIC
ibv_gid dstGid; ///< GID handle for DST NIC
vector<ibv_qp*> srcQueuePairs; ///< Queue pairs for SRC NIC
vector<ibv_qp*> dstQueuePairs; ///< Queue pairs for DST NIC
ibv_mr* srcMemRegion; ///< Memory region for SRC
ibv_mr* dstMemRegion; ///< Memory region for DST
uint8_t qpCount; ///< Number of QPs to be used for transferring data
vector<ibv_sge> sgePerQueuePair; ///< Scatter-gather elements per queue pair
vector<ibv_send_wr> sendWorkRequests; ///< Send work requests per queue pair
#endif
// clang-format on
// Counters
double totalDurationMsec; ///< Total duration for all iterations for this Transfer
vector<double> perIterMsec; ///< Duration for each individual iteration
vector<set<pair<int, int>>> perIterCUs; ///< GFX-Executor only. XCC:CU used per iteration
};
// Internal resources allocated per Executor
struct ExeInfo
{
size_t totalBytes; ///< Total bytes this executor transfers
double totalDurationMsec; ///< Total duration for all iterations for this Executor
int totalSubExecs; ///< Total number of subExecutors to use
bool useSubIndices; ///< Use subexecutor indices
int numSubIndices; ///< Number of subindices this ExeDevice has
vector<SubExecParam> subExecParamCpu; ///< Subexecutor parameters for this executor
vector<TransferResources> resources; ///< Per-Transfer resources
// For GPU-Executors
SubExecParam* subExecParamGpu; ///< GPU copy of subExecutor parameters
vector<hipStream_t> streams; ///< HIP streams to launch on
vector<hipEvent_t> startEvents; ///< HIP start timing event
vector<hipEvent_t> stopEvents; ///< HIP stop timing event
int wallClockRate; ///< (GFX-only) Device wall clock rate
};
// Structure to track PCIe topology
struct PCIeNode
{
std::string address; ///< PCIe address for this PCIe node
std::string description; ///< Description for this PCIe node
std::set<PCIeNode> children; ///< Children PCIe nodes
// Default constructor
PCIeNode() : address(""), description("") {}
// Constructor
PCIeNode(std::string const& addr) : address(addr) {}
// Constructor
PCIeNode(std::string const& addr, std::string const& desc)
: address(addr), description(desc)
{}
// Comparison operator for std::set
bool operator<(PCIeNode const& other) const { return address < other.address; }
};
#ifdef NIC_EXEC_ENABLED
// Structure to track information about IBV devices
struct IbvDevice
{
ibv_device* devicePtr = nullptr;
std::string name;
std::string busId;
bool hasActivePort = false;
int numaNode = -1;
int gidIndex = -1;
std::string gidDescriptor;
bool isRoce = false;
};
#endif
#ifdef NIC_EXEC_ENABLED
// Function to collect information about IBV devices
//========================================================================================
static bool IsConfiguredGid(union ibv_gid const& gid)
{
const struct in6_addr* a = (struct in6_addr*)gid.raw;
int trailer = (a->s6_addr32[1] | a->s6_addr32[2] | a->s6_addr32[3]);
if (((a->s6_addr32[0] | trailer) == 0UL) ||
((a->s6_addr32[0] == htonl(0xfe800000)) && (trailer == 0UL))) {
return false;
}
return true;
}
static bool LinkLocalGid(union ibv_gid const& gid)
{
const struct in6_addr* a = (struct in6_addr*)gid.raw;
if (a->s6_addr32[0] == htonl(0xfe800000) && a->s6_addr32[1] == 0UL) { return true; }
return false;
}
static ErrResult GetRoceVersionNumber(struct ibv_context* const& context,
int const& portNum,
int const& gidIndex,
int& version)
{
char const* deviceName = ibv_get_device_name(context->device);
char gidRoceVerStr[16] = {};
char roceTypePath[PATH_MAX] = {};
snprintf(roceTypePath,
sizeof(roceTypePath),
"/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d",
deviceName,
portNum,
gidIndex);
int fd = open(roceTypePath, O_RDONLY);
if (fd == -1) { return {ERR_FATAL, "Failed while opening RoCE file path (%s)", roceTypePath}; }
int ret = read(fd, gidRoceVerStr, 15);
close(fd);
if (ret == -1) { return {ERR_FATAL, "Failed while reading RoCE version"}; }
if (strlen(gidRoceVerStr)) {
if (strncmp(gidRoceVerStr, "IB/RoCE v1", strlen("IB/RoCE v1")) == 0 ||
strncmp(gidRoceVerStr, "RoCE v1", strlen("RoCE v1")) == 0) {
version = 1;
} else if (strncmp(gidRoceVerStr, "RoCE v2", strlen("RoCE v2")) == 0) {
version = 2;
}
}
return ERR_NONE;
}
static bool IsIPv4MappedIPv6(const union ibv_gid& gid)
{
// look for ::ffff:x.x.x.x format
// From Broadcom documentation
// https://techdocs.broadcom.com/us/en/storage-and-ethernet-connectivity/ethernet-nic-controllers/bcm957xxx/adapters/frequently-asked-questions1.html
// "The IPv4 address is really an IPv4 address mapped into the IPv6 address space.
// This can be identified by 80 “0” bits, followed by 16 “1” bits (“FFFF” in hexadecimal)
// followed by the original 32-bit IPv4 address."
return (gid.global.subnet_prefix == 0 && gid.raw[8] == 0 && gid.raw[9] == 0 &&
gid.raw[10] == 0xff && gid.raw[11] == 0xff);
}
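// Byte-layout example (illustrative): the IPv4 address 192.168.1.2 appears as
// the GID ::ffff:192.168.1.2, i.e.
//   raw[0..7]   = 0x00  (subnet_prefix == 0)
//   raw[8..9]   = 0x00
//   raw[10..11] = 0xff 0xff
//   raw[12..15] = 192 168 1 2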
static ErrResult GetGidIndex(struct ibv_context* context,
int const& gidTblLen,
int const& portNum,
std::pair<int, std::string>& gidInfo)
{
if (gidInfo.first >= 0) {
return ERR_NONE; // honor user choice
}
union ibv_gid gid;
GidPriority highestPriority = GidPriority::UNKNOWN;
int gidIndex = -1;
for (int i = 0; i < gidTblLen; ++i) {
IBV_CALL(ibv_query_gid, context, portNum, i, &gid);
if (!IsConfiguredGid(gid)) { continue; }
int gidCurrRoceVersion;
if (GetRoceVersionNumber(context, portNum, i, gidCurrRoceVersion).errType != ERR_NONE) {
continue;
}
GidPriority currPriority;
if (IsIPv4MappedIPv6(gid)) {
currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_IPV4
: GidPriority::ROCEV1_IPV4;
} else if (!LinkLocalGid(gid)) {
currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_IPV6
: GidPriority::ROCEV1_IPV6;
} else {
currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_LINK_LOCAL
: GidPriority::ROCEV1_LINK_LOCAL;
}
if (currPriority > highestPriority) {
highestPriority = currPriority;
gidIndex = i;
}
}
if (highestPriority == GidPriority::UNKNOWN) {
gidInfo.first = -1;
return {ERR_FATAL,
"Failed to auto-detect a valid GID index. Try setting it manually through "
"IB_GID_INDEX"};
}
gidInfo.first = gidIndex;
gidInfo.second = GidPriorityStr[highestPriority];
return ERR_NONE;
}
static vector<IbvDevice>& GetIbvDeviceList()
{
static bool isInitialized = false;
static vector<IbvDevice> ibvDeviceList = {};
// Build list on first use
if (!isInitialized) {
// Query the number of IBV devices
int numIbvDevices = 0;
ibv_device** deviceList = ibv_get_device_list(&numIbvDevices);
if (deviceList && numIbvDevices > 0) {
// Loop over each device to collect information
for (int i = 0; i < numIbvDevices; i++) {
IbvDevice ibvDevice;
ibvDevice.devicePtr = deviceList[i];
ibvDevice.name = deviceList[i]->name;
ibvDevice.hasActivePort = false;
{
struct ibv_context* context = ibv_open_device(ibvDevice.devicePtr);
if (context) {
struct ibv_device_attr deviceAttr;
if (!ibv_query_device(context, &deviceAttr)) {
int activePort;
ibvDevice.gidIndex = -1;
for (int port = 1; port <= deviceAttr.phys_port_cnt; ++port) {
struct ibv_port_attr portAttr;
if (ibv_query_port(context, port, &portAttr)) { continue; }
if (portAttr.state == IBV_PORT_ACTIVE) {
activePort = port;
ibvDevice.hasActivePort = true;
if (portAttr.link_layer == IBV_LINK_LAYER_ETHERNET) {
ibvDevice.isRoce = true;
std::pair<int, std::string> gidInfo(-1, "");
auto res = GetGidIndex(
context, portAttr.gid_tbl_len, activePort, gidInfo);
if (res.errType == ERR_NONE) {
ibvDevice.gidIndex = gidInfo.first;
ibvDevice.gidDescriptor = gidInfo.second;
}
}
break;
}
}
}
ibv_close_device(context);
}
}
ibvDevice.busId = "";
{
std::string device_path(ibvDevice.devicePtr->dev_path);
if (std::filesystem::exists(device_path)) {
std::string pciPath = std::filesystem::canonical(device_path + "/device")
.string();
std::size_t pos = pciPath.find_last_of('/');
if (pos != std::string::npos) { ibvDevice.busId = pciPath.substr(pos + 1); }
}
}
// Get nearest numa node for this device
ibvDevice.numaNode = -1;
std::filesystem::path devicePath = "/sys/bus/pci/devices/" + ibvDevice.busId +
"/numa_node";
if (std::filesystem::exists(devicePath)) {
std::string canonicalPath = std::filesystem::canonical(devicePath).string();
std::ifstream file(canonicalPath);
if (file.is_open()) {
std::string numaNodeStr;
std::getline(file, numaNodeStr);
int numaNodeVal;
if (sscanf(numaNodeStr.c_str(), "%d", &numaNodeVal) == 1) {
ibvDevice.numaNode = numaNodeVal;
}
file.close();
}
}
ibvDeviceList.push_back(ibvDevice);
}
}
ibv_free_device_list(deviceList);
isInitialized = true;
}
return ibvDeviceList;
}
#endif // NIC_EXEC_ENABLED
#ifdef NIC_EXEC_ENABLED
// PCIe-related functions
//========================================================================================
// Prints off PCIe tree
static void PrintPCIeTree(PCIeNode const& node, std::string const& prefix = "", bool isLast = true)
{
if (!node.address.empty()) {
printf("%s%s%s", prefix.c_str(), (isLast ? "└── " : "├── "), node.address.c_str());
if (!node.description.empty()) { printf("(%s)", node.description.c_str()); }
printf("\n");
}
auto const& children = node.children;
for (auto it = children.begin(); it != children.end(); ++it) {
PrintPCIeTree(*it, prefix + (isLast ? " " : "│ "), std::next(it) == children.end());
}
}
// Inserts nodes along pcieAddress down a tree starting from root
static ErrResult InsertPCIePathToTree(std::string const& pcieAddress,
std::string const& description,
PCIeNode& root)
{
std::filesystem::path devicePath = "/sys/bus/pci/devices/" + pcieAddress;
if (!std::filesystem::exists(devicePath)) {
return {ERR_FATAL, "Device path %s does not exist", devicePath.c_str()};
}
std::string canonicalPath = std::filesystem::canonical(devicePath).string();
std::istringstream iss(canonicalPath);
std::string token;
PCIeNode* currNode = &root;
while (std::getline(iss, token, '/')) {
auto it = (currNode->children.insert(PCIeNode(token))).first;
currNode = const_cast<PCIeNode*>(&(*it));
}
currNode->description = description;
return ERR_NONE;
}
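// Worked example (illustrative, hypothetical sysfs layout): for the PCIe
// address "0000:c1:00.0" the canonical path might be
//   /sys/devices/pci0000:c0/0000:c0:01.1/0000:c1:00.0
// which inserts the chain of child nodes
//   "sys" -> "devices" -> "pci0000:c0" -> "0000:c0:01.1" -> "0000:c1:00.0"
// with the description attached to the final (leaf) node.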
// Returns root node for PCIe tree. Constructed on first use
static PCIeNode* GetPCIeTreeRoot()
{
static bool isInitialized = false;
static PCIeNode pcieRoot;
// Build PCIe tree on first use
if (!isInitialized) {
// Add NICs to the tree
auto const& ibvDeviceList = GetIbvDeviceList();
for (IbvDevice const& ibvDevice : ibvDeviceList) {
if (!ibvDevice.hasActivePort || ibvDevice.busId == "") { continue; }
InsertPCIePathToTree(ibvDevice.busId, ibvDevice.name, pcieRoot);
}
// Add GPUs to the tree
int numGpus = GetNumExecutors(EXE_GPU_GFX);
for (int i = 0; i < numGpus; ++i) {
char hipPciBusId[64];
if (hipDeviceGetPCIBusId(hipPciBusId, sizeof(hipPciBusId), i) == hipSuccess) {
InsertPCIePathToTree(hipPciBusId, "GPU " + std::to_string(i), pcieRoot);
}
}
#ifdef VERBS_DEBUG
PrintPCIeTree(pcieRoot);
#endif
isInitialized = true;
}
return &pcieRoot;
}
// Finds the lowest common ancestor in PCIe tree between two nodes
static PCIeNode const* GetLcaBetweenNodes(PCIeNode const* root,
std::string const& node1Address,
std::string const& node2Address)
{
if (!root || root->address == node1Address || root->address == node2Address) { return root; }
PCIeNode const* lcaFound1 = nullptr;
PCIeNode const* lcaFound2 = nullptr;
// Recursively iterate over children
for (auto const& child : root->children) {
PCIeNode const* lca = GetLcaBetweenNodes(&child, node1Address, node2Address);
if (!lca) { continue; }
if (!lcaFound1) {
// First time found
lcaFound1 = lca;
} else {
// Second time found
lcaFound2 = lca;
break;
}
}
// If two children were found, then current node is the lowest common ancestor
return (lcaFound1 && lcaFound2) ? root : lcaFound1;
}
// Gets the depth of an node in the PCIe tree
static int GetLcaDepth(std::string const& targetBusID, PCIeNode const* const& node, int depth = 0)
{
if (!node) { return -1; }
if (targetBusID == node->address) { return depth; }
for (auto const& child : node->children) {
int distance = GetLcaDepth(targetBusID, &child, depth + 1);
if (distance != -1) { return distance; }
}
return -1;
}
// Function to extract the bus number from a PCIe address (domain:bus:device.function)
static int ExtractBusNumber(std::string const& pcieAddress)
{
int domain, bus, device, function;
char delimiter;
std::istringstream iss(pcieAddress);
iss >> std::hex >> domain >> delimiter >> bus >> delimiter >> device >> delimiter >> function;
if (iss.fail()) {
#ifdef VERBS_DEBUG
printf("Invalid PCIe address format: %s\n", pcieAddress.c_str());
#endif
return -1;
}
return bus;
}
// Function to compute the distance between two bus IDs
static int GetBusIdDistance(std::string const& pcieAddress1, std::string const& pcieAddress2)
{
int bus1 = ExtractBusNumber(pcieAddress1);
int bus2 = ExtractBusNumber(pcieAddress2);
return (bus1 < 0 || bus2 < 0) ? -1 : std::abs(bus1 - bus2);
}
// Given a target busID and a set of candidate devices, returns a set of indices
// that is "closest" to the target
static std::set<int> GetNearestDevicesInTree(std::string const& targetBusId,
std::vector<std::string> const& candidateBusIdList)
{
int maxDepth = -1;
int minDistance = std::numeric_limits<int>::max();
std::set<int> matches = {};
// Loop over the candidates to find the ones with the lowest common ancestor (LCA)
for (int i = 0; i < static_cast<int>(candidateBusIdList.size()); i++) {
std::string const& candidateBusId = candidateBusIdList[i];
if (candidateBusId == "") { continue; }
PCIeNode const* lca = GetLcaBetweenNodes(GetPCIeTreeRoot(), targetBusId, candidateBusId);
if (!lca) { continue; }
int depth = GetLcaDepth(lca->address, GetPCIeTreeRoot());
int currDistance = GetBusIdDistance(targetBusId, candidateBusId);
// When more than one LCA match is found, choose the one with smallest busId difference
// NOTE: currDistance could be -1, which signals a parsing problem; however, it still
// remains a valid "closest" candidate, so it is included
if (depth > maxDepth || (depth == maxDepth && depth >= 0 && currDistance < minDistance)) {
maxDepth = depth;
matches.clear();
matches.insert(i);
minDistance = currDistance;
} else if (depth == maxDepth && depth >= 0 && currDistance == minDistance) {
matches.insert(i);
}
}
return matches;
}
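// Worked example (illustrative, hypothetical topology): given
//   root -> 0000:c0:01.1 -> { 0000:c1:00.0 (GPU 0), 0000:c2:00.0 (NIC A) }
//   root -> 0000:40:01.1 ->   0000:41:00.0 (NIC B)
// the LCA of GPU 0 and NIC A (0000:c0:01.1) is deeper than the LCA of GPU 0
// and NIC B (the root), so only NIC A's index is returned as nearest.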
#endif // NIC_EXEC_ENABLED
#ifdef NIC_EXEC_ENABLED
// IB Verbs-related functions
//========================================================================================
// Create a queue pair
static ErrResult CreateQueuePair(ConfigOptions const& cfg,
struct ibv_pd* pd,
struct ibv_cq* cq,
struct ibv_qp*& qp)
{
// Set queue pair attributes
struct ibv_qp_init_attr attr = {};
attr.qp_type = IBV_QPT_RC; // Set type to reliable connection
attr.send_cq = cq; // Send completion queue
attr.recv_cq = cq; // Recv completion queue
attr.cap.max_send_wr = cfg.nic.maxSendWorkReq; // Max send work requests
attr.cap.max_recv_wr = cfg.nic.maxRecvWorkReq; // Max recv work requests
attr.cap.max_send_sge = 1; // Max send scatter-gather entries
attr.cap.max_recv_sge = 1; // Max recv scatter-gather entries
qp = ibv_create_qp(pd, &attr);
if (qp == NULL) { return {ERR_FATAL, "Error while creating QP"}; }
return ERR_NONE;
}
// Initialize a queue pair
static ErrResult InitQueuePair(struct ibv_qp* qp, uint8_t port, unsigned flags)
{
struct ibv_qp_attr attr = {}; // Clear all attributes
attr.qp_state = IBV_QPS_INIT; // Set the QP state to INIT
attr.pkey_index = 0; // Set the partition key index to 0
attr.port_num = port; // Set the port number to the provided port
attr.qp_access_flags = flags; // Set the QP access flags to the provided flags
int ret = ibv_modify_qp(qp,
&attr,
IBV_QP_STATE | // Modify the QP state
IBV_QP_PKEY_INDEX | // Modify the partition key index
IBV_QP_PORT | // Modify the port number
IBV_QP_ACCESS_FLAGS); // Modify the access flags
if (ret != 0) { return {ERR_FATAL, "Error during QP Init. IB Verbs Error code: %d", ret}; }
return ERR_NONE;
}
// Transition QueuePair to Ready to Receive State
static ErrResult TransitionQpToRtr(ibv_qp* qp,
uint16_t const& dlid,
uint32_t const& dqpn,
ibv_gid const& gid,
uint8_t const& gidIndex,
uint8_t const& port,
bool const& isRoCE,
ibv_mtu const& mtu)
{
// Prepare QP attributes
struct ibv_qp_attr attr = {};
attr.qp_state = IBV_QPS_RTR;
attr.path_mtu = mtu;
attr.rq_psn = 0;
attr.max_dest_rd_atomic = 1;
attr.min_rnr_timer = 12;
if (isRoCE) {
attr.ah_attr.is_global = 1;
attr.ah_attr.grh.dgid.global.subnet_prefix = gid.global.subnet_prefix;
attr.ah_attr.grh.dgid.global.interface_id = gid.global.interface_id;
attr.ah_attr.grh.flow_label = 0;
attr.ah_attr.grh.sgid_index = gidIndex;
attr.ah_attr.grh.hop_limit = 255;
} else {
attr.ah_attr.is_global = 0;
attr.ah_attr.dlid = dlid;
}
attr.ah_attr.sl = 0;
attr.ah_attr.src_path_bits = 0;
attr.ah_attr.port_num = port;
attr.dest_qp_num = dqpn;
// Modify the QP
int ret = ibv_modify_qp(qp,
&attr,
IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN |
IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER);
if (ret != 0) { return {ERR_FATAL, "Error during QP RTR. IB Verbs Error code: %d", ret}; }
return ERR_NONE;
}
// Transition QueuePair to Ready to Send state
static ErrResult TransitionQpToRts(struct ibv_qp* qp)
{
struct ibv_qp_attr attr = {};
attr.qp_state = IBV_QPS_RTS;
attr.sq_psn = 0;
attr.timeout = 14;
attr.retry_cnt = 7;
attr.rnr_retry = 7;
attr.max_rd_atomic = 1;
int ret = ibv_modify_qp(qp,
&attr,
IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY |
IBV_QP_SQ_PSN | IBV_QP_MAX_QP_RD_ATOMIC);
if (ret != 0) { return {ERR_FATAL, "Error during QP RTS. IB Verbs Error code: %d", ret}; }
return ERR_NONE;
}
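// Connection sequence sketch (summary of the calls made below): for each of
// the qpCount queue pairs, on both the SRC and DST NICs,
//   CreateQueuePair()   -> reliable-connection QP on the NIC's PD/CQ
//   InitQueuePair()     -> RESET to INIT with RDMA access flags
//   TransitionQpToRtr() -> INIT to RTR, wired to the peer QP number/LID/GID
//   TransitionQpToRts() -> RTR to RTS, ready to post RDMA_WRITE work requests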
static ErrResult PrepareNicTransferResources(ConfigOptions const& cfg,
ExeDevice const& srcExeDevice,
Transfer const& t,
TransferResources& rss)
{
// Switch to the closest NUMA node to this NIC
int numaNode = GetIbvDeviceList()[srcExeDevice.exeIndex].numaNode;
if (numaNode != -1) { numa_run_on_node(numaNode); }
int const port = cfg.nic.ibPort;
// Figure out destination NIC (Accounts for possible remap due to use of EXE_NIC_NEAREST)
ExeDevice dstExeDevice;
ERR_CHECK(GetActualExecutor(cfg, {t.exeDevice.exeType, t.exeSubIndex}, dstExeDevice));
rss.srcNicIndex = srcExeDevice.exeIndex;
rss.dstNicIndex = dstExeDevice.exeIndex;
rss.qpCount = t.numSubExecs;
// Check for valid NICs and active ports
int numNics = GetNumExecutors(EXE_NIC);
if (rss.srcNicIndex < 0 || rss.srcNicIndex >= numNics) {
return {ERR_FATAL, "SRC NIC index is out of range (%d)", rss.srcNicIndex};
}
if (rss.dstNicIndex < 0 || rss.dstNicIndex >= numNics) {
return {ERR_FATAL, "DST NIC index is out of range (%d)", rss.dstNicIndex};
}
if (!GetIbvDeviceList()[rss.srcNicIndex].hasActivePort) {
return {ERR_FATAL, "SRC NIC %d is not active\n", rss.srcNicIndex};
}
if (!GetIbvDeviceList()[rss.dstNicIndex].hasActivePort) {
return {ERR_FATAL, "DST NIC %d is not active\n", rss.dstNicIndex};
}
// Queue pair flags
unsigned int rdmaAccessFlags = (IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ |
IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_ATOMIC);
unsigned int rdmaMemRegFlags = rdmaAccessFlags;
if (cfg.nic.useRelaxedOrder) { rdmaMemRegFlags |= IBV_ACCESS_RELAXED_ORDERING; }
// Open NIC contexts
IBV_PTR_CALL(rss.srcContext, ibv_open_device, GetIbvDeviceList()[rss.srcNicIndex].devicePtr);
IBV_PTR_CALL(rss.dstContext, ibv_open_device, GetIbvDeviceList()[rss.dstNicIndex].devicePtr);
// Open protection domains
IBV_PTR_CALL(rss.srcProtect, ibv_alloc_pd, rss.srcContext);
IBV_PTR_CALL(rss.dstProtect, ibv_alloc_pd, rss.dstContext);
// Register memory region
IBV_PTR_CALL(
rss.srcMemRegion, ibv_reg_mr, rss.srcProtect, rss.srcMem[0], rss.numBytes, rdmaMemRegFlags);
IBV_PTR_CALL(
rss.dstMemRegion, ibv_reg_mr, rss.dstProtect, rss.dstMem[0], rss.numBytes, rdmaMemRegFlags);
// Create completion queues
IBV_PTR_CALL(rss.srcCompQueue, ibv_create_cq, rss.srcContext, cfg.nic.queueSize, NULL, NULL, 0);
IBV_PTR_CALL(rss.dstCompQueue, ibv_create_cq, rss.dstContext, cfg.nic.queueSize, NULL, NULL, 0);
// Get port attributes
IBV_CALL(ibv_query_port, rss.srcContext, port, &rss.srcPortAttr);
IBV_CALL(ibv_query_port, rss.dstContext, port, &rss.dstPortAttr);
if (rss.srcPortAttr.link_layer != rss.dstPortAttr.link_layer) {
return {ERR_FATAL,
"SRC NIC (%d) and DST NIC (%d) do not have the same link layer",
rss.srcNicIndex,
rss.dstNicIndex};
}
// Prepare GID index
int srcGidIndex = cfg.nic.ibGidIndex;
int dstGidIndex = cfg.nic.ibGidIndex;
// Check for RDMA over Converged Ethernet (RoCE) and update GID index appropriately
bool isRoCE = (rss.srcPortAttr.link_layer == IBV_LINK_LAYER_ETHERNET);
if (isRoCE) {
// Try to auto-detect the GID index
std::pair<int, std::string> srcGidInfo(srcGidIndex, "");
std::pair<int, std::string> dstGidInfo(dstGidIndex, "");
ERR_CHECK(
GetGidIndex(rss.srcContext, rss.srcPortAttr.gid_tbl_len, cfg.nic.ibPort, srcGidInfo));
ERR_CHECK(
GetGidIndex(rss.dstContext, rss.dstPortAttr.gid_tbl_len, cfg.nic.ibPort, dstGidInfo));
srcGidIndex = srcGidInfo.first;
dstGidIndex = dstGidInfo.first;
IBV_CALL(ibv_query_gid, rss.srcContext, port, srcGidIndex, &rss.srcGid);
IBV_CALL(ibv_query_gid, rss.dstContext, port, dstGidIndex, &rss.dstGid);
}
// Prepare queue pairs and send elements
rss.srcQueuePairs.resize(rss.qpCount);
rss.dstQueuePairs.resize(rss.qpCount);
rss.sgePerQueuePair.resize(rss.qpCount);
rss.sendWorkRequests.resize(rss.qpCount);
for (int i = 0; i < rss.qpCount; ++i) {
// Create scatter-gather element for the portion of memory assigned to this queue pair
ibv_sge sg = {};
sg.addr = (uint64_t)rss.subExecParamCpu[i].src[0];
sg.length = rss.subExecParamCpu[i].N * sizeof(float);
sg.lkey = rss.srcMemRegion->lkey;
rss.sgePerQueuePair[i] = sg;
// Create send work request
ibv_send_wr wr = {};
wr.wr_id = i;
wr.sg_list = &rss.sgePerQueuePair[i];
wr.num_sge = 1;
wr.opcode = IBV_WR_RDMA_WRITE;
wr.send_flags = IBV_SEND_SIGNALED;
wr.wr.rdma.remote_addr = (uint64_t)rss.subExecParamCpu[i].dst[0];
wr.wr.rdma.rkey = rss.dstMemRegion->rkey;
rss.sendWorkRequests[i] = wr;
// Create SRC/DST queue pairs
ERR_CHECK(CreateQueuePair(cfg, rss.srcProtect, rss.srcCompQueue, rss.srcQueuePairs[i]));
ERR_CHECK(CreateQueuePair(cfg, rss.dstProtect, rss.dstCompQueue, rss.dstQueuePairs[i]));
// Initialize SRC/DST queue pairs
ERR_CHECK(InitQueuePair(rss.srcQueuePairs[i], port, rdmaAccessFlags));
ERR_CHECK(InitQueuePair(rss.dstQueuePairs[i], port, rdmaAccessFlags));
// Transition the SRC queue pair to ready to receive
ERR_CHECK(TransitionQpToRtr(rss.srcQueuePairs[i],
rss.dstPortAttr.lid,
rss.dstQueuePairs[i]->qp_num,
rss.dstGid,
dstGidIndex,
port,
isRoCE,
rss.srcPortAttr.active_mtu));
// Transition the SRC queue pair to ready to send
ERR_CHECK(TransitionQpToRts(rss.srcQueuePairs[i]));
// Transition the DST queue pair to ready to receive
ERR_CHECK(TransitionQpToRtr(rss.dstQueuePairs[i],
rss.srcPortAttr.lid,
rss.srcQueuePairs[i]->qp_num,
rss.srcGid,
srcGidIndex,
port,
isRoCE,
rss.dstPortAttr.active_mtu));
// Transition the DST queue pair to ready to send
ERR_CHECK(TransitionQpToRts(rss.dstQueuePairs[i]));
}
return ERR_NONE;
}
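// Note: each Transfer receives one queue pair per subexecutor, and each queue pair posts a
// single RDMA_WRITE covering the subarray computed by PrepareSubExecParams. For example
// (illustrative): a 4 MiB Transfer with numSubExecs = 4 results in 4 QPs, each writing roughly
// 1 MiB of the source memory region to the matching offset in the destination region.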
static ErrResult TeardownNicTransferResources(TransferResources& rss)
{
// Deregister memory regions
IBV_CALL(ibv_dereg_mr, rss.srcMemRegion);
IBV_CALL(ibv_dereg_mr, rss.dstMemRegion);
// Destroy queue pairs
for (auto srcQueuePair : rss.srcQueuePairs) { IBV_CALL(ibv_destroy_qp, srcQueuePair); }
rss.srcQueuePairs.clear();
for (auto dstQueuePair : rss.dstQueuePairs) { IBV_CALL(ibv_destroy_qp, dstQueuePair); }
rss.dstQueuePairs.clear();
// Destroy completion queues
IBV_CALL(ibv_destroy_cq, rss.srcCompQueue);
IBV_CALL(ibv_destroy_cq, rss.dstCompQueue);
// Deallocate protection domains
IBV_CALL(ibv_dealloc_pd, rss.srcProtect);
IBV_CALL(ibv_dealloc_pd, rss.dstProtect);
// Destroy context
IBV_CALL(ibv_close_device, rss.srcContext);
IBV_CALL(ibv_close_device, rss.dstContext);
return ERR_NONE;
}
#endif // NIC_EXEC_ENABLED
// Data validation-related functions
//========================================================================================
// Pseudo-random formula for each element in array
static __host__ float PrepSrcValue(int srcBufferIdx, size_t idx)
{
return (((idx % 383) * 517) % 383 + 31) * (srcBufferIdx + 1);
}
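// Worked example (illustrative): for srcBufferIdx = 0 and idx = 1:
//   ((1 % 383) * 517) % 383 + 31 = (517 % 383) + 31 = 134 + 31 = 165, scaled by (0 + 1) = 165.0f
// The pattern repeats with period 383 and is scaled per source buffer so that the sum over
// multiple sources yields a distinct, predictable expected value per element.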
// Fills a pre-sized buffer with a data pattern based on the source buffer index
// Note: Can also be used to generate the expected dst buffer
static void PrepareReference(ConfigOptions const& cfg, std::vector<float>& cpuBuffer, int bufferIdx)
{
size_t N = cpuBuffer.size();
if (!cfg.data.fillCompress.empty()) {
// 0 -> Random
// 1 -> 1B0 - The upper 1 byte of each aligned 2 bytes is 0
// 2 -> 2B0 - The upper 2 bytes of each aligned 4 bytes are 0
// 3 -> 4B0 - The upper 4 bytes of each aligned 8 bytes are 0
// 4 -> 32B0 - The upper 32 bytes of each 64-byte line are 0
// Fill buffer with random floats
std::mt19937 gen;
gen.seed(bufferIdx * 425);
std::uniform_real_distribution<float> dist(-100000.0f, +100000.0f);
for (size_t i = 0; i < N; i++) { cpuBuffer[i] = dist(gen); }
// Figure out distribution for lines based on the percentages given
size_t numLines = N / 16;
size_t leftover = numLines;
std::vector<size_t> lineCounts(5, 0);
std::set<std::pair<double, int>> remainder;
// Assign rounded down values first
std::vector<int> percentages = cfg.data.fillCompress;
while (percentages.size() < 5) { percentages.push_back(0); }
for (auto i = std::size_t(0); i < percentages.size(); i++) {
lineCounts[i] = (size_t)(numLines * (percentages[i] / 100.0));
leftover -= lineCounts[i];
remainder.insert(
std::make_pair(numLines * (percentages[i] / 100.0) - lineCounts[i], i));
}
// Assign leftovers based on largest remainder
while (leftover != 0) {
auto last = *remainder.rbegin();
lineCounts[last.second]++;
remainder.erase(last);
leftover--;
}
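        // Worked example of the largest-remainder assignment above (illustrative): with
        // numLines = 10 and percentages {50, 25, 25, 0, 0}, the exact shares are
        // {5, 2.5, 2.5, 0, 0}; rounding down gives lineCounts = {5, 2, 2, 0, 0} with one line
        // left over. The set orders (remainder, index) pairs, so the leftover goes to the
        // largest remainder (index 2 here, since ties resolve to the higher index), yielding
        // {5, 2, 3, 0, 0}.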
// Randomly decide which lines get assigned to which types
std::vector<int> lineTypes(numLines, 0);
int offset = lineCounts[0];
for (int i = 1; i < 5; i++) {
for (auto j = std::size_t(0); j < lineCounts[i]; j++) { lineTypes[offset++] = i; }
}
std::shuffle(lineTypes.begin(), lineTypes.end(), gen);
// Apply zero-ing
int dumpLines = getenv("DUMP_LINES") ? atoi(getenv("DUMP_LINES")) : 0;
if (dumpLines) {
printf("Input pattern 64B line statistics for bufferIdx %d:\n", bufferIdx);
printf("Total lines: %lu\n", numLines);
printf("- 0: Random : %8lu (%8.3f%%)\n",
lineCounts[0],
100.0 * lineCounts[0] / (1.0 * numLines));
printf("- 1: 1B0 : %8lu (%8.3f%%)\n",
lineCounts[1],
100.0 * lineCounts[1] / (1.0 * numLines));
printf("- 2: 2B0 : %8lu (%8.3f%%)\n",
lineCounts[2],
100.0 * lineCounts[2] / (1.0 * numLines));
printf("- 3: 4B0 : %8lu (%8.3f%%)\n",
lineCounts[3],
100.0 * lineCounts[3] / (1.0 * numLines));
printf("- 4: 32B0 : %8lu (%8.3f%%)\n",
lineCounts[4],
100.0 * lineCounts[4] / (1.0 * numLines));
}
for (auto line = std::size_t(0); line < numLines; line++) {
unsigned char* linePtr = (unsigned char*)&cpuBuffer[line * 16];
switch (lineTypes[line]) {
case 1: // 1B0
for (int i = 0; i < 32; i++) { linePtr[2 * i + 1] = 0; }
break;
case 2: // 2B0
for (int i = 0; i < 16; i++) {
linePtr[4 * i + 2] = 0;
linePtr[4 * i + 3] = 0;
}
break;
case 3: // 4B0
for (int i = 0; i < 8; i++) {
linePtr[8 * i + 4] = 0;
linePtr[8 * i + 5] = 0;
linePtr[8 * i + 6] = 0;
linePtr[8 * i + 7] = 0;
}
break;
case 4: // 32B0
for (int i = 32; i < 64; i++) { linePtr[i] = 0; }
break;
}
if (line < static_cast<std::size_t>(dumpLines)) {
printf("Line %02zu [%d]: ", line, lineTypes[line]);
for (int j = 63; j >= 0; j--) {
printf("%02x ", linePtr[j]);
if (j % 16 == 0) { printf(" "); }
}
printf("\n");
}
}
} else {
// Use fill pattern if specified
size_t patternLen = cfg.data.fillPattern.size();
if (patternLen > 0) {
size_t copies = N / patternLen;
size_t leftOver = N % patternLen;
float* cpuBufferPtr = cpuBuffer.data();
for (auto i = std::size_t(0); i < copies; i++) {
memcpy(cpuBufferPtr, cfg.data.fillPattern.data(), patternLen * sizeof(float));
cpuBufferPtr += patternLen;
}
if (leftOver) {
memcpy(cpuBufferPtr, cfg.data.fillPattern.data(), leftOver * sizeof(float));
}
} else {
// Fall back to pseudo-random
for (size_t i = 0; i < N; ++i) { cpuBuffer[i] = PrepSrcValue(bufferIdx, i); }
}
}
}
// Checks that destination buffers match expected values
static ErrResult ValidateAllTransfers(ConfigOptions const& cfg,
vector<Transfer> const& transfers,
vector<TransferResources*> const& transferResources,
vector<vector<float>> const& dstReference,
vector<float>& outputBuffer)
{
float* output;
size_t initOffset = cfg.data.byteOffset / sizeof(float);
for (auto rss : transferResources) {
int transferIdx = rss->transferIdx;
Transfer const& t = transfers[transferIdx];
size_t N = t.numBytes / sizeof(float);
float const* expected = dstReference[t.srcs.size()].data();
for (auto dstIdx = std::size_t(0); dstIdx < rss->dstMem.size(); dstIdx++) {
if (IsCpuMemType(t.dsts[dstIdx].memType) || cfg.data.validateDirect) {
output = (rss->dstMem[dstIdx]) + initOffset;
} else {
ERR_CHECK(hipMemcpy(outputBuffer.data(),
(rss->dstMem[dstIdx]) + initOffset,
t.numBytes,
hipMemcpyDefault));
ERR_CHECK(hipDeviceSynchronize());
output = outputBuffer.data();
}
if (memcmp(output, expected, t.numBytes)) {
// Difference found - find first error
for (size_t i = 0; i < N; i++) {
if (output[i] != expected[i]) {
                    return {ERR_FATAL,
                            "Transfer %d: Unexpected mismatch at index %zu of destination %zu: "
                            "Expected %10.5f "
                            "Actual: %10.5f",
                            transferIdx,
                            i,
                            dstIdx,
                            expected[i],
                            output[i]};
}
}
                return {ERR_FATAL,
                        "Transfer %d: Unexpected output mismatch for destination %zu",
                        transferIdx,
                        dstIdx};
}
}
}
return ERR_NONE;
}
// Preparation-related functions
//========================================================================================
// Prepares input parameters for each subexecutor
// Determines how sub-executors will split up the work
// Initializes counters
static ErrResult PrepareSubExecParams(ConfigOptions const& cfg,
Transfer const& transfer,
TransferResources& rss)
{
// Each subExecutor needs to know src/dst pointers and how many elements to transfer
// Figure out the sub-array each subExecutor works on for this Transfer
    // - Partition N as evenly as possible, while keeping each subarray size a multiple of
    //   data.blockBytes (except possibly the last subarray) for alignment reasons
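    // Worked example (illustrative): N = 1000 floats, blockBytes = 256 (targetMultiple = 64),
    // numSubExecs = 4. Each pass rounds the remaining work up to a multiple of 64 and divides
    // by the subexecutors left, yielding subarray sizes 256, 256, 256, and 232 (the last
    // subexecutor takes the leftover, which need not be a multiple of 64).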
size_t const N = transfer.numBytes / sizeof(float);
int const initOffset = cfg.data.byteOffset / sizeof(float);
int const targetMultiple = cfg.data.blockBytes / sizeof(float);
    // In some cases, there may not be enough data for all subExecutors
int const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple,
(size_t)transfer.numSubExecs);
vector<SubExecParam>& subExecParam = rss.subExecParamCpu;
subExecParam.clear();
subExecParam.resize(transfer.numSubExecs);
size_t assigned = 0;
for (int i = 0; i < transfer.numSubExecs; ++i) {
SubExecParam& p = subExecParam[i];
p.numSrcs = rss.srcMem.size();
p.numDsts = rss.dstMem.size();
p.startCycle = 0;
p.stopCycle = 0;
p.hwId = 0;
p.xccId = 0;
// In single team mode, subexecutors stripe across the entire array
if (cfg.gfx.useSingleTeam && transfer.exeDevice.exeType == EXE_GPU_GFX) {
p.N = N;
p.teamSize = transfer.numSubExecs;
p.teamIdx = i;
for (int iSrc = 0; iSrc < p.numSrcs; ++iSrc) {
p.src[iSrc] = rss.srcMem[iSrc] + initOffset;
}
for (int iDst = 0; iDst < p.numDsts; ++iDst) {
p.dst[iDst] = rss.dstMem[iDst] + initOffset;
}
} else {
// Otherwise, each subexecutor works on separate subarrays
int const subExecLeft = std::max(0, maxSubExecToUse - i);
size_t const leftover = N - assigned;
size_t const roundedN = (leftover + targetMultiple - 1) / targetMultiple;
p.N = subExecLeft ? std::min(leftover, ((roundedN / subExecLeft) * targetMultiple)) : 0;
p.teamSize = 1;
p.teamIdx = 0;
for (int iSrc = 0; iSrc < p.numSrcs; ++iSrc) {
p.src[iSrc] = rss.srcMem[iSrc] + initOffset + assigned;
}
for (int iDst = 0; iDst < p.numDsts; ++iDst) {
p.dst[iDst] = rss.dstMem[iDst] + initOffset + assigned;
}
assigned += p.N;
}
p.preferredXccId = transfer.exeSubIndex;
// Override if XCC table has been specified
vector<vector<int>> const& table = cfg.gfx.prefXccTable;
if (transfer.exeDevice.exeType == EXE_GPU_GFX && transfer.exeSubIndex == -1 &&
!table.empty() && transfer.dsts.size() == 1 && IsGpuMemType(transfer.dsts[0].memType)) {
if (table.size() <= static_cast<std::size_t>(transfer.exeDevice.exeIndex) ||
table[transfer.exeDevice.exeIndex].size() <=
static_cast<std::size_t>(transfer.dsts[0].memIndex)) {
return {ERR_FATAL, "[gfx.xccPrefTable] is too small"};
}
p.preferredXccId = table[transfer.exeDevice.exeIndex][transfer.dsts[0].memIndex];
if (p.preferredXccId < 0 ||
p.preferredXccId >= GetNumExecutorSubIndices(transfer.exeDevice)) {
return {ERR_FATAL,
"[gfx.xccPrefTable] defines out-of-bound XCC index %d",
p.preferredXccId};
}
}
}
// Clear counters
rss.totalDurationMsec = 0.0;
return ERR_NONE;
}
// Prepare each executor
// Allocates memory for src/dst, prepares subexecutors, executor-specific data structures
static ErrResult PrepareExecutor(ConfigOptions const& cfg,
vector<Transfer> const& transfers,
ExeDevice const& exeDevice,
ExeInfo& exeInfo)
{
exeInfo.totalDurationMsec = 0.0;
// Loop over each transfer this executor is involved in
for (auto& rss : exeInfo.resources) {
Transfer const& t = transfers[rss.transferIdx];
rss.numBytes = t.numBytes;
// Allocate source memory
rss.srcMem.resize(t.srcs.size());
for (auto iSrc = std::size_t(0); iSrc < t.srcs.size(); ++iSrc) {
MemDevice const& srcMemDevice = t.srcs[iSrc];
// Ensure executing GPU can access source memory
if (IsGpuExeType(exeDevice.exeType) && IsGpuMemType(srcMemDevice.memType) &&
srcMemDevice.memIndex != exeDevice.exeIndex) {
ERR_CHECK(EnablePeerAccess(exeDevice.exeIndex, srcMemDevice.memIndex));
}
ERR_CHECK(AllocateMemory(
srcMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&rss.srcMem[iSrc]));
}
// Allocate destination memory
rss.dstMem.resize(t.dsts.size());
for (auto iDst = std::size_t(0); iDst < t.dsts.size(); ++iDst) {
MemDevice const& dstMemDevice = t.dsts[iDst];
// Ensure executing GPU can access destination memory
if (IsGpuExeType(exeDevice.exeType) && IsGpuMemType(dstMemDevice.memType) &&
dstMemDevice.memIndex != exeDevice.exeIndex) {
ERR_CHECK(EnablePeerAccess(exeDevice.exeIndex, dstMemDevice.memIndex));
}
ERR_CHECK(AllocateMemory(
dstMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&rss.dstMem[iDst]));
}
if (exeDevice.exeType == EXE_GPU_DMA && (t.exeSubIndex != -1 || cfg.dma.useHsaCopy)) {
#if !defined(__NVCC__)
// Collect HSA agent information
hsa_amd_pointer_info_t info;
info.size = sizeof(info);
ERR_CHECK(hsa_amd_pointer_info(rss.dstMem[0], &info, NULL, NULL, NULL));
rss.dstAgent = info.agentOwner;
ERR_CHECK(hsa_amd_pointer_info(rss.srcMem[0], &info, NULL, NULL, NULL));
rss.srcAgent = info.agentOwner;
// Create HSA completion signal
ERR_CHECK(hsa_signal_create(1, 0, NULL, &rss.signal));
if (t.exeSubIndex != -1) {
rss.sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << t.exeSubIndex);
}
#endif
}
// Prepare subexecutor parameters
ERR_CHECK(PrepareSubExecParams(cfg, t, rss));
}
// Prepare additional requirements for GPU-based executors
if (exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA) {
ERR_CHECK(hipSetDevice(exeDevice.exeIndex));
// Determine how many streams to use
int const numStreamsToUse = (exeDevice.exeType == EXE_GPU_DMA ||
(exeDevice.exeType == EXE_GPU_GFX && cfg.gfx.useMultiStream))
? exeInfo.resources.size()
: 1;
exeInfo.streams.resize(numStreamsToUse);
// Create streams
for (int i = 0; i < numStreamsToUse; ++i) {
if (cfg.gfx.cuMask.size()) {
#if !defined(__NVCC__)
ERR_CHECK(hipExtStreamCreateWithCUMask(
&exeInfo.streams[i], cfg.gfx.cuMask.size(), cfg.gfx.cuMask.data()));
#else
return {ERR_FATAL, "CU Masking in not supported on NVIDIA hardware"};
#endif
} else {
ERR_CHECK(hipStreamCreate(&exeInfo.streams[i]));
}
}
if (cfg.gfx.useHipEvents || cfg.dma.useHipEvents) {
exeInfo.startEvents.resize(numStreamsToUse);
exeInfo.stopEvents.resize(numStreamsToUse);
for (int i = 0; i < numStreamsToUse; ++i) {
ERR_CHECK(hipEventCreate(&exeInfo.startEvents[i]));
ERR_CHECK(hipEventCreate(&exeInfo.stopEvents[i]));
}
}
}
// Prepare for GPU GFX executor
if (exeDevice.exeType == EXE_GPU_GFX) {
// Allocate one contiguous chunk of GPU memory for threadblock parameters
// This allows support for executing one transfer per stream, or all transfers in a single
// stream
#if !defined(__NVCC__)
MemType memType = MEM_GPU; // AMD hardware can directly access GPU memory from host
#else
    MemType memType = MEM_MANAGED; // NVIDIA hardware requires managed memory to access from host
#endif
ERR_CHECK(AllocateMemory({memType, exeDevice.exeIndex},
exeInfo.totalSubExecs * sizeof(SubExecParam),
(void**)&exeInfo.subExecParamGpu));
// Create subexecutor parameter array for entire executor
exeInfo.subExecParamCpu.clear();
exeInfo.numSubIndices = GetNumExecutorSubIndices(exeDevice);
#if defined(__NVCC__)
exeInfo.wallClockRate = 1000000;
#else
ERR_CHECK(hipDeviceGetAttribute(
&exeInfo.wallClockRate, hipDeviceAttributeWallClockRate, exeDevice.exeIndex));
#endif
int transferOffset = 0;
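        // Illustrative example of the threadblock orderings for two Transfers A (3 subexecs)
        // and B (2 subexecs):
        //   blockOrder 0 (sequential; also used for multi-stream): [A0 A1 A2 B0 B1]
        //   blockOrder 1 (interleaved)                           : [A0 B0 A1 B1 A2]
        //   blockOrder 2 (random)                                : a uniform shuffle of all five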
if (cfg.gfx.useMultiStream || cfg.gfx.blockOrder == 0) {
// Threadblocks are ordered sequentially one transfer at a time
for (auto& rss : exeInfo.resources) {
rss.subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset;
for (auto p : rss.subExecParamCpu) {
rss.subExecIdx.push_back(exeInfo.subExecParamCpu.size());
exeInfo.subExecParamCpu.push_back(p);
transferOffset++;
}
}
} else if (cfg.gfx.blockOrder == 1) {
// Interleave threadblocks of different Transfers
for (int subExecIdx = 0;
exeInfo.subExecParamCpu.size() < static_cast<std::size_t>(exeInfo.totalSubExecs);
++subExecIdx) {
for (auto& rss : exeInfo.resources) {
Transfer const& t = transfers[rss.transferIdx];
if (subExecIdx < t.numSubExecs) {
rss.subExecIdx.push_back(exeInfo.subExecParamCpu.size());
exeInfo.subExecParamCpu.push_back(rss.subExecParamCpu[subExecIdx]);
}
}
}
} else if (cfg.gfx.blockOrder == 2) {
// Build randomized threadblock list
std::vector<std::pair<int, int>> indices;
for (auto i = std::size_t(0); i < exeInfo.resources.size(); i++) {
auto const& rss = exeInfo.resources[i];
Transfer const& t = transfers[rss.transferIdx];
for (int j = 0; j < t.numSubExecs; j++) { indices.push_back(std::make_pair(i, j)); }
}
std::random_device rd;
std::mt19937 gen(rd());
std::shuffle(indices.begin(), indices.end(), gen);
// Build randomized threadblock list
for (auto p : indices) {
auto& rss = exeInfo.resources[p.first];
rss.subExecIdx.push_back(exeInfo.subExecParamCpu.size());
exeInfo.subExecParamCpu.push_back(rss.subExecParamCpu[p.second]);
}
}
// Copy sub executor parameters to GPU
ERR_CHECK(hipSetDevice(exeDevice.exeIndex));
ERR_CHECK(hipMemcpy(exeInfo.subExecParamGpu,
exeInfo.subExecParamCpu.data(),
exeInfo.totalSubExecs * sizeof(SubExecParam),
hipMemcpyHostToDevice));
ERR_CHECK(hipDeviceSynchronize());
}
// Prepare for NIC-based executors
if (IsNicExeType(exeDevice.exeType)) {
#ifdef NIC_EXEC_ENABLED
for (auto& rss : exeInfo.resources) {
Transfer const& t = transfers[rss.transferIdx];
ERR_CHECK(PrepareNicTransferResources(cfg, exeDevice, t, rss));
}
#else
return {ERR_FATAL, "RDMA executor is not supported"};
#endif
}
return ERR_NONE;
}
// Teardown-related functions
//========================================================================================
// Clean up all resources
static ErrResult TeardownExecutor(ConfigOptions const& cfg,
ExeDevice const& exeDevice,
vector<Transfer> const& transfers,
ExeInfo& exeInfo)
{
// Loop over each transfer this executor is involved in
for (auto& rss : exeInfo.resources) {
Transfer const& t = transfers[rss.transferIdx];
// Deallocate source memory
for (auto iSrc = std::size_t(0); iSrc < t.srcs.size(); ++iSrc) {
ERR_CHECK(DeallocateMemory(
t.srcs[iSrc].memType, rss.srcMem[iSrc], t.numBytes + cfg.data.byteOffset));
}
// Deallocate destination memory
for (auto iDst = std::size_t(0); iDst < t.dsts.size(); ++iDst) {
ERR_CHECK(DeallocateMemory(
t.dsts[iDst].memType, rss.dstMem[iDst], t.numBytes + cfg.data.byteOffset));
}
// Destroy HSA signal for DMA executor
#if !defined(__NVCC__)
if (exeDevice.exeType == EXE_GPU_DMA && (t.exeSubIndex != -1 || cfg.dma.useHsaCopy)) {
ERR_CHECK(hsa_signal_destroy(rss.signal));
}
#endif
// Destroy NIC related resources
#ifdef NIC_EXEC_ENABLED
if (IsNicExeType(exeDevice.exeType)) { ERR_CHECK(TeardownNicTransferResources(rss)); }
#endif
}
// Teardown additional requirements for GPU-based executors
if (exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA) {
for (auto stream : exeInfo.streams) { ERR_CHECK(hipStreamDestroy(stream)); }
if (cfg.gfx.useHipEvents || cfg.dma.useHipEvents) {
for (auto event : exeInfo.startEvents) { ERR_CHECK(hipEventDestroy(event)); }
for (auto event : exeInfo.stopEvents) { ERR_CHECK(hipEventDestroy(event)); }
}
}
if (exeDevice.exeType == EXE_GPU_GFX) {
#if !defined(__NVCC__)
MemType memType = MEM_GPU;
#else
MemType memType = MEM_MANAGED;
#endif
ERR_CHECK(DeallocateMemory(
memType, exeInfo.subExecParamGpu, exeInfo.totalSubExecs * sizeof(SubExecParam)));
}
return ERR_NONE;
}
// CPU Executor-related functions
//========================================================================================
// Kernel for CPU execution (run by a single subexecutor)
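// Behavior by src/dst counts (as implemented below): numSrcs == 0 fills every dst via memset;
// numSrcs == 1 with no dsts performs a read-only sum (kept live by the NaN check);
// numSrcs == 1 with dsts is a memcpy per dst; otherwise each element becomes the sum across
// all srcs, written to every dst.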
static void CpuReduceKernel(SubExecParam const& p, int numSubIterations)
{
if (p.N == 0) { return; }
int subIteration = 0;
do {
int const& numSrcs = p.numSrcs;
int const& numDsts = p.numDsts;
if (numSrcs == 0) {
for (int i = 0; i < numDsts; ++i) {
memset(p.dst[i], MEMSET_CHAR, p.N * sizeof(float));
// for (int j = 0; j < p.N; j++) p.dst[i][j] = MEMSET_VAL;
}
} else if (numSrcs == 1) {
float const* __restrict__ src = p.src[0];
if (numDsts == 0) {
float sum = 0.0;
for (auto j = std::size_t(0); j < p.N; j++) { sum += p.src[0][j]; }
// Add a dummy check to ensure the read is not optimized out
                if (sum != sum) { printf("[ERROR] NaN detected\n"); }
} else {
for (int i = 0; i < numDsts; ++i) { memcpy(p.dst[i], src, p.N * sizeof(float)); }
}
} else {
float sum = 0.0f;
for (auto j = std::size_t(0); j < p.N; j++) {
sum = p.src[0][j];
for (int i = 1; i < numSrcs; i++) { sum += p.src[i][j]; }
for (int i = 0; i < numDsts; i++) { p.dst[i][j] = sum; }
}
}
} while (++subIteration != numSubIterations);
}
// Execution of a single CPU Transfer
static void ExecuteCpuTransfer(int const iteration,
ConfigOptions const& cfg,
[[maybe_unused]] int const exeIndex,
TransferResources& rss)
{
auto cpuStart = std::chrono::high_resolution_clock::now();
vector<std::thread> childThreads;
for (auto const& subExecParam : rss.subExecParamCpu) {
childThreads.emplace_back(
std::thread(CpuReduceKernel, std::cref(subExecParam), cfg.general.numSubIterations));
}
for (auto& subExecThread : childThreads) { subExecThread.join(); }
childThreads.clear();
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = (std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta)
.count() *
1000.0) /
cfg.general.numSubIterations;
if (iteration >= 0) {
rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) { rss.perIterMsec.push_back(deltaMsec); }
}
}
// Execution of a single CPU executor
static ErrResult RunCpuExecutor(int const iteration,
ConfigOptions const& cfg,
int const exeIndex,
ExeInfo& exeInfo)
{
numa_run_on_node(exeIndex);
auto cpuStart = std::chrono::high_resolution_clock::now();
vector<std::thread> asyncTransfers;
for (auto& rss : exeInfo.resources) {
asyncTransfers.emplace_back(
std::thread(ExecuteCpuTransfer, iteration, std::cref(cfg), exeIndex, std::ref(rss)));
}
for (auto& asyncTransfer : asyncTransfers) { asyncTransfer.join(); }
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() *
1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) { exeInfo.totalDurationMsec += deltaMsec; }
return ERR_NONE;
}
#ifdef NIC_EXEC_ENABLED
// Execution of a single NIC Transfer
static ErrResult ExecuteNicTransfer([[maybe_unused]] int const iteration,
                                    [[maybe_unused]] ConfigOptions const& cfg,
                                    [[maybe_unused]] int const exeIndex,
                                    TransferResources& rss)
{
// Loop over each of the queue pairs and post the send
ibv_send_wr* badWorkReq;
for (int qpIndex = 0; qpIndex < rss.qpCount; qpIndex++) {
int error = ibv_post_send(
rss.srcQueuePairs[qpIndex], &rss.sendWorkRequests[qpIndex], &badWorkReq);
if (error) {
            return {ERR_FATAL,
                    "Transfer %d: Error when calling ibv_post_send for QP %d Error code %d",
                    rss.transferIdx,
                    qpIndex,
                    error};
}
}
return ERR_NONE;
}
// Execution of a single NIC executor
static ErrResult RunNicExecutor(int const iteration,
ConfigOptions const& cfg,
int const exeIndex,
ExeInfo& exeInfo)
{
// Switch to the closest NUMA node to this NIC
if (cfg.nic.useNuma) {
int numaNode = GetIbvDeviceList()[exeIndex].numaNode;
if (numaNode != -1) { numa_run_on_node(numaNode); }
}
auto transferCount = exeInfo.resources.size();
std::vector<double> totalTimeMsec(transferCount, 0.0);
int subIterations = 0;
auto cpuStart = std::chrono::high_resolution_clock::now();
std::vector<std::chrono::high_resolution_clock::time_point> transferTimers(transferCount);
do {
std::vector<uint8_t> receivedQPs(transferCount, 0);
// post the sends
        for (auto i = std::size_t(0); i < transferCount; i++) {
transferTimers[i] = std::chrono::high_resolution_clock::now();
ERR_CHECK(ExecuteNicTransfer(iteration, cfg, exeIndex, exeInfo.resources[i]));
}
// poll for completions
size_t completedTransfers = 0;
while (completedTransfers < transferCount) {
            for (auto i = std::size_t(0); i < transferCount; i++) {
if (receivedQPs[i] < exeInfo.resources[i].qpCount) {
auto& rss = exeInfo.resources[i];
// Poll the completion queue until all queue pairs are complete
// The order of completion doesn't matter because this completion queue is
// dedicated to this Transfer
ibv_wc wc;
int nc = ibv_poll_cq(rss.srcCompQueue, 1, &wc);
if (nc > 0) {
receivedQPs[i]++;
if (wc.status != IBV_WC_SUCCESS) {
return {ERR_FATAL,
"Transfer %d: Received unsuccessful work completion",
rss.transferIdx};
}
} else if (nc < 0) {
return {ERR_FATAL,
"Transfer %d: Received negative work completion",
rss.transferIdx};
}
if (receivedQPs[i] == rss.qpCount) {
auto cpuDelta = std::chrono::high_resolution_clock::now() -
transferTimers[i];
                        double deltaMsec =
                            std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta)
                                .count() *
                            1000.0;
if (iteration >= 0) { totalTimeMsec[i] += deltaMsec; }
completedTransfers++;
}
}
}
}
} while (++subIterations < cfg.general.numSubIterations);
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() *
1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) {
exeInfo.totalDurationMsec += deltaMsec;
        for (auto i = std::size_t(0); i < transferCount; i++) {
auto& rss = exeInfo.resources[i];
double transferTimeMsec = totalTimeMsec[i] / cfg.general.numSubIterations;
rss.totalDurationMsec += transferTimeMsec;
if (cfg.general.recordPerIteration) { rss.perIterMsec.push_back(transferTimeMsec); }
}
}
return ERR_NONE;
}
#endif
// GFX Executor-related functions
//========================================================================================
// Converts register value to a CU/SM index
static uint32_t GetId(uint32_t hwId)
{
#if defined(__NVCC__)
return hwId;
#else
// Based on instinct-mi200-cdna2-instruction-set-architecture.pdf
int const shId = (hwId >> 12) & 1;
int const cuId = (hwId >> 8) & 15;
int const seId = (hwId >> 13) & 3;
return (shId << 5) + (cuId << 2) + seId;
#endif
}
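// Worked example (illustrative, per the CDNA2 HW_ID layout referenced above): a hwId with
// SE_ID = 2 (bits 13-14), SH_ID = 1 (bit 12), and CU_ID = 5 (bits 8-11) decodes to
// (1 << 5) + (5 << 2) + 2 = 54, a unique CU index within the device.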
// Device level timestamp function
__device__ int64_t GetTimestamp()
{
#if defined(__NVCC__)
int64_t result;
asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(result));
return result;
#else
return wall_clock64();
#endif
}
// Helper function for memset
template<typename T>
__device__ __forceinline__ T MemsetVal();
template<>
__device__ __forceinline__ float MemsetVal()
{
return MEMSET_VAL;
}
template<>
__device__ __forceinline__ float2 MemsetVal()
{
return make_float2(MEMSET_VAL, MEMSET_VAL);
}
template<>
__device__ __forceinline__ float4 MemsetVal()
{
return make_float4(MEMSET_VAL, MEMSET_VAL, MEMSET_VAL, MEMSET_VAL);
}
// Helper function for temporal/non-temporal reads / writes
#define TEMPORAL_NONE 0
#define TEMPORAL_LOAD 1
#define TEMPORAL_STORE 2
#define TEMPORAL_BOTH 3
template<int TEMPORAL_MODE>
__device__ __forceinline__ void Load(float const* src, float& dst)
{
if (TEMPORAL_MODE & TEMPORAL_LOAD) {
#if !defined(__NVCC__)
dst = __builtin_nontemporal_load(src);
#endif
} else {
dst = *src;
}
}
template<int TEMPORAL_MODE>
__device__ __forceinline__ void Load(float2 const* src, float2& dst)
{
if (TEMPORAL_MODE & TEMPORAL_LOAD) {
#if !defined(__NVCC__)
dst.x = __builtin_nontemporal_load(&(src->x));
dst.y = __builtin_nontemporal_load(&(src->y));
#endif
} else {
dst = *src;
}
}
template<int TEMPORAL_MODE>
__device__ __forceinline__ void Load(float4 const* src, float4& dst)
{
if (TEMPORAL_MODE & TEMPORAL_LOAD) {
#if !defined(__NVCC__)
dst.x = __builtin_nontemporal_load(&(src->x));
dst.y = __builtin_nontemporal_load(&(src->y));
dst.z = __builtin_nontemporal_load(&(src->z));
dst.w = __builtin_nontemporal_load(&(src->w));
#endif
} else {
dst = *src;
}
}
template<int TEMPORAL_MODE>
__device__ __forceinline__ void Store(float const& src, float* dst)
{
if (TEMPORAL_MODE & TEMPORAL_STORE) {
#if !defined(__NVCC__)
__builtin_nontemporal_store(src, dst);
#endif
} else {
*dst = src;
}
}
template<int TEMPORAL_MODE>
__device__ __forceinline__ void Store(float2 const& src, float2* dst)
{
if (TEMPORAL_MODE & TEMPORAL_STORE) {
#if !defined(__NVCC__)
__builtin_nontemporal_store(src.x, &(dst->x));
__builtin_nontemporal_store(src.y, &(dst->y));
#endif
} else {
*dst = src;
}
}
template<int TEMPORAL_MODE>
__device__ __forceinline__ void Store(float4 const& src, float4* dst)
{
if (TEMPORAL_MODE & TEMPORAL_STORE) {
#if !defined(__NVCC__)
__builtin_nontemporal_store(src.x, &(dst->x));
__builtin_nontemporal_store(src.y, &(dst->y));
__builtin_nontemporal_store(src.z, &(dst->z));
__builtin_nontemporal_store(src.w, &(dst->w));
#endif
} else {
*dst = src;
}
}
// Kernel for GFX execution
template<typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE>
__global__ void __launch_bounds__(BLOCKSIZE)
GpuReduceKernel(SubExecParam* params, int waveOrder, int numSubIterations)
{
int64_t startCycle;
if (threadIdx.x == 0) { startCycle = GetTimestamp(); }
SubExecParam& p = params[blockIdx.y];
// Filter by XCC
#if !defined(__NVCC__)
int32_t xccId;
GetXccId(xccId);
if (p.preferredXccId != -1 && xccId != p.preferredXccId) { return; }
#endif
// Collect data information
int32_t const numSrcs = p.numSrcs;
int32_t const numDsts = p.numDsts;
PACKED_FLOAT const* __restrict__ srcFloatPacked[MAX_SRCS];
PACKED_FLOAT* __restrict__ dstFloatPacked[MAX_DSTS];
for (int i = 0; i < numSrcs; i++) { srcFloatPacked[i] = (PACKED_FLOAT const*)p.src[i]; }
for (int i = 0; i < numDsts; i++) { dstFloatPacked[i] = (PACKED_FLOAT*)p.dst[i]; }
// Operate on wavefront granularity
    int32_t const nTeams = p.teamSize;   // Number of threadblocks working together on this subarray
    int32_t const teamIdx = p.teamIdx;   // Index of this threadblock within the team
    int32_t const nWaves = BLOCKSIZE / warpSize;    // Number of wavefronts within this threadblock
    int32_t const waveIdx = threadIdx.x / warpSize; // Index of this wavefront within the threadblock
    int32_t const tIdx = threadIdx.x % warpSize;    // Thread index within wavefront
size_t const numPackedFloat = p.N / (sizeof(PACKED_FLOAT) / sizeof(float));
int32_t teamStride, waveStride, unrlStride, teamStride2, waveStride2;
switch (waveOrder) {
case 0: /* U,W,C */
unrlStride = 1;
waveStride = UNROLL;
teamStride = UNROLL * nWaves;
teamStride2 = nWaves;
waveStride2 = 1;
break;
case 1: /* U,C,W */
unrlStride = 1;
teamStride = UNROLL;
waveStride = UNROLL * nTeams;
teamStride2 = 1;
waveStride2 = nTeams;
break;
case 2: /* W,U,C */
waveStride = 1;
unrlStride = nWaves;
teamStride = nWaves * UNROLL;
teamStride2 = nWaves;
waveStride2 = 1;
break;
case 3: /* W,C,U */
waveStride = 1;
teamStride = nWaves;
unrlStride = nWaves * nTeams;
teamStride2 = nWaves;
waveStride2 = 1;
break;
case 4: /* C,U,W */
teamStride = 1;
unrlStride = nTeams;
waveStride = nTeams * UNROLL;
teamStride2 = 1;
waveStride2 = nTeams;
break;
case 5: /* C,W,U */
teamStride = 1;
waveStride = nTeams;
unrlStride = nTeams * nWaves;
teamStride2 = 1;
waveStride2 = nTeams;
break;
}
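    // Illustrative example: waveOrder 0 ("U,W,C") with nTeams = 2, nWaves = 4, UNROLL = 2
    // gives unrlStride = 1, waveStride = 2, teamStride = 8 (in units of warpSize packed
    // elements), so each wavefront's UNROLL accesses are adjacent, wavefronts within a team
    // tile next, and teams are outermost. The *2 strides serve the same role for the
    // non-unrolled cleanup loops below.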
int subIterations = 0;
while (1) {
// First loop: Each wavefront in the team works on UNROLL PACKED_FLOAT per thread
size_t const loop1Stride = nTeams * nWaves * UNROLL * warpSize;
size_t const loop1Limit = numPackedFloat / loop1Stride * loop1Stride;
{
PACKED_FLOAT val[UNROLL];
PACKED_FLOAT tmp[UNROLL];
if (numSrcs == 0) {
#pragma unroll
for (int u = 0; u < UNROLL; u++) { val[u] = MemsetVal<PACKED_FLOAT>(); }
}
for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx;
idx < loop1Limit;
idx += loop1Stride) {
// Read sources into memory and accumulate in registers
if (numSrcs) {
#pragma unroll
for (int u = 0; u < UNROLL; u++) {
Load<TEMPORAL_MODE>(&srcFloatPacked[0][idx + u * unrlStride * warpSize],
val[u]);
}
for (int s = 1; s < numSrcs; s++) {
#pragma unroll
for (int u = 0; u < UNROLL; u++) {
Load<TEMPORAL_MODE>(&srcFloatPacked[s][idx + u * unrlStride * warpSize],
tmp[u]);
}
#pragma unroll
for (int u = 0; u < UNROLL; u++) { val[u] += tmp[u]; }
}
}
// Write accumulation to all outputs
for (int d = 0; d < numDsts; d++) {
#pragma unroll
for (int u = 0; u < UNROLL; u++) {
Store<TEMPORAL_MODE>(val[u],
&dstFloatPacked[d][idx + u * unrlStride * warpSize]);
}
}
}
}
// Second loop: Deal with remaining PACKED_FLOAT
{
if (loop1Limit < numPackedFloat) {
PACKED_FLOAT val, tmp;
if (numSrcs == 0) { val = MemsetVal<PACKED_FLOAT>(); }
size_t const loop2Stride = nTeams * nWaves * warpSize;
for (size_t idx = loop1Limit +
(teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx;
idx < numPackedFloat;
idx += loop2Stride) {
if (numSrcs) {
Load<TEMPORAL_MODE>(&srcFloatPacked[0][idx], val);
for (int s = 1; s < numSrcs; s++) {
Load<TEMPORAL_MODE>(&srcFloatPacked[s][idx], tmp);
val += tmp;
}
}
for (int d = 0; d < numDsts; d++) {
Store<TEMPORAL_MODE>(val, &dstFloatPacked[d][idx]);
}
}
}
}
        // Third loop: Deal with remaining floats
{
if (numPackedFloat * (sizeof(PACKED_FLOAT) / sizeof(float)) < p.N) {
float val, tmp;
if (numSrcs == 0) { val = MemsetVal<float>(); }
size_t const loop3Stride = nTeams * nWaves * warpSize;
for (size_t idx = numPackedFloat * (sizeof(PACKED_FLOAT) / sizeof(float)) +
(teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx;
idx < p.N;
idx += loop3Stride) {
if (numSrcs) {
Load<TEMPORAL_MODE>(&p.src[0][idx], val);
for (int s = 1; s < numSrcs; s++) {
Load<TEMPORAL_MODE>(&p.src[s][idx], tmp);
val += tmp;
}
}
for (int d = 0; d < numDsts; d++) { Store<TEMPORAL_MODE>(val, &p.dst[d][idx]); }
}
}
}
if (++subIterations == numSubIterations) { break; }
}
// Wait for all threads to finish
__syncthreads();
if (threadIdx.x == 0) {
__threadfence_system();
p.stopCycle = GetTimestamp();
p.startCycle = startCycle;
GetHwId(p.hwId);
GetXccId(p.xccId);
}
}
#define GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, DWORD) \
{ \
GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_NONE>, \
GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_LOAD>, \
GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_STORE>, \
GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_BOTH> \
}
#define GPU_KERNEL_DWORD_DECL(BLOCKSIZE, UNROLL) \
{ \
GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, float), \
GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, float2), \
GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, float4) \
}
#define GPU_KERNEL_UNROLL_DECL(BLOCKSIZE) \
{ \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 1), GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 2), \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 3), GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 4), \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 5), GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 6), \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 7), GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 8) \
}
// Table of all GPU Reduction kernel functions (templated blocksize / unroll / dword size)
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int);
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL][3][4] = {
GPU_KERNEL_UNROLL_DECL(64),
GPU_KERNEL_UNROLL_DECL(128),
GPU_KERNEL_UNROLL_DECL(192),
GPU_KERNEL_UNROLL_DECL(256),
GPU_KERNEL_UNROLL_DECL(320),
GPU_KERNEL_UNROLL_DECL(384),
GPU_KERNEL_UNROLL_DECL(448),
GPU_KERNEL_UNROLL_DECL(512),
GPU_KERNEL_UNROLL_DECL(576),
GPU_KERNEL_UNROLL_DECL(640),
GPU_KERNEL_UNROLL_DECL(704),
GPU_KERNEL_UNROLL_DECL(768),
GPU_KERNEL_UNROLL_DECL(832),
GPU_KERNEL_UNROLL_DECL(896),
GPU_KERNEL_UNROLL_DECL(960),
GPU_KERNEL_UNROLL_DECL(1024),
};
#undef GPU_KERNEL_UNROLL_DECL
#undef GPU_KERNEL_DWORD_DECL
#undef GPU_KERNEL_TEMPORAL_DECL
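// Illustrative lookup (assuming wordSize = 4 selects float4, as in ExecuteGpuTransfer below):
// a launch with blockSize = 256, unrollFactor = 4, and non-temporal loads and stores resolves
// to GpuKernelTable[256 / 64 - 1][4 - 1][2][TEMPORAL_BOTH], i.e.
// GpuReduceKernel<float4, 256, 4, TEMPORAL_BOTH>.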
// Execute a single GPU Transfer (when using 1 stream per Transfer)
static ErrResult ExecuteGpuTransfer(int const iteration,
hipStream_t const stream,
hipEvent_t const startEvent,
hipEvent_t const stopEvent,
int const xccDim,
ConfigOptions const& cfg,
TransferResources& rss)
{
auto cpuStart = std::chrono::high_resolution_clock::now();
int numSubExecs = rss.subExecParamCpu.size();
dim3 const gridSize(xccDim, numSubExecs, 1);
dim3 const blockSize(cfg.gfx.blockSize, 1);
int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 : cfg.gfx.wordSize == 2 ? 1 : 2;
auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize / 64 - 1][cfg.gfx.unrollFactor - 1]
[wordSizeIdx][cfg.gfx.temporalMode];
#if defined(__NVCC__)
if (startEvent != NULL) { ERR_CHECK(hipEventRecord(startEvent, stream)); }
gpuKernel<<<gridSize, blockSize, 0, stream>>>(
rss.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations);
if (stopEvent != NULL) { ERR_CHECK(hipEventRecord(stopEvent, stream)); }
#else
hipExtLaunchKernelGGL(gpuKernel,
gridSize,
blockSize,
0,
stream,
startEvent,
stopEvent,
0,
rss.subExecParamGpuPtr,
cfg.gfx.waveOrder,
cfg.general.numSubIterations);
#endif
ERR_CHECK(hipStreamSynchronize(stream));
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta)
.count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) {
double deltaMsec = cpuDeltaMsec;
if (startEvent != NULL) {
float gpuDeltaMsec;
ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
deltaMsec = gpuDeltaMsec / cfg.general.numSubIterations;
}
rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) {
rss.perIterMsec.push_back(deltaMsec);
std::set<std::pair<int, int>> CUs;
for (int i = 0; i < numSubExecs; i++) {
CUs.insert(std::make_pair(rss.subExecParamGpuPtr[i].xccId,
GetId(rss.subExecParamGpuPtr[i].hwId)));
}
rss.perIterCUs.push_back(CUs);
}
}
return ERR_NONE;
}
// Execute a single GPU executor
static ErrResult RunGpuExecutor(int const iteration,
ConfigOptions const& cfg,
int const exeIndex,
ExeInfo& exeInfo)
{
auto cpuStart = std::chrono::high_resolution_clock::now();
ERR_CHECK(hipSetDevice(exeIndex));
int xccDim = exeInfo.useSubIndices ? exeInfo.numSubIndices : 1;
if (cfg.gfx.useMultiStream) {
// Launch each Transfer separately in its own stream
vector<std::future<ErrResult>> asyncTransfers;
for (auto i = std::size_t(0); i < exeInfo.streams.size(); i++) {
asyncTransfers.emplace_back(
std::async(std::launch::async,
ExecuteGpuTransfer,
iteration,
exeInfo.streams[i],
cfg.gfx.useHipEvents ? exeInfo.startEvents[i] : NULL,
cfg.gfx.useHipEvents ? exeInfo.stopEvents[i] : NULL,
xccDim,
std::cref(cfg),
std::ref(exeInfo.resources[i])));
}
for (auto& asyncTransfer : asyncTransfers) { ERR_CHECK(asyncTransfer.get()); }
} else {
// Combine all the Transfers into a single kernel launch
int numSubExecs = exeInfo.totalSubExecs;
dim3 const gridSize(xccDim, numSubExecs, 1);
dim3 const blockSize(cfg.gfx.blockSize, 1);
hipStream_t stream = exeInfo.streams[0];
int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 : cfg.gfx.wordSize == 2 ? 1 : 2;
auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize / 64 - 1][cfg.gfx.unrollFactor - 1]
[wordSizeIdx][cfg.gfx.temporalMode];
#if defined(__NVCC__)
if (cfg.gfx.useHipEvents) { ERR_CHECK(hipEventRecord(exeInfo.startEvents[0], stream)); }
gpuKernel<<<gridSize, blockSize, 0, stream>>>(
exeInfo.subExecParamGpu, cfg.gfx.waveOrder, cfg.general.numSubIterations);
if (cfg.gfx.useHipEvents) { ERR_CHECK(hipEventRecord(exeInfo.stopEvents[0], stream)); }
#else
hipExtLaunchKernelGGL(gpuKernel,
gridSize,
blockSize,
0,
stream,
cfg.gfx.useHipEvents ? exeInfo.startEvents[0] : NULL,
cfg.gfx.useHipEvents ? exeInfo.stopEvents[0] : NULL,
0,
exeInfo.subExecParamGpu,
cfg.gfx.waveOrder,
cfg.general.numSubIterations);
#endif
ERR_CHECK(hipStreamSynchronize(stream));
}
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta)
.count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) {
if (cfg.gfx.useHipEvents && !cfg.gfx.useMultiStream) {
float gpuDeltaMsec;
ERR_CHECK(
hipEventElapsedTime(&gpuDeltaMsec, exeInfo.startEvents[0], exeInfo.stopEvents[0]));
gpuDeltaMsec /= cfg.general.numSubIterations;
exeInfo.totalDurationMsec += gpuDeltaMsec;
} else {
exeInfo.totalDurationMsec += cpuDeltaMsec;
}
// Determine timing for each of the individual transfers that were part of this launch
if (!cfg.gfx.useMultiStream) {
for (auto i = std::size_t(0); i < exeInfo.resources.size(); i++) {
TransferResources& rss = exeInfo.resources[i];
long long minStartCycle = std::numeric_limits<long long>::max();
long long maxStopCycle = std::numeric_limits<long long>::min();
std::set<std::pair<int, int>> CUs;
for (auto subExecIdx : rss.subExecIdx) {
minStartCycle = std::min(minStartCycle,
exeInfo.subExecParamGpu[subExecIdx].startCycle);
maxStopCycle = std::max(maxStopCycle,
exeInfo.subExecParamGpu[subExecIdx].stopCycle);
if (cfg.general.recordPerIteration) {
CUs.insert(std::make_pair(exeInfo.subExecParamGpu[subExecIdx].xccId,
GetId(exeInfo.subExecParamGpu[subExecIdx].hwId)));
}
}
double deltaMsec = (maxStopCycle - minStartCycle) / (double)(exeInfo.wallClockRate);
deltaMsec /= cfg.general.numSubIterations;
rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) {
rss.perIterMsec.push_back(deltaMsec);
rss.perIterCUs.push_back(CUs);
}
}
}
}
return ERR_NONE;
}
// DMA Executor-related functions
//========================================================================================
// Execute a single DMA Transfer
static ErrResult ExecuteDmaTransfer(int const iteration,
bool const useSubIndices,
hipStream_t const stream,
hipEvent_t const startEvent,
hipEvent_t const stopEvent,
ConfigOptions const& cfg,
TransferResources& resources)
{
auto cpuStart = std::chrono::high_resolution_clock::now();
int subIterations = 0;
if (!useSubIndices && !cfg.dma.useHsaCopy) {
if (cfg.dma.useHipEvents) { ERR_CHECK(hipEventRecord(startEvent, stream)); }
// Use hipMemcpy
do {
ERR_CHECK(hipMemcpyAsync(resources.dstMem[0],
resources.srcMem[0],
resources.numBytes,
hipMemcpyDefault,
stream));
} while (++subIterations != cfg.general.numSubIterations);
if (cfg.dma.useHipEvents) { ERR_CHECK(hipEventRecord(stopEvent, stream)); }
ERR_CHECK(hipStreamSynchronize(stream));
} else {
#if defined(__NVCC__)
return {ERR_FATAL, "HSA copy not supported on NVIDIA hardware"};
#else
// Use HSA async copy
do {
hsa_signal_store_screlease(resources.signal, 1);
if (!useSubIndices) {
ERR_CHECK(hsa_amd_memory_async_copy(resources.dstMem[0],
resources.dstAgent,
resources.srcMem[0],
resources.srcAgent,
resources.numBytes,
0,
NULL,
resources.signal));
} else {
HSA_CALL(hsa_amd_memory_async_copy_on_engine(resources.dstMem[0],
resources.dstAgent,
resources.srcMem[0],
resources.srcAgent,
resources.numBytes,
0,
NULL,
resources.signal,
resources.sdmaEngineId,
true));
}
// Wait for SDMA transfer to complete
while (hsa_signal_wait_scacquire(resources.signal,
HSA_SIGNAL_CONDITION_LT,
1,
UINT64_MAX,
HSA_WAIT_STATE_ACTIVE) >= 1);
} while (++subIterations != cfg.general.numSubIterations);
#endif
}
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta)
.count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) {
double deltaMsec = cpuDeltaMsec;
if (!useSubIndices && !cfg.dma.useHsaCopy && cfg.dma.useHipEvents) {
float gpuDeltaMsec;
ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
deltaMsec = gpuDeltaMsec / cfg.general.numSubIterations;
}
resources.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) { resources.perIterMsec.push_back(deltaMsec); }
}
return ERR_NONE;
}
// Execute a single DMA executor
static ErrResult RunDmaExecutor(int const iteration,
ConfigOptions const& cfg,
int const exeIndex,
ExeInfo& exeInfo)
{
auto cpuStart = std::chrono::high_resolution_clock::now();
ERR_CHECK(hipSetDevice(exeIndex));
vector<std::future<ErrResult>> asyncTransfers;
for (auto i = std::size_t(0); i < exeInfo.resources.size(); i++) {
asyncTransfers.emplace_back(std::async(std::launch::async,
ExecuteDmaTransfer,
iteration,
exeInfo.useSubIndices,
exeInfo.streams[i],
cfg.dma.useHipEvents ? exeInfo.startEvents[i] : NULL,
cfg.dma.useHipEvents ? exeInfo.stopEvents[i] : NULL,
std::cref(cfg),
std::ref(exeInfo.resources[i])));
}
for (auto& asyncTransfer : asyncTransfers) { ERR_CHECK(asyncTransfer.get()); }
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta)
.count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) { exeInfo.totalDurationMsec += deltaMsec; }
return ERR_NONE;
}
// Executor-related functions
//========================================================================================
static ErrResult RunExecutor(int const iteration,
ConfigOptions const& cfg,
ExeDevice const& exeDevice,
ExeInfo& exeInfo)
{
switch (exeDevice.exeType) {
case EXE_CPU: return RunCpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
case EXE_GPU_GFX: return RunGpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
case EXE_GPU_DMA: return RunDmaExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
#ifdef NIC_EXEC_ENABLED
case EXE_NIC: return RunNicExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
#endif
default: return {ERR_FATAL, "Unsupported executor (%d)", exeDevice.exeType};
}
}
} // End of anonymous namespace
//========================================================================================
/// @endcond
ErrResult::ErrResult(ErrType err) : errType(err), errMsg("") {}
ErrResult::ErrResult(hipError_t err)
{
if (err == hipSuccess) {
this->errType = ERR_NONE;
this->errMsg = "";
} else {
this->errType = ERR_FATAL;
this->errMsg = std::string("HIP Error: ") + hipGetErrorString(err);
}
}
#if !defined(__NVCC__)
ErrResult::ErrResult(hsa_status_t err)
{
if (err == HSA_STATUS_SUCCESS) {
this->errType = ERR_NONE;
this->errMsg = "";
} else {
const char* errString = NULL;
hsa_status_string(err, &errString);
this->errType = ERR_FATAL;
this->errMsg = std::string("HSA Error: ") + errString;
}
}
#endif
ErrResult::ErrResult(ErrType errType, const char* format, ...)
{
this->errType = errType;
va_list args, args_temp;
va_start(args, format);
va_copy(args_temp, args);
int len = vsnprintf(nullptr, 0, format, args);
if (len < 0) {
va_end(args_temp);
va_end(args);
} else {
this->errMsg.resize(len);
vsnprintf(this->errMsg.data(), len + 1, format, args_temp);
}
va_end(args_temp);
va_end(args);
}
bool RunTransfers(ConfigOptions const& cfg,
std::vector<Transfer> const& transfers,
TestResults& results)
{
    // Clear all errors
auto& errResults = results.errResults;
errResults.clear();
// Check for valid configuration
if (ConfigOptionsHaveErrors(cfg, errResults)) { return false; }
// Check for valid transfers
if (TransfersHaveErrors(cfg, transfers, errResults)) { return false; }
// Collect up transfers by executor
int minNumSrcs = MAX_SRCS + 1;
int maxNumSrcs = 0;
size_t maxNumBytes = 0;
std::map<ExeDevice, ExeInfo> executorMap;
for (auto i = std::size_t(0); i < transfers.size(); i++) {
Transfer const& t = transfers[i];
ExeDevice exeDevice;
ERR_APPEND(GetActualExecutor(cfg, t.exeDevice, exeDevice), errResults);
TransferResources resource = {};
resource.transferIdx = i;
ExeInfo& exeInfo = executorMap[exeDevice];
exeInfo.totalBytes += t.numBytes;
exeInfo.totalSubExecs += t.numSubExecs;
exeInfo.useSubIndices |= (t.exeSubIndex != -1 || (t.exeDevice.exeType == EXE_GPU_GFX &&
!cfg.gfx.prefXccTable.empty()));
exeInfo.resources.push_back(resource);
minNumSrcs = std::min(minNumSrcs, (int)t.srcs.size());
maxNumSrcs = std::max(maxNumSrcs, (int)t.srcs.size());
maxNumBytes = std::max(maxNumBytes, t.numBytes);
}
// Loop over each executor and prepare
// - Allocates memory for each Transfer
// - Set up work for subexecutors
vector<TransferResources*> transferResources;
for (auto& exeInfoPair : executorMap) {
ExeDevice const& exeDevice = exeInfoPair.first;
ExeInfo& exeInfo = exeInfoPair.second;
ERR_APPEND(PrepareExecutor(cfg, transfers, exeDevice, exeInfo), errResults);
for (auto& resource : exeInfo.resources) { transferResources.push_back(&resource); }
}
// Prepare reference src/dst arrays - only once for largest size
size_t maxN = maxNumBytes / sizeof(float);
vector<float> outputBuffer(maxN);
vector<vector<float>> dstReference(maxNumSrcs + 1, vector<float>(maxN));
{
size_t initOffset = cfg.data.byteOffset / sizeof(float);
vector<vector<float>> srcReference(maxNumSrcs, vector<float>(maxN));
memset(dstReference[0].data(), MEMSET_CHAR, maxNumBytes);
for (int numSrcs = 0; numSrcs < maxNumSrcs; numSrcs++) {
PrepareReference(cfg, srcReference[numSrcs], numSrcs);
for (auto i = std::size_t(0); i < maxN; i++) {
dstReference[numSrcs + 1][i] = (numSrcs == 0 ? 0 : dstReference[numSrcs][i]) +
srcReference[numSrcs][i];
}
}
        // Release unused partial sums
for (int numSrcs = 0; numSrcs < minNumSrcs; numSrcs++) { dstReference[numSrcs].clear(); }
// Initialize all src memory buffers
for (auto resource : transferResources) {
for (auto srcIdx = std::size_t(0); srcIdx < resource->srcMem.size(); srcIdx++) {
ERR_APPEND(hipMemcpy(resource->srcMem[srcIdx] + initOffset,
srcReference[srcIdx].data(),
resource->numBytes,
hipMemcpyDefault),
errResults);
}
}
}
    // Pause before starting when running in interactive mode
if (cfg.general.useInteractive) {
printf("Memory prepared:\n");
for (auto i = std::size_t(0); i < transfers.size(); i++) {
ExeInfo const& exeInfo = executorMap[transfers[i].exeDevice];
printf("Transfer %03zu:\n", i);
for (auto iSrc = std::size_t(0); iSrc < transfers[i].srcs.size(); ++iSrc) {
printf(" SRC %0zu: %p\n",
iSrc,
static_cast<void*>(transferResources[i]->srcMem[iSrc]));
}
for (auto iDst = std::size_t(0); iDst < transfers[i].dsts.size(); ++iDst) {
printf(" DST %0zu: %p\n",
iDst,
static_cast<void*>(transferResources[i]->dstMem[iDst]));
}
}
printf("Hit <Enter> to continue: ");
        if (scanf("%*c") != 0) {
            printf("[ERROR] Unexpected input\n");
            return false;
        }
printf("\n");
}
// Perform iterations
size_t numTimedIterations = 0;
double totalCpuTimeSec = 0.0;
for (int iteration = -cfg.general.numWarmups;; iteration++) {
// Stop if number of iterations/seconds has reached limit
if (cfg.general.numIterations > 0 && iteration >= cfg.general.numIterations) { break; }
if (cfg.general.numIterations < 0 && totalCpuTimeSec > -cfg.general.numIterations) {
break;
}
// Start CPU timing for this iteration
auto cpuStart = std::chrono::high_resolution_clock::now();
// Execute all Transfers in parallel
std::vector<std::future<ErrResult>> asyncExecutors;
for (auto& exeInfoPair : executorMap) {
asyncExecutors.emplace_back(std::async(std::launch::async,
RunExecutor,
iteration,
std::cref(cfg),
std::cref(exeInfoPair.first),
std::ref(exeInfoPair.second)));
}
// Wait for all threads to finish
for (auto& asyncExecutor : asyncExecutors) { ERR_APPEND(asyncExecutor.get(), errResults); }
// Stop CPU timing for this iteration
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta)
.count() /
cfg.general.numSubIterations;
if (cfg.data.alwaysValidate) {
ERR_APPEND(
ValidateAllTransfers(cfg, transfers, transferResources, dstReference, outputBuffer),
errResults);
}
if (iteration >= 0) {
++numTimedIterations;
totalCpuTimeSec += deltaSec;
}
}
// Pause for interactive mode
if (cfg.general.useInteractive) {
printf("Transfers complete. Hit <Enter> to continue: ");
        if (scanf("%*c") != 0) {
            printf("[ERROR] Unexpected input\n");
            return false;
        }
printf("\n");
}
// Validate results
if (!cfg.data.alwaysValidate) {
ERR_APPEND(
ValidateAllTransfers(cfg, transfers, transferResources, dstReference, outputBuffer),
errResults);
}
// Prepare results
results.exeResults.clear();
results.tfrResults.clear();
results.tfrResults.resize(transfers.size());
results.numTimedIterations = numTimedIterations;
results.totalBytesTransferred = 0;
results.avgTotalDurationMsec = (totalCpuTimeSec * 1000.0) / numTimedIterations;
results.overheadMsec = results.avgTotalDurationMsec;
for (auto& exeInfoPair : executorMap) {
ExeDevice const& exeDevice = exeInfoPair.first;
ExeInfo& exeInfo = exeInfoPair.second;
// Copy over executor results
ExeResult& exeResult = results.exeResults[exeDevice];
exeResult.numBytes = exeInfo.totalBytes;
exeResult.avgDurationMsec = exeInfo.totalDurationMsec / numTimedIterations;
exeResult.avgBandwidthGbPerSec = (exeResult.numBytes / 1.0e6) / exeResult.avgDurationMsec;
exeResult.sumBandwidthGbPerSec = 0.0;
exeResult.transferIdx.clear();
results.totalBytesTransferred += exeInfo.totalBytes;
results.overheadMsec = std::min(results.overheadMsec,
(results.avgTotalDurationMsec - exeResult.avgDurationMsec));
// Copy over transfer results
for (auto const& rss : exeInfo.resources) {
int const transferIdx = rss.transferIdx;
exeResult.transferIdx.push_back(transferIdx);
TransferResult& tfrResult = results.tfrResults[transferIdx];
tfrResult.exeDevice = exeDevice;
#ifdef NIC_EXEC_ENABLED
tfrResult.exeDstDevice = {exeDevice.exeType, rss.dstNicIndex};
#else
tfrResult.exeDstDevice = exeDevice;
#endif
tfrResult.numBytes = rss.numBytes;
tfrResult.avgDurationMsec = rss.totalDurationMsec / numTimedIterations;
tfrResult.avgBandwidthGbPerSec = (rss.numBytes / 1.0e6) / tfrResult.avgDurationMsec;
if (cfg.general.recordPerIteration) {
tfrResult.perIterMsec = rss.perIterMsec;
tfrResult.perIterCUs = rss.perIterCUs;
}
exeResult.sumBandwidthGbPerSec += tfrResult.avgBandwidthGbPerSec;
}
}
results.avgTotalBandwidthGbPerSec = (results.totalBytesTransferred / 1.0e6) /
results.avgTotalDurationMsec;
// Teardown executors
for (auto& exeInfoPair : executorMap) {
ExeDevice const& exeDevice = exeInfoPair.first;
ExeInfo& exeInfo = exeInfoPair.second;
ERR_APPEND(TeardownExecutor(cfg, exeDevice, transfers, exeInfo), errResults);
}
return true;
}
int GetIntAttribute(IntAttribute attribute)
{
switch (attribute) {
case ATR_GFX_MAX_BLOCKSIZE: return MAX_BLOCKSIZE;
case ATR_GFX_MAX_UNROLL: return MAX_UNROLL;
default: return -1;
}
}
std::string GetStrAttribute(StrAttribute attribute)
{
switch (attribute) {
case ATR_SRC_PREP_DESCRIPTION:
return "Element i = ((i * 517) modulo 383 + 31) * (srcBufferIdx + 1)";
default: return "";
}
}
ErrResult ParseTransfers(std::string line, std::vector<Transfer>& transfers)
{
    // Replace round brackets, colons, and '->' with spaces
    // (start at index 1 so a leading '-' on the transfer count is preserved)
for (int i = 1; line[i]; i++) {
if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == ':' ||
line[i] == '>') {
line[i] = ' ';
}
}
transfers.clear();
// Read in number of transfers
int numTransfers = 0;
std::istringstream iss(line);
iss >> numTransfers;
if (iss.fail()) { return ERR_NONE; }
// If numTransfers < 0, read 5-tuple (srcMem, exeMem, dstMem, #CUs, #Bytes)
// otherwise read triples (srcMem, exeMem, dstMem)
bool const advancedMode = (numTransfers < 0);
numTransfers = abs(numTransfers);
int numSubExecs;
std::string srcStr, exeStr, dstStr, numBytesToken;
if (!advancedMode) {
iss >> numSubExecs;
if (numSubExecs < 0 || iss.fail()) {
return {ERR_FATAL,
"Parsing error: Number of blocks to use (%d) must be non-negative",
numSubExecs};
}
}
for (int i = 0; i < numTransfers; i++) {
Transfer transfer;
if (!advancedMode) {
iss >> srcStr >> exeStr >> dstStr;
transfer.numSubExecs = numSubExecs;
if (iss.fail()) {
return {ERR_FATAL,
"Parsing error: Unable to read valid Transfer %d (SRC EXE DST) triplet",
i + 1};
}
transfer.numBytes = 0;
} else {
iss >> srcStr >> exeStr >> dstStr >> transfer.numSubExecs >> numBytesToken;
if (iss.fail()) {
return {ERR_FATAL,
"Parsing error: Unable to read valid Transfer %d (SRC EXE DST $CU #Bytes) "
"tuple",
i + 1};
}
if (sscanf(numBytesToken.c_str(), "%lu", &transfer.numBytes) != 1) {
return {ERR_FATAL,
"Parsing error: Unable to read valid Transfer %d (SRC EXE DST #CU #Bytes) "
"tuple",
i + 1};
}
char units = numBytesToken.back();
// Apply binary suffix multipliers; cases intentionally fall through (G -> M -> K)
switch (toupper(units)) {
case 'G': transfer.numBytes *= 1024; [[fallthrough]];
case 'M': transfer.numBytes *= 1024; [[fallthrough]];
case 'K': transfer.numBytes *= 1024;
}
}
ERR_CHECK(ParseMemType(srcStr, transfer.srcs));
ERR_CHECK(ParseMemType(dstStr, transfer.dsts));
ERR_CHECK(ParseExeType(exeStr, transfer.exeDevice, transfer.exeSubIndex));
transfers.push_back(transfer);
}
return ERR_NONE;
}
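// Minimal usage sketch for ParseTransfers (hypothetical input lines, assuming the
// usual "C<n>"/"G<n>" memory tokens accepted by ParseMemType):
//
//   std::vector<Transfer> transfers;
//   // Simple mode: 1 Transfer using 4 subexecutors, reading from CPU 0 via
//   // GPU 0 and writing to GPU 1 (numBytes is filled in later by the caller)
//   ParseTransfers("1 4 (C0->G0->G1)", transfers);
//   // Advanced mode (negative count): explicit #CUs and #Bytes per Transfer
//   ParseTransfers("-1 (C0->G0->G1) 8 64M", transfers); // 64 MiB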
auto GetTransferBenchVersion() -> const std::string
{
auto tb_version = std::string(TB_HEADER_VERSION);
if (tb_version.empty()) { tb_version = std::string(TB_UNKNOWN_VERSION); }
return tb_version;
}
auto GetTransferBenchBranch() -> const std::string
{
auto tb_branch = std::string(TB_GIT_BRANCH);
if (tb_branch.empty()) { tb_branch = std::string(TB_UNKNOWN_VERSION); }
return tb_branch;
}
auto GetTransferBenchCommitHash([[maybe_unused]] bool is_long_commit) -> const std::string
{
constexpr auto TB_GIT_SHORT_COMMIT_SIZE = std::size_t(8);
auto tb_commit = std::string(TB_GIT_COMMIT);
if (tb_commit.empty()) {
tb_commit = std::string(TB_UNKNOWN_VERSION);
return tb_commit;
}
if (!is_long_commit && tb_commit.length() >= TB_GIT_SHORT_COMMIT_SIZE) {
tb_commit = tb_commit.substr(0, (TB_GIT_SHORT_COMMIT_SIZE - 1));
}
return tb_commit;
}
int GetNumExecutors(ExeType exeType)
{
switch (exeType) {
case EXE_CPU: return numa_num_configured_nodes();
case EXE_GPU_GFX:
case EXE_GPU_DMA: {
int numDetectedGpus = 0;
hipError_t status = hipGetDeviceCount(&numDetectedGpus);
if (status != hipSuccess) { numDetectedGpus = 0; }
return numDetectedGpus;
}
#ifdef NIC_EXEC_ENABLED
case EXE_NIC:
case EXE_NIC_NEAREST: {
return GetIbvDeviceList().size();
}
#endif
default: return 0;
}
}
int GetNumSubExecutors(ExeDevice exeDevice)
{
int const& exeIndex = exeDevice.exeIndex;
switch (exeDevice.exeType) {
case EXE_CPU: {
int numCores = 0;
for (int i = 0; i < numa_num_configured_cpus(); i++) {
if (numa_node_of_cpu(i) == exeIndex) { numCores++; }
}
return numCores;
}
case EXE_GPU_GFX: {
int numGpus = GetNumExecutors(EXE_GPU_GFX);
if (exeIndex < 0 || numGpus <= exeIndex) { return 0; }
int numDeviceCUs = 0;
hipError_t status = hipDeviceGetAttribute(
&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, exeIndex);
if (status != hipSuccess) { numDeviceCUs = 0; }
return numDeviceCUs;
}
case EXE_GPU_DMA: {
return 1;
}
default: return 0;
}
}
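// Usage sketch (illustrative): enumerate GFX executors and their subexecutor
// (CU) counts via the queries above.
//
//   int numGpus = GetNumExecutors(EXE_GPU_GFX);
//   for (int i = 0; i < numGpus; i++) {
//     printf("GPU %02d: %d CUs\n", i, GetNumSubExecutors({EXE_GPU_GFX, i}));
//   }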
int GetNumExecutorSubIndices(ExeDevice exeDevice)
{
// Executor subindices are not supported on NVIDIA hardware
#if defined(__NVCC__)
return 0;
#else
int const& exeIndex = exeDevice.exeIndex;
switch (exeDevice.exeType) {
case EXE_CPU: return 0;
case EXE_GPU_GFX: {
hsa_agent_t agent;
ErrResult err = GetHsaAgent(exeDevice, agent);
if (err.errType != ERR_NONE) { return 0; }
int numXccs = 1;
if (hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_XCC, &numXccs) !=
HSA_STATUS_SUCCESS) {
return 1;
}
return numXccs;
}
case EXE_GPU_DMA: {
std::set<int> engineIds;
ErrResult err;
// Get HSA agent for this GPU
hsa_agent_t agent;
err = GetHsaAgent(exeDevice, agent);
if (err.errType != ERR_NONE) { return 0; }
int numTotalEngines = 0, numEnginesA = 0, numEnginesB = 0;
if (hsa_agent_get_info(agent,
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SDMA_ENG,
&numEnginesA) == HSA_STATUS_SUCCESS) {
numTotalEngines += numEnginesA;
}
if (hsa_agent_get_info(agent,
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SDMA_XGMI_ENG,
&numEnginesB) == HSA_STATUS_SUCCESS) {
numTotalEngines += numEnginesB;
}
return numTotalEngines;
}
default: return 0;
}
#endif
}
int GetClosestCpuNumaToGpu(int gpuIndex)
{
// Closest NUMA is not supported on NVIDIA hardware at this time
#if defined(__NVCC__)
return -1;
#else
hsa_agent_t gpuAgent;
ErrResult err = GetHsaAgent({EXE_GPU_GFX, gpuIndex}, gpuAgent);
if (err.errType != ERR_NONE) { return -1; }
hsa_agent_t closestCpuAgent;
if (hsa_agent_get_info(gpuAgent,
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_NEAREST_CPU,
&closestCpuAgent) == HSA_STATUS_SUCCESS) {
int numCpus = GetNumExecutors(EXE_CPU);
for (int i = 0; i < numCpus; i++) {
hsa_agent_t cpuAgent;
err = GetHsaAgent({EXE_CPU, i}, cpuAgent);
if (err.errType != ERR_NONE) { return -1; }
if (cpuAgent.handle == closestCpuAgent.handle) { return i; }
}
}
return -1;
#endif
}
int GetClosestCpuNumaToNic([[maybe_unused]] int nicIndex)
{
#ifdef NIC_EXEC_ENABLED
int numNics = GetNumExecutors(EXE_NIC);
if (nicIndex < 0 || nicIndex >= numNics) { return -1; }
return GetIbvDeviceList()[nicIndex].numaNode;
#else
return -1;
#endif
}
int GetClosestNicToGpu([[maybe_unused]] int gpuIndex)
{
#ifdef NIC_EXEC_ENABLED
static bool isInitialized = false;
static std::vector<int> closestNicId;
int numGpus = GetNumExecutors(EXE_GPU_GFX);
if (gpuIndex < 0 || gpuIndex >= numGpus) { return -1; }
// Build closest NICs per GPU on first use
if (!isInitialized) {
closestNicId.resize(numGpus, -1);
// Build up list of NIC bus addresses
std::vector<std::string> ibvAddressList;
auto const& ibvDeviceList = GetIbvDeviceList();
for (auto const& ibvDevice : ibvDeviceList) {
ibvAddressList.push_back(ibvDevice.hasActivePort ? ibvDevice.busId : "");
}
// Track how many times a device has been assigned as "closest"
// This distributes work across devices that use multiple ports (sharing the same busId)
// NOTE: This greedy assignment isn't necessarily optimal, but is likely to work in most
// multi-port cases. Counter-example:
//
// G0 prefers (N0,N1), picks N0
// G1 prefers (N1,N2), picks N1
// G2 prefers N0, picks N0
//
// instead of the optimal G0->N1, G1->N2, G2->N0
std::vector<int> assignedCount(ibvDeviceList.size(), 0);
// Loop over each GPU to find the closest NIC(s) based on PCIe address
for (int i = 0; i < numGpus; i++) {
// Collect PCIe address for the GPU
char hipPciBusId[64];
hipError_t err = hipDeviceGetPCIBusId(hipPciBusId, sizeof(hipPciBusId), i);
if (err != hipSuccess) {
#ifdef VERBS_DEBUG
printf(
"Failed to get PCI Bus ID for HIP device %d: %s\n", i, hipGetErrorString(err));
#endif
closestNicId[i] = -1;
continue;
}
// Find closest NICs
std::set<int> closestNicIdxs = GetNearestDevicesInTree(hipPciBusId, ibvAddressList);
// Pick the least-used NIC to assign as closest
int closestIdx = -1;
for (auto idx : closestNicIdxs) {
if (closestIdx == -1 || assignedCount[idx] < assignedCount[closestIdx]) {
closestIdx = idx;
}
}
// Fall back to PCIe bus-ID distance to determine the closest NIC
// if the PCIe tree approach fails
if (closestIdx < 0) {
#ifdef VERBS_DEBUG
printf("[WARN] Falling back to PCIe bus ID distance to determine proximity\n");
#endif
int minDistance = std::numeric_limits<int>::max();
for (int j = 0; j < ibvDeviceList.size(); j++) {
if (ibvDeviceList[j].busId != "") {
int distance = GetBusIdDistance(hipPciBusId, ibvDeviceList[j].busId);
if (distance < minDistance && distance >= 0) {
minDistance = distance;
closestIdx = j;
}
}
}
}
closestNicId[i] = closestIdx;
if (closestIdx != -1) { assignedCount[closestIdx]++; }
}
isInitialized = true;
}
return closestNicId[gpuIndex];
#else
return -1;
#endif
}
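// Usage sketch (illustrative): report the closest NIC chosen for each GPU.
//
//   for (int g = 0; g < GetNumExecutors(EXE_GPU_GFX); g++) {
//     printf("GPU %02d -> NIC %d\n", g, GetClosestNicToGpu(g)); // -1 if none found
//   }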
// Undefine CUDA compatibility macros
#if defined(__NVCC__)
// clang-format off
// ROCm specific
#undef wall_clock64
#undef gcnArchName
// Datatypes
#undef hipDeviceProp_t
#undef hipError_t
#undef hipEvent_t
#undef hipStream_t
// Enumerations
#undef hipDeviceAttributeClockRate
#undef hipDeviceAttributeMaxSharedMemoryPerMultiprocessor
#undef hipDeviceAttributeMultiprocessorCount
#undef hipErrorPeerAccessAlreadyEnabled
#undef hipFuncCachePreferShared
#undef hipMemcpyDefault
#undef hipMemcpyDeviceToHost
#undef hipMemcpyHostToDevice
#undef hipSuccess
// Functions
#undef hipDeviceCanAccessPeer
#undef hipDeviceEnablePeerAccess
#undef hipDeviceGetAttribute
#undef hipDeviceGetPCIBusId
#undef hipDeviceSetCacheConfig
#undef hipDeviceSynchronize
#undef hipEventCreate
#undef hipEventDestroy
#undef hipEventElapsedTime
#undef hipEventRecord
#undef hipFree
#undef hipGetDeviceCount
#undef hipGetDeviceProperties
#undef hipGetErrorString
#undef hipHostFree
#undef hipHostMalloc
#undef hipMalloc
#undef hipMallocManaged
#undef hipMemcpy
#undef hipMemcpyAsync
#undef hipMemset
#undef hipMemsetAsync
#undef hipSetDevice
#undef hipStreamCreate
#undef hipStreamDestroy
#undef hipStreamSynchronize
#endif
// Kernel macros
#undef GetHwId
#undef GetXccId
// Undefine helper macros
#undef ERR_CHECK
#undef ERR_APPEND
// clang-format on
} // namespace TransferBench
/*
* SPDX-License-Identifier: MIT License
*
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
/**
* Note: This file is used by shared/static library builds.
* Implementation that cannot live in the header (e.g., non-inline functions,
* non-template code, or code that must not be exposed to users) can be
* placed here.
*/
#define TRANSFERBENCH_HEADER_IMPLEMENTATION_DETAILS
#include <TransferBench.hpp>
/*
Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "Client.hpp"
#include "Presets.hpp"
#include "Topology.hpp"
#include <fstream>
int main(int argc, char **argv) {
// Collect environment variables
EnvVars ev;
// Display usage instructions and detected topology
if (argc <= 1) {
if (!ev.outputToCsv) {
DisplayUsage(argv[0]);
DisplayPresets();
}
DisplayTopology(ev.outputToCsv);
exit(0);
}
// Determine number of bytes to run per Transfer
size_t numBytesPerTransfer = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_TRANSFER;
if (argc > 2) {
// Adjust bytes if unit specified
char units = argv[2][strlen(argv[2])-1];
// Apply binary suffix multipliers; cases intentionally fall through (G -> M -> K)
switch (units) {
case 'G': case 'g': numBytesPerTransfer *= 1024; [[fallthrough]];
case 'M': case 'm': numBytesPerTransfer *= 1024; [[fallthrough]];
case 'K': case 'k': numBytesPerTransfer *= 1024;
}
}
if (numBytesPerTransfer % 4) {
printf("[ERROR] numBytesPerTransfer (%lu) must be a multiple of 4\n", numBytesPerTransfer);
exit(1);
}
// Run preset benchmark if requested
if (RunPreset(ev, numBytesPerTransfer, argc, argv)) exit(0);
// Read input from command line or configuration file
std::vector<std::string> lines;
{
std::string line;
if (!strcmp(argv[1], "cmdline")) {
for (int i = 3; i < argc; i++)
line += std::string(argv[i]) + " ";
lines.push_back(line);
} else {
std::ifstream cfgFile(argv[1]);
if (!cfgFile.is_open()) {
printf("[ERROR] Unable to open transfer configuration file: [%s]\n", argv[1]);
exit(1);
}
while (std::getline(cfgFile, line))
lines.push_back(line);
cfgFile.close();
}
}
// Print environment variables and CSV header
ev.DisplayEnvVars();
if (ev.outputToCsv)
printf("Test#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),SrcAddr,DstAddr\n");
TransferBench::ConfigOptions cfgOptions = ev.ToConfigOptions();
TransferBench::TestResults results;
std::vector<ErrResult> errors;
// Process each line as a Test
int testNum = 0;
for (std::string const &line : lines) {
// Check if line is a comment to be echoed to output (starts with ##)
if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') printf("%s\n", line.c_str());
// Parse set of parallel Transfers to execute
std::vector<Transfer> transfers;
CheckForError(TransferBench::ParseTransfers(line, transfers));
if (transfers.empty()) continue;
// Check for variable sub-executors Transfers
int numVariableTransfers = 0;
int maxVarCount = 0;
{
std::map<ExeDevice, int> varTransferCount;
for (auto const& t : transfers) {
if (t.numSubExecs == 0) {
if (t.exeDevice.exeType != EXE_GPU_GFX) {
printf("[ERROR] Variable number of subexecutors is only supported on GFX executors\n");
exit(1);
}
numVariableTransfers++;
varTransferCount[t.exeDevice]++;
maxVarCount = std::max(maxVarCount, varTransferCount[t.exeDevice]);
}
}
if (numVariableTransfers > 0 && numVariableTransfers != (int)transfers.size()) {
printf("[ERROR] Either all or none of the Transfers in a Test must use a variable number of subexecutors\n");
exit(1);
}
}
// Track which transfers have already numBytes specified
std::vector<bool> bytesSpecified(transfers.size());
bool hasUnspecified = false;
for (int i = 0; i < transfers.size(); i++) {
bytesSpecified[i] = (transfers[i].numBytes != 0);
if (transfers[i].numBytes == 0) hasUnspecified = true;
}
// Run the specified numbers of bytes otherwise generate a range of values
for (size_t bytes = (1<<10); bytes <= (1<<29); bytes *= 2) {
size_t deltaBytes = std::max(1UL, bytes / ev.samplingFactor);
size_t currBytes = (numBytesPerTransfer == 0) ? bytes : numBytesPerTransfer;
do {
for (int i = 0; i < transfers.size(); i++) {
if (!bytesSpecified[i])
transfers[i].numBytes = currBytes;
}
if (maxVarCount == 0) {
if (TransferBench::RunTransfers(cfgOptions, transfers, results)) {
PrintResults(ev, ++testNum, transfers, results);
}
PrintErrors(results.errResults);
} else {
// Variable subexecutors - Determine how many subexecutors to sweep up to
int maxNumVarSubExec = ev.maxNumVarSubExec;
if (maxNumVarSubExec == 0) {
maxNumVarSubExec = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0}) / maxVarCount;
}
TransferBench::TestResults bestResults;
std::vector<Transfer> bestTransfers;
for (int numSubExecs = ev.minNumVarSubExec; numSubExecs <= maxNumVarSubExec; numSubExecs++) {
std::vector<Transfer> tempTransfers = transfers;
for (auto& t : tempTransfers) {
if (t.numSubExecs == 0) t.numSubExecs = numSubExecs;
}
TransferBench::TestResults tempResults;
if (!TransferBench::RunTransfers(cfgOptions, tempTransfers, tempResults)) {
PrintErrors(tempResults.errResults);
} else {
if (tempResults.avgTotalBandwidthGbPerSec > bestResults.avgTotalBandwidthGbPerSec) {
bestResults = tempResults;
bestTransfers = tempTransfers;
}
}
}
PrintResults(ev, ++testNum, bestTransfers, bestResults);
PrintErrors(bestResults.errResults);
}
if (numBytesPerTransfer != 0 || !hasUnspecified) break;
currBytes += deltaBytes;
} while (currBytes < bytes * 2);
if (numBytesPerTransfer != 0 || !hasUnspecified) break;
}
}
}
void DisplayUsage(char const* cmdName)
{
std::string nicSupport = "";
#if NIC_EXEC_ENABLED
nicSupport = " (with NIC support)";
#endif
printf("TransferBench v%s.%s%s\n", TransferBench::VERSION, CLIENT_VERSION, nicSupport.c_str());
printf("========================================\n");
if (numa_available() == -1) {
printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
exit(1);
}
printf("Usage: %s config <N>\n", cmdName);
printf(" config: Either:\n");
printf(" - Filename of configFile containing Transfers to execute (see example.cfg for format)\n");
printf(" - Name of preset config:\n");
printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
DEFAULT_BYTES_PER_TRANSFER);
printf(" If 0 is specified, a range of Ns will be benchmarked\n");
printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n");
printf("\n");
EnvVars::DisplayUsage();
}
std::string MemDevicesToStr(std::vector<MemDevice> const& memDevices) {
if (memDevices.empty()) return "N";
std::stringstream ss;
for (auto const& m : memDevices)
ss << TransferBench::MemTypeStr[m.memType] << m.memIndex;
return ss.str();
}
void PrintResults(EnvVars const& ev, int const testNum,
std::vector<Transfer> const& transfers,
TransferBench::TestResults const& results)
{
char sep = ev.outputToCsv ? ',' : '|';
size_t numTimedIterations = results.numTimedIterations;
if (!ev.outputToCsv) printf("Test %d:\n", testNum);
// Loop over each executor
for (auto exeInfoPair : results.exeResults) {
ExeDevice const& exeDevice = exeInfoPair.first;
ExeResult const& exeResult = exeInfoPair.second;
ExeType const exeType = exeDevice.exeType;
int32_t const exeIndex = exeDevice.exeIndex;
printf(" Executor: %3s %02d %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c %-7.3f GB/s (sum)\n",
ExeTypeName[exeType], exeIndex, sep, exeResult.avgBandwidthGbPerSec, sep,
exeResult.avgDurationMsec, sep, exeResult.numBytes, sep, exeResult.sumBandwidthGbPerSec);
// Loop over each Transfer handled by this executor
for (int idx : exeResult.transferIdx) {
Transfer const& t = transfers[idx];
TransferResult const& r = results.tfrResults[idx];
char exeSubIndexStr[32] = "";
if (t.exeSubIndex != -1)
sprintf(exeSubIndexStr, ".%d", t.exeSubIndex);
printf(" Transfer %02d %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c %s -> %c%03d%s:%03d -> %s\n",
idx, sep,
r.avgBandwidthGbPerSec, sep,
r.avgDurationMsec, sep,
r.numBytes, sep,
MemDevicesToStr(t.srcs).c_str(),
TransferBench::ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
exeSubIndexStr, t.numSubExecs,
MemDevicesToStr(t.dsts).c_str());
// Show per-iteration timing information
if (ev.showIterations) {
// Check that per-iteration information exists
if (r.perIterMsec.size() != numTimedIterations) {
printf("[ERROR] Per iteration timing data unavailable: Expected %lu data points, but have %lu\n",
numTimedIterations, r.perIterMsec.size());
exit(1);
}
// Compute standard deviation and track iterations by speed
std::set<std::pair<double, int>> times;
double stdDevTime = 0;
double stdDevBw = 0;
for (int i = 0; i < numTimedIterations; i++) {
times.insert(std::make_pair(r.perIterMsec[i], i+1));
double const varTime = fabs(r.avgDurationMsec - r.perIterMsec[i]);
stdDevTime += varTime * varTime;
double iterBandwidthGbs = (t.numBytes / 1.0E9) / r.perIterMsec[i] * 1000.0f;
double const varBw = fabs(iterBandwidthGbs - r.avgBandwidthGbPerSec);
stdDevBw += varBw * varBw;
}
stdDevTime = sqrt(stdDevTime / numTimedIterations);
stdDevBw = sqrt(stdDevBw / numTimedIterations);
// Loop over iterations (fastest to slowest)
for (auto& time : times) {
double iterDurationMsec = time.first;
double iterBandwidthGbs = (t.numBytes / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d %c %8.3f GB/s %c %8.3f ms %c", time.second, sep, iterBandwidthGbs, sep, iterDurationMsec, sep);
std::set<int> usedXccs;
if (time.second - 1 < r.perIterCUs.size()) {
printf(" CUs:");
for (auto x : r.perIterCUs[time.second - 1]) {
printf(" %02d:%02d", x.first, x.second);
usedXccs.insert(x.first);
}
}
printf(" XCCs:");
for (auto x : usedXccs)
printf(" %02d", x);
printf("\n");
}
printf(" StandardDev %c %8.3f GB/s %c %8.3f ms %c\n", sep, stdDevBw, sep, stdDevTime, sep);
}
}
}
printf(" Aggregate (CPU) %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c Overhead: %.3f ms\n",
sep, results.avgTotalBandwidthGbPerSec,
sep, results.avgTotalDurationMsec,
sep, results.totalBytesTransferred,
sep, results.overheadMsec);
}
void CheckForError(ErrResult const& error)
{
switch (error.errType) {
case ERR_NONE: return;
case ERR_WARN:
printf("[WARN] %s\n", error.errMsg.c_str());
return;
case ERR_FATAL:
printf("[ERROR] %s\n", error.errMsg.c_str());
exit(1);
default:
break;
}
}
void PrintErrors(std::vector<ErrResult> const& errors)
{
bool isFatal = false;
for (auto const& err : errors) {
printf("[%s] %s\n", err.errType == ERR_FATAL ? "ERROR" : "WARN", err.errMsg.c_str());
isFatal |= (err.errType == ERR_FATAL);
}
if (isFatal) exit(1);
}
/*
Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef ENVVARS_HPP
#define ENVVARS_HPP
// Helper macro for catching HIP errors
#define HIP_CALL(cmd) \
do { \
hipError_t error = (cmd); \
if (error != hipSuccess) { \
std::cerr << "Encountered HIP error (" << hipGetErrorString(error) \
<< ") at line " << __LINE__ << " in file " << __FILE__ << "\n"; \
exit(-1); \
} \
} while (0)
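// Usage sketch (illustrative): wrap any HIP runtime call returning hipError_t, e.g.
//   HIP_CALL(hipSetDevice(0));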
#include <algorithm>
#include <iostream>
#include <numa.h>
#include <random>
#include <time.h>
#include "Client.hpp"
#include "TransferBench.hpp"
using namespace TransferBench;
// Redefinitions for CUDA compatibility
//==========================================================================================
#if defined(__NVCC__)
#define hipError_t cudaError_t
#define hipGetErrorString cudaGetErrorString
#define hipDeviceProp_t cudaDeviceProp
#define hipDeviceGetPCIBusId cudaDeviceGetPCIBusId
#define hipGetDeviceProperties cudaGetDeviceProperties
#define hipSuccess cudaSuccess
#define gcnArchName name
#define hipGetDeviceCount cudaGetDeviceCount
#endif
// This class manages environment variables that affect TransferBench
class EnvVars
{
public:
// Default configuration values
int const DEFAULT_SAMPLING_FACTOR = 1;
// Environment variables
// General options
int numIterations; // Number of timed iterations to perform. If negative, run for -numIterations seconds instead
int numSubIterations; // Number of subiterations to perform
int numWarmups; // Number of un-timed warmup iterations to perform
int showIterations; // Show per-iteration timing info
int useInteractive; // Pause for user-input before starting transfer loop
// Data options
int alwaysValidate; // Validate after each iteration instead of once after all iterations
int blockBytes; // Each subexecutor, except the last, gets a multiple of this many bytes to copy
int byteOffset; // Byte-offset for memory allocations
vector<float> fillPattern; // Pattern of floats used to fill source data
int validateDirect; // Validate GPU destination memory directly instead of staging GPU memory on host
int validateSource; // Validate source GPU memory immediately after preparation
// DMA options
int useHsaDma; // Use hsa_amd_async_copy instead of hipMemcpy for non-targeted DMA executions
// GFX options
int gfxBlockOrder; // How threadblocks for multiple Transfers are ordered 0=sequential 1=interleaved
int gfxBlockSize; // Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask; // Bit-vector representing the CU mask
vector<vector<int>> prefXccTable; // Specifies XCC to use for given exe->dst pair
int gfxTemporal; // Non-temporal load/store mode (0=none, 1=load, 2=store, 3=both)
int gfxUnroll; // GFX-kernel unroll factor
int useHipEvents; // Use HIP events for timing GFX/DMA Executor
int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer
int gfxSingleTeam; // Team all subExecutors across the data array
int gfxWaveOrder; // GFX-kernel wavefront ordering
int gfxWordSize; // GFX-kernel packed data size (4=DWORDx4, 2=DWORDx2, 1=DWORDx1)
// Client options
int hideEnv; // Skip printing environment variables
int minNumVarSubExec; // Minimum # of subexecutors to use for variable subExec Transfers
int maxNumVarSubExec; // Maximum # of subexecutors to use for variable subExec Transfers (0 to use device limit)
int outputToCsv; // Output in CSV format
int samplingFactor; // Affects how many different values of N are generated (when N set to 0)
// NIC options
int ibGidIndex; // GID Index for RoCE NICs
int roceVersion; // RoCE version number
int ipAddressFamily; // IP address family
uint8_t ibPort; // NIC port number to be used
int nicRelaxedOrder; // Use relaxed ordering for RDMA
std::string closestNicStr; // Holds the user-specified list of closest NICs
// Developer features
int gpuMaxHwQueues; // Tracks GPU_MAX_HW_QUEUES environment variable
// Constructor that collects values
EnvVars()
{
int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
int numDeviceCUs = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0});
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, 0));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
// Different hardware prefers different GPU kernel unroll factors
// The performance difference is generally only noticeable when executing with fewer CUs
int defaultGfxUnroll = 4;
if (archName == "gfx906") defaultGfxUnroll = 8;
else if (archName == "gfx90a") defaultGfxUnroll = 8;
else if (archName == "gfx940") defaultGfxUnroll = 6;
else if (archName == "gfx941") defaultGfxUnroll = 6;
else if (archName == "gfx942") defaultGfxUnroll = 4;
alwaysValidate = GetEnvVar("ALWAYS_VALIDATE" , 0);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
gfxBlockOrder = GetEnvVar("GFX_BLOCK_ORDER" , 0);
gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256);
gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 1);
gfxTemporal = GetEnvVar("GFX_TEMPORAL" , 0);
gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll);
gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER" , 0);
gfxWordSize = GetEnvVar("GFX_WORD_SIZE" , 4);
hideEnv = GetEnvVar("HIDE_ENV" , 0);
minNumVarSubExec = GetEnvVar("MIN_VAR_SUBEXEC" , 1);
maxNumVarSubExec = GetEnvVar("MAX_VAR_SUBEXEC" , 0);
numIterations = GetEnvVar("NUM_ITERATIONS" , 10);
numSubIterations = GetEnvVar("NUM_SUBITERATIONS" , 1);
numWarmups = GetEnvVar("NUM_WARMUPS" , 3);
outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0);
samplingFactor = GetEnvVar("SAMPLING_FACTOR" , 1);
showIterations = GetEnvVar("SHOW_ITERATIONS" , 0);
useHipEvents = GetEnvVar("USE_HIP_EVENTS" , 1);
useHsaDma = GetEnvVar("USE_HSA_DMA" , 0);
useInteractive = GetEnvVar("USE_INTERACTIVE" , 0);
useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 1);
validateDirect = GetEnvVar("VALIDATE_DIRECT" , 0);
validateSource = GetEnvVar("VALIDATE_SOURCE" , 0);
ibGidIndex = GetEnvVar("IB_GID_INDEX" ,-1);
ibPort = GetEnvVar("IB_PORT_NUMBER" , 1);
roceVersion = GetEnvVar("ROCE_VERSION" , 2);
ipAddressFamily = GetEnvVar("IP_ADDRESS_FAMILY" , 4);
nicRelaxedOrder = GetEnvVar("NIC_RELAX_ORDER" , 1);
closestNicStr = GetEnvVar("CLOSEST_NIC" , "");
gpuMaxHwQueues = GetEnvVar("GPU_MAX_HW_QUEUES" , 4);
// Check for fill pattern
char* pattern = getenv("FILL_PATTERN");
if (pattern != NULL) {
int patternLen = strlen(pattern);
if (patternLen % 2) {
printf("[ERROR] FILL_PATTERN must contain an even-number of hex digits\n");
exit(1);
}
// Read in bytes
std::vector<unsigned char> bytes;
unsigned char val = 0;
for (int i = 0; i < patternLen; i++) {
if ('0' <= pattern[i] && pattern[i] <= '9')
val += (pattern[i] - '0');
else if ('A' <= pattern[i] && pattern[i] <= 'F')
val += (pattern[i] - 'A' + 10);
else if ('a' <= pattern[i] && pattern[i] <= 'f')
val += (pattern[i] - 'a' + 10);
else {
printf("[ERROR] FILL_PATTERN must contain an even-number of hex digits (0-9'/a-f/A-F). (not %c)\n", pattern[i]);
exit(1);
}
if (i % 2 == 0)
val <<= 4;
else {
bytes.push_back(val);
val = 0;
}
}
// Reverse bytes (input is assumed to be given in big-endian)
std::reverse(bytes.begin(), bytes.end());
// Figure out how many copies of the pattern are necessary to fill a 4-byte float properly
int copies;
switch (patternLen % 8) {
case 0: copies = 1; break;
case 4: copies = 2; break;
default: copies = 4; break;
}
// Fill floats
int numFloats = copies * patternLen / 8;
fillPattern.resize(numFloats);
unsigned char* rawData = (unsigned char*) fillPattern.data();
for (int i = 0; i < numFloats * 4; i++)
rawData[i] = bytes[i % bytes.size()];
}
else fillPattern.clear();
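// Worked example (illustrative): FILL_PATTERN=DEADBEEF parses to the bytes
// {0xDE, 0xAD, 0xBE, 0xEF}, which are reversed (big-endian input) to
// {0xEF, 0xBE, 0xAD, 0xDE}; 8 hex digits fill exactly one 4-byte float, so on a
// little-endian host fillPattern holds one float whose raw bits are 0xDEADBEEF.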
// Check for CU mask
int numXccs = TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, 0});
cuMask.clear();
char* cuMaskStr = getenv("CU_MASK");
if (cuMaskStr != NULL) {
#if defined(__NVCC__)
printf("[WARN] CU_MASK is not supported in CUDA\n");
#else
std::vector<std::pair<int, int>> ranges;
int maxCU = 0;
char* token = strtok(cuMaskStr, ",");
while (token) {
int start, end;
if (sscanf(token, "%d-%d", &start, &end) == 2) {
ranges.push_back(std::make_pair(std::min(start, end), std::max(start, end)));
maxCU = std::max(maxCU, std::max(start, end));
} else if (sscanf(token, "%d", &start) == 1) {
ranges.push_back(std::make_pair(start, start));
maxCU = std::max(maxCU, start);
} else {
printf("[ERROR] Unrecognized token [%s]\n", token);
exit(1);
}
token = strtok(NULL, ",");
}
cuMask.resize(2 * numXccs, 0);
for (auto range : ranges) {
for (int i = range.first; i <= range.second; i++) {
for (int x = 0; x < numXccs; x++) {
int targetBit = i * numXccs + x;
cuMask[targetBit/32] |= (1<<(targetBit%32));
}
}
}
#endif
}
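// Worked example (illustrative): CU_MASK=0-1 on a device with 2 XCCs selects
// CUs 0 and 1 on both XCCs, i.e. target bits 0..3, so cuMask[0] = 0xF and the
// remaining mask words stay 0.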
// Parse preferred XCC table (if provided)
char* prefXccStr = getenv("XCC_PREF_TABLE");
if (prefXccStr) {
prefXccTable.resize(numDetectedGpus);
for (int i = 0; i < numDetectedGpus; i++){
prefXccTable[i].resize(numDetectedGpus, -1);
}
char* token = strtok(prefXccStr, ",");
int tokenCount = 0;
while (token) {
int xccId;
if (sscanf(token, "%d", &xccId) == 1) {
int src = tokenCount / numDetectedGpus;
int dst = tokenCount % numDetectedGpus;
if (xccId < 0 || xccId >= numXccs) {
printf("[ERROR] XCC index (%d) out of bounds. Expect value less than %d\n", xccId, numXccs);
exit(1);
}
prefXccTable[src][dst] = xccId;
tokenCount++;
if (tokenCount == (numDetectedGpus * numDetectedGpus)) break;
} else {
printf("[ERROR] Unrecognized token [%s]\n", token);
exit(1);
}
token = strtok(NULL, ",");
}
}
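// Worked example (illustrative): with 2 detected GPUs, XCC_PREF_TABLE=0,1,1,0
// yields prefXccTable = {{0, 1}, {1, 0}}, where row = source GPU,
// column = destination GPU, and the value is the preferred XCC.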
}
static std::string ToStr(std::vector<int> const& values) {
std::string result = "";
bool isFirst = true;
for (int v : values) {
if (isFirst) isFirst = false;
else result += ",";
result += std::to_string(v);
}
return result;
}
// Display info on the env vars that can be used
static void DisplayUsage()
{
printf("Environment variables:\n");
printf("======================\n");
printf(" ALWAYS_VALIDATE - Validate after each iteration instead of once after all iterations\n");
printf(" BLOCK_BYTES - Controls granularity of how work is divided across subExecutors\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4\n");
#if NIC_EXEC_ENABLED
printf(" CLOSEST_NIC - Comma-separated list of per-GPU closest NIC (default=auto)\n");
#endif
printf(" CU_MASK - CU mask for streams. Can specify ranges e.g '5,10-12,14'\n");
printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n");
printf(" GFX_BLOCK_ORDER - How blocks for transfers are ordered. 0=sequential, 1=interleaved\n");
printf(" GFX_BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n");
printf(" GFX_TEMPORAL - Use of non-temporal loads or stores (0=none 1=loads 2=stores 3=both)\n");
printf(" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d\n", TransferBench::GetIntAttribute(ATR_GFX_MAX_UNROLL));
printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays\n");
printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n");
printf(" GFX_WORD_SIZE - GFX kernel packed data size (4=DWORDx4, 2=DWORDx2, 1=DWORDx1)\n");
printf(" HIDE_ENV - Hide environment variable value listing\n");
#if NIC_EXEC_ENABLED
printf(" IB_GID_INDEX - Required for RoCE NICs (default=-1/auto)\n");
printf(" IB_PORT_NUMBER - RDMA port count for RDMA NIC (default=1)\n");
printf(" IP_ADDRESS_FAMILY - IP address family (4=v4, 6=v6, default=v4)\n");
#endif
printf(" MIN_VAR_SUBEXEC - Minumum # of subexecutors to use for variable subExec Transfers\n");
printf(" MAX_VAR_SUBEXEC - Maximum # of subexecutors to use for variable subExec Transfers (0 for device limits)\n");
#if NIC_EXEC_ENABLED
printf(" NIC_RELAX_ORDER - Set to non-zero to use relaxed ordering");
#endif
printf(" NUM_ITERATIONS - # of timed iterations per test. If negative, run for this many seconds instead\n");
printf(" NUM_SUBITERATIONS - # of sub-iterations to run per iteration. Must be non-negative\n");
printf(" NUM_WARMUPS - # of untimed warmup iterations per test\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
#if NIC_EXEC_ENABLED
printf(" ROCE_VERSION - RoCE version (default=2)\n");
#endif
printf(" SAMPLING_FACTOR - Add this many samples (when possible) between powers of 2 when auto-generating data sizes\n");
printf(" SHOW_ITERATIONS - Show per-iteration timing info\n");
printf(" USE_HIP_EVENTS - Use HIP events for GFX executor timing\n");
printf(" USE_HSA_DMA - Use hsa_amd_async_copy instead of hipMemcpy for non-targeted DMA execution\n");
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer\n");
printf(" VALIDATE_DIRECT - Validate GPU destination memory directly instead of staging GPU memory on host\n");
printf(" VALIDATE_SOURCE - Validate GPU src memory immediately after preparation\n");
}
void Print(std::string const& name, int32_t const value, const char* format, ...) const
{
printf("%-20s%s%12d%s", name.c_str(), outputToCsv ? "," : " = ", value, outputToCsv ? "," : " : ");
va_list args;
va_start(args, format);
vprintf(format, args);
va_end(args);
printf("\n");
}
void Print(std::string const& name, std::string const& value, const char* format, ...) const
{
printf("%-20s%s%12s%s", name.c_str(), outputToCsv ? "," : " = ", value.c_str(), outputToCsv ? "," : " : ");
va_list args;
va_start(args, format);
vprintf(format, args);
va_end(args);
printf("\n");
}
// Display env var settings
void DisplayEnvVars() const
{
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
std::string nicSupport = "";
#if NIC_EXEC_ENABLED
nicSupport = " (with NIC support)";
#endif
if (!outputToCsv) {
printf("TransferBench v%s.%s%s\n", TransferBench::VERSION, CLIENT_VERSION, nicSupport.c_str());
printf("===============================================================\n");
if (!hideEnv) printf("[Common] (Suppress by setting HIDE_ENV=1)\n");
}
else if (!hideEnv)
printf("EnvVar,Value,Description,(TransferBench Client v%s Backend v%s)\n", CLIENT_VERSION, TransferBench::VERSION);
if (hideEnv) return;
Print("ALWAYS_VALIDATE", alwaysValidate,
"Validating after %s", (alwaysValidate ? "each iteration" : "all iterations"));
Print("BLOCK_BYTES", blockBytes,
"Each CU gets a mulitple of %d bytes to copy", blockBytes);
Print("BYTE_OFFSET", byteOffset,
"Using byte offset of %d", byteOffset);
#if NIC_EXEC_ENABLED
Print("CLOSEST_NIC", (closestNicStr == "" ? "auto" : "user-input"),
"Per-GPU closest NIC is set as %s", (closestNicStr == "" ? "auto" : closestNicStr.c_str()));
#endif
Print("CU_MASK", getenv("CU_MASK") ? 1 : 0,
"%s", (cuMask.size() ? GetCuMaskDesc().c_str() : "All"));
Print("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
"%s", (fillPattern.size() ? getenv("FILL_PATTERN") : TransferBench::GetStrAttribute(ATR_SRC_PREP_DESCRIPTION).c_str()));
Print("GFX_BLOCK_ORDER", gfxBlockOrder,
"Thread block ordering: %s", gfxBlockOrder == 0 ? "Sequential" : "Interleaved");
Print("GFX_BLOCK_SIZE", gfxBlockSize,
"Threadblock size of %d", gfxBlockSize);
Print("GFX_SINGLE_TEAM", gfxSingleTeam,
"%s", (gfxSingleTeam ? "Combining CUs to work across entire data array" :
"Each CUs operates on its own disjoint subarray"));
Print("GFX_TEMPORAL", gfxTemporal,
"%s", (gfxTemporal == 0 ? "Not using non-temporal loads/stores" :
gfxTemporal == 1 ? "Using non-temporal loads" :
gfxTemporal == 2 ? "Using non-temporal stores" :
"Using non-temporal loads and stores"));
Print("GFX_UNROLL", gfxUnroll,
"Using GFX unroll factor of %d", gfxUnroll);
Print("GFX_WAVE_ORDER", gfxWaveOrder,
"Using GFX wave ordering of %s", (gfxWaveOrder == 0 ? "Unroll,Wavefront,CU" :
gfxWaveOrder == 1 ? "Unroll,CU,Wavefront" :
gfxWaveOrder == 2 ? "Wavefront,Unroll,CU" :
gfxWaveOrder == 3 ? "Wavefront,CU,Unroll" :
gfxWaveOrder == 4 ? "CU,Unroll,Wavefront" :
"CU,Wavefront,Unroll"));
Print("GFX_WORD_SIZE", gfxWordSize,
"Using GFX word size of %d (DWORDx%d)", gfxWordSize, gfxWordSize);
#if NIC_EXEC_ENABLED
Print("IP_ADDRESS_FAMILY", ipAddressFamily,
"IP address family is set to IPv%d", ipAddressFamily);
Print("IB_GID_INDEX", ibGidIndex,
"RoCE GID index is set to %s", (ibGidIndex < 0 ? "auto" : std::to_string(ibGidIndex).c_str()));
Print("IB_PORT_NUMBER", ibPort,
"IB port number is set to %d", ibPort);
#endif
Print("MIN_VAR_SUBEXEC", minNumVarSubExec,
"Using at least %d subexecutor(s) for variable subExec tranfers", minNumVarSubExec);
Print("MAX_VAR_SUBEXEC", maxNumVarSubExec,
"Using up to %s subexecutors for variable subExec transfers",
maxNumVarSubExec ? std::to_string(maxNumVarSubExec).c_str() : "all available");
#if NIC_EXEC_ENABLED
Print("NIC_RELAX_ORDER", nicRelaxedOrder,
"Using %s ordering for NIC RDMA", nicRelaxedOrder ? "relaxed" : "strict");
#endif
Print("NUM_ITERATIONS", numIterations,
(numIterations == 0) ? "Running infinitely" :
"Running %d %s", abs(numIterations), (numIterations > 0 ? " timed iteration(s)" : "seconds(s) per Test"));
Print("NUM_SUBITERATIONS", numSubIterations,
"Running %s subiterations", (numSubIterations == 0 ? "infinite" : std::to_string(numSubIterations)).c_str());
Print("NUM_WARMUPS", numWarmups,
"Running %d warmup iteration(s) per Test", numWarmups);
#if NIC_EXEC_ENABLED
Print("ROCE_VERSION", roceVersion,
"RoCE version is set to %d", roceVersion);
#endif
Print("SHOW_ITERATIONS", showIterations,
"%s per-iteration timing", showIterations ? "Showing" : "Hiding");
Print("USE_HIP_EVENTS", useHipEvents,
"Using %s for GFX/DMA Executor timing", useHipEvents ? "HIP events" : "CPU wall time");
Print("USE_HSA_DMA", useHsaDma,
"Using %s for DMA execution", useHsaDma ? "hsa_amd_async_copy" : "hipMemcpyAsync");
Print("USE_INTERACTIVE", useInteractive,
"Running in %s mode", useInteractive ? "interactive" : "non-interactive");
Print("USE_SINGLE_STREAM", useSingleStream,
"Using single stream per GFX %s", useSingleStream ? "device" : "Transfer");
if (getenv("XCC_PREF_TABLE")) {
printf("%36s: Preferred XCC Table (XCC_PREF_TABLE)\n", "");
printf("%36s: ", "");
for (int i = 0; i < numGpuDevices; i++) printf(" %3d", i);
printf(" (#XCCs)\n");
for (int i = 0; i < numGpuDevices; i++) {
printf("%36s: GPU %3d ", "", i);
for (int j = 0; j < numGpuDevices; j++)
printf(" %3d", prefXccTable[i][j]);
printf(" %3d\n", TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, i}));
}
}
Print("VALIDATE_DIRECT", validateDirect,
"Validate GPU destination memory %s", validateDirect ? "directly" : "via CPU staging buffer");
Print("VALIDATE_SOURCE", validateSource,
validateSource ? "Validate source after preparation" : "Do not perform source validation after prep");
printf("\n");
};
// Helper function that parses an environment variable or falls back to a default value
static int GetEnvVar(std::string const& varname, int defaultValue)
{
if (getenv(varname.c_str()))
return atoi(getenv(varname.c_str()));
return defaultValue;
}
static std::vector<int> GetEnvVarArray(std::string const& varname, std::vector<int> const& defaultValue)
{
if (getenv(varname.c_str())) {
char* rangeStr = getenv(varname.c_str());
std::set<int> values;
char* token = strtok(rangeStr, ",");
while (token) {
int start, end;
if (sscanf(token, "%d-%d", &start, &end) == 2) {
for (int i = start; i <= end; i++) values.insert(i);
} else if (sscanf(token, "%d", &start) == 1) {
values.insert(start);
} else {
printf("[ERROR] Unrecognized token [%s]\n", token);
exit(1);
}
token = strtok(NULL, ",");
}
std::vector<int> result;
for (auto v : values) result.push_back(v);
return result;
}
return defaultValue;
}
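// Worked example (illustrative): a value of "0-2,5" parses to {0, 1, 2, 5};
// duplicates collapse and values return in ascending order (std::set).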
static std::string GetEnvVar(std::string const& varname, std::string const& defaultValue)
{
if (getenv(varname.c_str()))
return getenv(varname.c_str());
return defaultValue;
}
std::string GetCuMaskDesc() const
{
std::vector<std::pair<int, int>> runs;
int numXccs = TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, 0});
bool inRun = false;
std::pair<int, int> curr;
int used = 0;
for (int targetBit = 0; targetBit < cuMask.size() * 32; targetBit += numXccs) {
if (cuMask[targetBit/32] & (1 << (targetBit%32))) {
used++;
if (!inRun) {
inRun = true;
curr.first = targetBit / numXccs;
}
} else {
if (inRun) {
inRun = false;
curr.second = targetBit / numXccs - 1;
runs.push_back(curr);
}
}
}
if (inRun) {
curr.second = (cuMask.size() * 32) / numXccs - 1;
runs.push_back(curr); // Don't drop a trailing run that extends to the final bit
}
std::string result = "CUs used: (" + std::to_string(used) + ") ";
for (int i = 0; i < runs.size(); i++)
{
if (i) result += ",";
if (runs[i].first == runs[i].second) result += std::to_string(runs[i].first);
else result += std::to_string(runs[i].first) + "-" + std::to_string(runs[i].second);
}
return result;
}
TransferBench::ConfigOptions ToConfigOptions()
{
TransferBench::ConfigOptions cfg;
cfg.general.numIterations = numIterations;
cfg.general.numSubIterations = numSubIterations;
cfg.general.numWarmups = numWarmups;
cfg.general.recordPerIteration = showIterations;
cfg.general.useInteractive = useInteractive;
cfg.data.alwaysValidate = alwaysValidate;
cfg.data.blockBytes = blockBytes;
cfg.data.byteOffset = byteOffset;
cfg.data.validateDirect = validateDirect;
cfg.data.validateSource = validateSource;
cfg.data.fillPattern = fillPattern;
cfg.dma.useHipEvents = useHipEvents;
cfg.dma.useHsaCopy = useHsaDma;
cfg.gfx.blockOrder = gfxBlockOrder;
cfg.gfx.blockSize = gfxBlockSize;
cfg.gfx.cuMask = cuMask;
cfg.gfx.prefXccTable = prefXccTable;
cfg.gfx.unrollFactor = gfxUnroll;
cfg.gfx.temporalMode = gfxTemporal;
cfg.gfx.useHipEvents = useHipEvents;
cfg.gfx.useMultiStream = !useSingleStream;
cfg.gfx.useSingleTeam = gfxSingleTeam;
cfg.gfx.waveOrder = gfxWaveOrder;
cfg.gfx.wordSize = gfxWordSize;
cfg.nic.ibGidIndex = ibGidIndex;
cfg.nic.ibPort = ibPort;
cfg.nic.ipAddressFamily = ipAddressFamily;
cfg.nic.useRelaxedOrder = nicRelaxedOrder;
cfg.nic.roceVersion = roceVersion;
std::vector<int> closestNics;
if (closestNicStr != "") {
std::stringstream ss(closestNicStr);
std::string item;
while (std::getline(ss, item, ',')) {
try {
int nic = std::stoi(item);
closestNics.push_back(nic);
} catch (const std::exception&) { // std::invalid_argument or std::out_of_range
printf("[ERROR] Invalid NIC index (%s) specified by user in %s\n", item.c_str(), closestNicStr.c_str());
exit(1);
}
}
cfg.nic.closestNics = closestNics;
}
return cfg;
}
};
#endif
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "EnvVars.hpp"
void AllToAllPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
enum
{
A2A_COPY = 0,
A2A_READ_ONLY = 1,
A2A_WRITE_ONLY = 2,
A2A_CUSTOM = 3,
};
char a2aModeStr[4][20] = {"Copy", "Read-Only", "Write-Only", "Custom"};
// Force single-stream mode for all-to-all benchmark
ev.useSingleStream = 1;
// Force to gfx unroll 2 unless explicitly set
ev.gfxUnroll = EnvVars::GetEnvVar("GFX_UNROLL", 2);
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
// Collect env vars for this preset
int a2aDirect = EnvVars::GetEnvVar("A2A_DIRECT" , 1);
int a2aLocal = EnvVars::GetEnvVar("A2A_LOCAL" , 0);
int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 0);
int numSubExecs = EnvVars::GetEnvVar("NUM_SUB_EXEC" , 8);
int useDmaExec = EnvVars::GetEnvVar("USE_DMA_EXEC" , 0);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);
int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
// A2A_MODE may be 0,1,2 or else custom numSrcs:numDsts
int numSrcs, numDsts;
int a2aMode = 0;
if (getenv("A2A_MODE") && sscanf(getenv("A2A_MODE"), "%d:%d", &numSrcs, &numDsts) == 2) {
a2aMode = A2A_CUSTOM;
} else {
a2aMode = EnvVars::GetEnvVar("A2A_MODE", 0);
if (a2aMode < 0 || a2aMode > 2) {
printf("[ERROR] a2aMode must be between 0 and 2, or else numSrcs:numDsts\n");
exit(1);
}
numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1);
numDsts = (a2aMode == A2A_READ_ONLY ? 0 : 1);
}
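// Worked example (illustrative): A2A_MODE=2:1 selects the custom mode with
// numSrcs = 2 read(s) and numDsts = 1 write(s) per Transfer.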
// Print off environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
if (!ev.outputToCsv) printf("[AllToAll Related]\n");
ev.Print("A2A_DIRECT" , a2aDirect , a2aDirect ? "Only using direct links" : "Full all-to-all");
ev.Print("A2A_LOCAL" , a2aLocal , "%s local transfers", a2aLocal ? "Include" : "Exclude");
ev.Print("A2A_MODE" , (a2aMode == A2A_CUSTOM) ? std::to_string(numSrcs) + ":" + std::to_string(numDsts) : std::to_string(a2aMode),
(a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " +
std::to_string(numDsts) + " write(s)").c_str(): a2aModeStr[a2aMode]);
ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus);
ev.Print("NUM_QUEUE_PAIRS", numQueuePairs, "Using %d queue pairs for NIC transfers", numQueuePairs);
ev.Print("NUM_SUB_EXEC" , numSubExecs , "Using %d subexecutors/CUs per Transfer", numSubExecs);
ev.Print("USE_DMA_EXEC" , useDmaExec , "Using %s executor", useDmaExec ? "DMA" : "GFX");
ev.Print("USE_FINE_GRAIN" , useFineGrain , "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC");
printf("\n");
}
// Validate env vars
if (numGpus < 0 || numGpus > numDetectedGpus) {
printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
exit(1);
}
if (useDmaExec && (numSrcs != 1 || numDsts != 1)) {
printf("[ERROR] DMA execution can only be used for copies (A2A_MODE=0)\n");
exit(1);
}
// Collect the number of GPU devices to use
MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
ExeType exeType = useDmaExec ? EXE_GPU_DMA : EXE_GPU_GFX;
std::map<std::pair<int, int>, int> reIndex;
std::vector<Transfer> transfers;
for (int i = 0; i < numGpus; i++) {
for (int j = 0; j < numGpus; j++) {
// Check whether or not to execute this pair
if (i == j) {
if (!a2aLocal) continue;
} else if (a2aDirect) {
#if !defined(__NVCC__)
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
if (hopCount != 1) continue;
#endif
}
// Build Transfer and add it to list
TransferBench::Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
for (int x = 0; x < numSrcs; x++) transfer.srcs.push_back({memType, i});
// When using multiple destinations, the additional destinations are "local"
if (numDsts) transfer.dsts.push_back({memType, j});
for (int x = 1; x < numDsts; x++) transfer.dsts.push_back({memType, i});
transfer.exeDevice = {exeType, (useRemoteRead ? j : i)};
transfer.exeSubIndex = -1;
transfer.numSubExecs = numSubExecs;
reIndex[std::make_pair(i,j)] = transfers.size();
transfers.push_back(transfer);
}
}
// Create a ring using NICs
std::vector<int> nicTransferIdx(numGpus);
if (numQueuePairs > 0) {
int numNics = TransferBench::GetNumExecutors(EXE_NIC);
for (int i = 0; i < numGpus; i++) {
TransferBench::Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
transfer.srcs.push_back({memType, i});
transfer.dsts.push_back({memType, (i+1) % numGpus});
transfer.exeDevice = {TransferBench::EXE_NIC_NEAREST, i};
transfer.exeSubIndex = (i+1) % numGpus;
transfer.numSubExecs = numQueuePairs;
nicTransferIdx[i] = transfers.size();
transfers.push_back(transfer);
}
}
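// Worked example (illustrative): with numGpus = 4 and NUM_QUEUE_PAIRS > 0, the
// ring above adds the NIC Transfers G0->G1, G1->G2, G2->G3, and G3->G0, each
// executed by the NIC nearest its source GPU.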
printf("GPU-GFX All-To-All benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes between %s pairs of GPUs using %d CUs (%lu Transfers)\n",
numBytesPerTransfer, a2aDirect ? "directly connected" : "all", numSubExecs, transfers.size());
if (transfers.size() == 0) return;
// Execute Transfers
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
TransferBench::TestResults results;
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
for (auto const& err : results.errResults)
printf("%s\n", err.errMsg.c_str());
exit(1);
} else {
PrintResults(ev, 1, transfers, results);
}
// Print results
char separator = (ev.outputToCsv ? ',' : ' ');
printf("\nSummary: [%lu bytes per Transfer] [%s:%d] [%d Read(s) %d Write(s)]\n",
numBytesPerTransfer, useDmaExec ? "DMA" : "GFX", numSubExecs, numSrcs, numDsts);
printf("===========================================================================\n");
printf("SRC\\DST ");
for (int dst = 0; dst < numGpus; dst++)
printf("%cGPU %02d ", separator, dst);
if (numQueuePairs > 0)
printf("%cNIC(%02d QP)", separator, numQueuePairs);
printf(" %cSTotal %cActual\n", separator, separator);
double totalBandwidthGpu = 0.0;
double minActualBandwidth = std::numeric_limits<double>::max();
double maxActualBandwidth = 0.0;
std::vector<double> colTotalBandwidth(numGpus+2, 0.0);
for (int src = 0; src < numGpus; src++) {
double rowTotalBandwidth = 0;
int transferCount = 0;
double minBandwidth = std::numeric_limits<double>::max();
printf("GPU %02d", src);
for (int dst = 0; dst < numGpus; dst++) {
if (reIndex.count(std::make_pair(src, dst))) {
int const transferIdx = reIndex[std::make_pair(src,dst)];
TransferBench::TransferResult const& r = results.tfrResults[transferIdx];
colTotalBandwidth[dst] += r.avgBandwidthGbPerSec;
rowTotalBandwidth += r.avgBandwidthGbPerSec;
totalBandwidthGpu += r.avgBandwidthGbPerSec;
minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
transferCount++;
printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
} else {
printf("%c%8s ", separator, "N/A");
}
}
if (numQueuePairs > 0) {
TransferBench::TransferResult const& r = results.tfrResults[nicTransferIdx[src]];
colTotalBandwidth[numGpus] += r.avgBandwidthGbPerSec;
rowTotalBandwidth += r.avgBandwidthGbPerSec;
totalBandwidthGpu += r.avgBandwidthGbPerSec;
minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
transferCount++;
printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
}
double actualBandwidth = minBandwidth * transferCount;
printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator, actualBandwidth);
minActualBandwidth = std::min(minActualBandwidth, actualBandwidth);
maxActualBandwidth = std::max(maxActualBandwidth, actualBandwidth);
colTotalBandwidth[numGpus+1] += rowTotalBandwidth;
}
printf("\nRTotal");
for (int dst = 0; dst < numGpus; dst++) {
printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
}
if (numQueuePairs > 0) {
printf("%c%8.3f ", separator, colTotalBandwidth[numGpus]);
}
printf(" %c%8.3f %c%8.3f %c%8.3f\n", separator, colTotalBandwidth[numGpus+1],
separator, minActualBandwidth, separator, maxActualBandwidth);
printf("\n");
printf("Average bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu / transfers.size());
printf("Aggregate bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu);
printf("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
PrintErrors(results.errResults);
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "EnvVars.hpp"
void AllToAllRdmaPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
// Collect env vars for this preset
int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 1);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);
// Print off environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
if (!ev.outputToCsv) printf("[AllToAll Network Related]\n");
ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus);
ev.Print("NUM_QUEUE_PAIRS", numQueuePairs, "Using %d queue pairs for NIC transfers", numQueuePairs);
ev.Print("USE_FINE_GRAIN" , useFineGrain , "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
printf("\n");
}
// Validate env vars
if (numGpus < 0 || numGpus > numDetectedGpus) {
printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
exit(1);
}
MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
std::map<std::pair<int, int>, int> reIndex;
std::vector<Transfer> transfers;
for (int i = 0; i < numGpus; i++) {
for (int j = 0; j < numGpus; j++) {
// Build Transfer and add it to list
TransferBench::Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
transfer.srcs.push_back({memType, i});
transfer.dsts.push_back({memType, j});
transfer.exeDevice = {EXE_NIC_NEAREST, i};
transfer.exeSubIndex = j;
transfer.numSubExecs = numQueuePairs;
reIndex[std::make_pair(i,j)] = transfers.size();
transfers.push_back(transfer);
}
}
printf("GPU-RDMA All-To-All benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes between all pairs of GPUs using %d QPs per Transfer (%lu Transfers)\n",
numBytesPerTransfer, numQueuePairs, transfers.size());
if (transfers.size() == 0) return;
// Execute Transfers
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
TransferBench::TestResults results;
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
for (auto const& err : results.errResults)
printf("%s\n", err.errMsg.c_str());
exit(1);
} else {
PrintResults(ev, 1, transfers, results);
}
// Print results
char separator = (ev.outputToCsv ? ',' : ' ');
printf("\nSummary: [%lu bytes per Transfer]\n", numBytesPerTransfer);
printf("==========================================================\n");
printf("SRC\\DST ");
for (int dst = 0; dst < numGpus; dst++)
printf("%cGPU %02d ", separator, dst);
printf(" %cSTotal %cActual\n", separator, separator);
double totalBandwidthGpu = 0.0;
double minActualBandwidth = std::numeric_limits<double>::max();
double maxActualBandwidth = 0.0;
std::vector<double> colTotalBandwidth(numGpus+2, 0.0);
for (int src = 0; src < numGpus; src++) {
double rowTotalBandwidth = 0;
int transferCount = 0;
double minBandwidth = std::numeric_limits<double>::max();
printf("GPU %02d", src);
for (int dst = 0; dst < numGpus; dst++) {
if (reIndex.count(std::make_pair(src, dst))) {
int const transferIdx = reIndex[std::make_pair(src,dst)];
TransferBench::TransferResult const& r = results.tfrResults[transferIdx];
colTotalBandwidth[dst] += r.avgBandwidthGbPerSec;
rowTotalBandwidth += r.avgBandwidthGbPerSec;
totalBandwidthGpu += r.avgBandwidthGbPerSec;
minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
transferCount++;
printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
} else {
printf("%c%8s ", separator, "N/A");
}
}
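    // Model the row as gated by its slowest link: treat every transfer in the
    // row as if it completed at the minimum observed per-pair rate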
double actualBandwidth = minBandwidth * transferCount;
printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator, actualBandwidth);
minActualBandwidth = std::min(minActualBandwidth, actualBandwidth);
maxActualBandwidth = std::max(maxActualBandwidth, actualBandwidth);
colTotalBandwidth[numGpus+1] += rowTotalBandwidth;
}
printf("\nRTotal");
for (int dst = 0; dst < numGpus; dst++) {
printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
}
printf(" %c%8.3f %c%8.3f %c%8.3f\n", separator, colTotalBandwidth[numGpus+1],
separator, minActualBandwidth, separator, maxActualBandwidth);
printf("\n");
printf("Average bandwidth (Tx Thread Timed): %8.3f GB/s\n", totalBandwidthGpu / transfers.size());
printf("Aggregate bandwidth (Tx Thread Timed): %8.3f GB/s\n", totalBandwidthGpu);
printf("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
PrintErrors(results.errResults);
}
/*
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "EnvVars.hpp"
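// AllToAllSweepPreset: sweeps GFX all-to-all bandwidth over combinations of
// CU count (NUM_CUS) and kernel unroll factor (UNROLLS), printing the slowest
// (and optionally fastest) executor bandwidth for each combination.
// Illustrative invocation (env var names come from this file; the
// binary/preset names here are assumptions, not confirmed):
//   A2A_MODE=2:1 NUM_CUS=8,16 UNROLLS=2,4 ./TransferBench a2a_sweep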
void AllToAllSweepPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
enum
{
A2A_COPY = 0,
A2A_READ_ONLY = 1,
A2A_WRITE_ONLY = 2,
A2A_CUSTOM = 3,
};
char a2aModeStr[4][20] = {"Copy", "Read-Only", "Write-Only", "Custom"};
// Force single-stream mode for all-to-all benchmark
ev.useSingleStream = 1;
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
// Collect env vars for this preset
int a2aDirect = EnvVars::GetEnvVar("A2A_DIRECT" , 1);
int a2aLocal = EnvVars::GetEnvVar("A2A_LOCAL" , 0);
int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int showMinOnly = EnvVars::GetEnvVar("SHOW_MIN_ONLY", 1);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);
int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
int useSpray = EnvVars::GetEnvVar("USE_SPRAY", 0);
int verbose = EnvVars::GetEnvVar("VERBOSE", 0);
std::vector<int> unrollList = EnvVars::GetEnvVarArray("UNROLLS", {1,2,3,4,6,8});
std::vector<int> numCusList = EnvVars::GetEnvVarArray("NUM_CUS", {4,8,12,16,24,32});
// A2A_MODE may be 0,1,2 or else custom numSrcs:numDsts
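  // e.g. A2A_MODE=2:1 requests two source reads and one destination write per Transfer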
int numSrcs, numDsts;
int a2aMode = 0;
if (getenv("A2A_MODE") && sscanf(getenv("A2A_MODE"), "%d:%d", &numSrcs, &numDsts) == 2) {
a2aMode = A2A_CUSTOM;
} else {
a2aMode = EnvVars::GetEnvVar("A2A_MODE", 0);
if (a2aMode < 0 || a2aMode > 2) {
printf("[ERROR] a2aMode must be between 0 and 2, or else numSrcs:numDsts\n");
exit(1);
}
numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1);
numDsts = (a2aMode == A2A_READ_ONLY ? 0 : 1);
}
// Print off environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
if (!ev.outputToCsv) printf("[AllToAll Related]\n");
ev.Print("A2A_DIRECT" , a2aDirect , a2aDirect ? "Only using direct links" : "Full all-to-all");
ev.Print("A2A_LOCAL" , a2aLocal , "%s local transfers", a2aLocal ? "Include" : "Exclude");
ev.Print("A2A_MODE" , (a2aMode == A2A_CUSTOM) ? std::to_string(numSrcs) + ":" + std::to_string(numDsts) : std::to_string(a2aMode),
(a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " +
std::to_string(numDsts) + " write(s)").c_str(): a2aModeStr[a2aMode]);
ev.Print("SHOW_MIN_ONLY" , showMinOnly , showMinOnly ? "Showing only slowest GPU results" : "Showing slowest and fastest GPU results");
ev.Print("NUM_CUS" , numCusList.size(), EnvVars::ToStr(numCusList).c_str());
ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus);
ev.Print("UNROLLS" , unrollList.size(), EnvVars::ToStr(unrollList).c_str());
ev.Print("USE_FINE_GRAIN" , useFineGrain , "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
ev.Print("USE_REMOTE_READ", useRemoteRead , "Using %s as executor", useRemoteRead ? "DST" : "SRC");
ev.Print("USE_SPRAY" , useSpray , "%s per CU", useSpray ? "All targets" : "One target");
ev.Print("VERBOSE" , verbose , verbose ? "Display test results" : "Display summary only");
printf("\n");
}
// Validate env vars
if (numGpus < 0 || numGpus > numDetectedGpus) {
printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
exit(1);
}
if (useSpray && numDsts > 1) {
printf("[ERROR] Cannot use USE_SPRAY with multiple destination buffers\n");
exit(1);
}
  // Prepare memory/executor types and build the Transfer list
MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
ExeType exeType = EXE_GPU_GFX;
std::vector<Transfer> transfers;
int targetCount = 0;
if (!useSpray) {
// Each CU will work on just one target
for (int i = 0; i < numGpus; i++) {
targetCount = 0;
for (int j = 0; j < numGpus; j++) {
// Check whether or not to execute this pair
if (i == j) {
if (!a2aLocal) continue;
} else if (a2aDirect) {
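        // Keep only directly connected (single-hop) peers; this HIP topology
        // query is unavailable when compiling with NVCC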
#if !defined(__NVCC__)
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
if (hopCount != 1) continue;
#endif
}
// Build Transfer and add it to list
TransferBench::Transfer transfer;
targetCount++;
transfer.numBytes = numBytesPerTransfer;
for (int x = 0; x < numSrcs; x++) transfer.srcs.push_back({memType, i});
// When using multiple destinations, the additional destinations are "local"
if (numDsts) transfer.dsts.push_back({memType, j});
for (int x = 1; x < numDsts; x++) transfer.dsts.push_back({memType, i});
transfer.exeDevice = {exeType, (useRemoteRead ? j : i)};
transfer.exeSubIndex = -1;
transfers.push_back(transfer);
}
}
} else {
// Each CU will work on all targets
for (int i = 0; i < numGpus; i++) {
TransferBench::Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
transfer.exeDevice = {exeType, i};
transfer.exeSubIndex = -1;
targetCount = 0;
for (int j = 0; j < numGpus; j++) {
// Check whether or not to transfer to this GPU
if (i == j) {
if (!a2aLocal) continue;
} else if (a2aDirect) {
#if !defined(__NVCC__)
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
if (hopCount != 1) continue;
#endif
}
targetCount++;
for (int x = 0; x < numSrcs; x++) transfer.srcs.push_back({memType, useRemoteRead ? j : i});
if (numDsts) transfer.dsts.push_back({memType, j});
for (int x = 1; x < numDsts; x++) transfer.dsts.push_back({memType, i});
}
transfers.push_back(transfer);
}
}
printf("GPU-GFX All-To-All Sweep benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes between %s pairs of GPUs\n", numBytesPerTransfer, a2aDirect ? "directly connected" : "all");
if (transfers.size() == 0) {
printf("[WARN} No transfers requested. Try adjusting A2A_DIRECT or A2A_LOCAL\n");
return;
}
// Execute Transfers
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
  // Run the CU-count x unroll sweep, collecting results and printing a summary table
  std::map<std::pair<int, int>, TransferBench::TestResults> results;
printf("#CUs\\Unroll");
for (int u : unrollList) {
printf(" %d(Min) ", u);
if (!showMinOnly) printf(" %d(Max) ", u);
}
printf("\n");
for (int c : numCusList) {
printf(" %5d ", c); fflush(stdout);
for (int u : unrollList) {
ev.gfxUnroll = cfg.gfx.unrollFactor = u;
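      // In spray mode each Transfer spans all targets, so allocate c CUs per
      // target (c * targetCount sub-executors in total)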
for (auto& transfer : transfers)
transfer.numSubExecs = useSpray ? (c * targetCount) : c;
double minBandwidth = std::numeric_limits<double>::max();
      double maxBandwidth = std::numeric_limits<double>::lowest();
TransferBench::TestResults result;
if (TransferBench::RunTransfers(cfg, transfers, result)) {
for (auto const& exeResult : result.exeResults) {
minBandwidth = std::min(minBandwidth, exeResult.second.avgBandwidthGbPerSec);
maxBandwidth = std::max(maxBandwidth, exeResult.second.avgBandwidthGbPerSec);
}
if (useSpray) {
minBandwidth *= targetCount;
maxBandwidth *= targetCount;
}
results[std::make_pair(c,u)] = result;
      } else {
        minBandwidth = 0.0;
        maxBandwidth = 0.0;
      }
printf(" %7.2f ", minBandwidth);
if (!showMinOnly) printf(" %7.2f ", maxBandwidth);
fflush(stdout);
}
printf("\n"); fflush(stdout);
}
if (verbose) {
int testNum = 0;
for (int c : numCusList) {
for (int u : unrollList) {
printf("CUs: %d Unroll %d\n", c, u);
PrintResults(ev, ++testNum, transfers, results[std::make_pair(c,u)]);
}
}
}
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
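// HealthCheckPreset: quick pass/fail bandwidth screen for 8-GPU MI300X nodes.
// Runs unidirectional CPU-read, unidirectional CPU-write, bidirectional, and
// all-to-all XGMI tests against pass limits (overridable via LIMIT_UDIR /
// LIMIT_BDIR / LIMIT_A2A) and exits nonzero on any failure. Illustrative
// invocation (binary/preset names are assumptions, not confirmed):
//   LIMIT_A2A=40 ./TransferBench healthcheck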
void HealthCheckPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
// Check for supported platforms
#if defined(__NVCC__)
printf("[WARN] healthcheck preset not supported on NVIDIA hardware\n");
return;
#endif
bool hasFail = false;
// Force use of single stream
ev.useSingleStream = 1;
TransferBench::TestResults results;
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
if (numGpuDevices != 8) {
printf("[WARN] healthcheck preset is currently only supported on 8-GPU MI300X hardware\n");
exit(1);
}
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, gpuId));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
if (!(archName == "gfx940" || archName == "gfx941" || archName == "gfx942"))
{
printf("[WARN] healthcheck preset is currently only supported on 8-GPU MI300X hardware\n");
exit(1);
}
}
// Pass limits
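  // Defaults appear to be 95% of nominal targets (48/96/45 GB/s), truncated
  // to whole GB/s by the (int) cast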
double udirLimit = getenv("LIMIT_UDIR") ? atof(getenv("LIMIT_UDIR")) : (int)(48 * 0.95);
double bdirLimit = getenv("LIMIT_BDIR") ? atof(getenv("LIMIT_BDIR")) : (int)(96 * 0.95);
double a2aLimit = getenv("LIMIT_A2A") ? atof(getenv("LIMIT_A2A")) : (int)(45 * 0.95);
  // Run unidirectional read from CPU to GPU
printf("Testing unidirectional reads from CPU ");
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
printf("[ERROR] Unable to detect closest CPU NUMA node to GPU %d\n", gpuId);
exit(1);
}
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeDevice = {EXE_GPU_GFX, gpuId};
t.numBytes = 64*1024*1024;
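      // Read-only transfer: source is the nearest CPU NUMA node; empty dsts
      // means the data is read but never written anywhere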
t.srcs = {{MEM_CPU, memIndex}};
t.dsts = {};
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t.numSubExecs = cu;
        if (TransferBench::RunTransfers(cfg, transfers, results)) {
          bestResult = std::max(bestResult, results.tfrResults[0].avgBandwidthGbPerSec);
          // Only consult per-transfer results after a successful run
          if (results.tfrResults[0].avgBandwidthGbPerSec >= udirLimit) {
            passed = true;
            break;
          }
        } else {
          PrintErrors(results.errResults);
        }
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
}
}
}
// Run unidirectional write from GPU to CPU
printf("Testing unidirectional writes to CPU ");
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
printf("[ERROR] Unable to detect closest CPU NUMA node to GPU %d\n", gpuId);
exit(1);
}
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeDevice = {EXE_GPU_GFX, gpuId};
t.numBytes = 64*1024*1024;
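      // Write-only transfer: empty srcs means nothing is read; data is written
      // to the nearest CPU NUMA node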
t.srcs = {};
t.dsts = {{MEM_CPU, memIndex}};
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t.numSubExecs = cu;
        if (TransferBench::RunTransfers(cfg, transfers, results)) {
          bestResult = std::max(bestResult, results.tfrResults[0].avgBandwidthGbPerSec);
          // Only consult per-transfer results after a successful run
          if (results.tfrResults[0].avgBandwidthGbPerSec >= udirLimit) {
            passed = true;
            break;
          }
        } else {
          PrintErrors(results.errResults);
        }
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
}
}
}
// Run bidirectional tests
printf("Testing bidirectional reads + writes ");
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
printf("[ERROR] Unable to detect closest CPU NUMA node to GPU %d\n", gpuId);
exit(1);
}
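      // Pair one read-only and one write-only transfer on the same GPU to
      // exercise both directions concurrently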
std::vector<Transfer> transfers(2);
Transfer& t0 = transfers[0];
Transfer& t1 = transfers[1];
t0.exeDevice = {EXE_GPU_GFX, gpuId};
t0.numBytes = 64*1024*1024;
t0.srcs = {{MEM_CPU, memIndex}};
t0.dsts = {};
t1.exeDevice = {EXE_GPU_GFX, gpuId};
t1.numBytes = 64*1024*1024;
t1.srcs = {};
t1.dsts = {{MEM_CPU, memIndex}};
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t0.numSubExecs = cu;
t1.numSubExecs = cu;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
double sum = (results.tfrResults[0].avgBandwidthGbPerSec +
results.tfrResults[1].avgBandwidthGbPerSec);
bestResult = std::max(bestResult, sum);
if (sum >= bdirLimit) {
passed = true;
break;
}
} else {
PrintErrors(results.errResults);
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, bdirLimit);
}
}
}
// Run XGMI tests:
printf("Testing all-to-all XGMI copies "); fflush(stdout);
{
// Force GFX unroll to 2 for MI300
ev.gfxUnroll = 2;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<Transfer> transfers;
for (int i = 0; i < numGpuDevices; i++) {
for (int j = 0; j < numGpuDevices; j++) {
if (i == j) continue;
Transfer t;
t.numBytes = 64*1024*1024;
t.numSubExecs = 8;
t.exeDevice = {EXE_GPU_GFX, i};
t.srcs = {{MEM_GPU_FINE, i}};
t.dsts = {{MEM_GPU_FINE, j}};
transfers.push_back(t);
}
}
std::vector<std::pair<std::pair<int,int>, double>> fails;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
int transferIdx = 0;
for (int i = 0; i < numGpuDevices; i++) {
printf("."); fflush(stdout);
for (int j = 0; j < numGpuDevices; j++) {
if (i == j) continue;
double bw = results.tfrResults[transferIdx].avgBandwidthGbPerSec;
if (bw < a2aLimit) {
fails.push_back(std::make_pair(std::make_pair(i,j), bw));
}
transferIdx++;
}
}
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d to GPU %02d: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first.first, p.first.second, p.second, a2aLimit);
}
}
}
exit(hasFail ? 1 : 0);
}