Unverified Commit cd80b3a3 authored by gilbertlee-amd, committed by GitHub

TransferBench v1.61 (#174)


Co-authored-by: Mustafa Abduljabbar <mustafa.abduljabbar@amd.com>
parent 856e3445
......@@ -3,6 +3,22 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.61.00
### Added
- Added a2a_n preset, which performs all-to-all GPU-to-GPU transfers over nearest-NIC executors
- Re-implemented GFX_BLOCK_ORDER, which controls how the threadblocks of multiple Transfers are ordered
  - 0 = sequential, 1 = interleaved, 2 = random (illustrated in the sketch after this changelog entry)
- Added a2asweep preset, which sweeps various CU/unroll options for GFX-executed all-to-all
- Rewrote the main GID index detection logic
- Show the GID index and description in the topology table, to help with debugging
- Added GFX_WORD_SIZE to select the packed-float word size used by the GFX kernel. Must be 4 (default), 2, or 1
### Fixed
- Avoid CMake and Makefile build errors when the infiniband/verbs.h header is not present, and disable the NIC executor in that case
- Select the GID entry from a priority list instead of hardcoding choices based on under-documented user input (such as RoCE version and IP address family)
- Use the link-local GID when it is the only choice (i.e., when no routing information is available beyond the local link)
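
The following minimal sketch (not part of TransferBench; transfer shapes are hypothetical) illustrates how the three GFX_BLOCK_ORDER modes arrange the threadblocks of multiple Transfers:

```cpp
// Sketch of GFX_BLOCK_ORDER: how (transfer, block) pairs are launch-ordered.
#include <algorithm>
#include <cstdio>
#include <random>
#include <utility>
#include <vector>

int main() {
  std::vector<int> numBlocks = {3, 2};    // hypothetical: Transfer 0 uses 3 blocks, Transfer 1 uses 2
  std::vector<std::pair<int,int>> order;  // (transfer, block) pairs in launch order

  // GFX_BLOCK_ORDER=0 (sequential): all blocks of Transfer 0, then all of Transfer 1
  for (int t = 0; t < (int)numBlocks.size(); t++)
    for (int b = 0; b < numBlocks[t]; b++)
      order.push_back({t, b});            // -> T0B0 T0B1 T0B2 T1B0 T1B1

  // GFX_BLOCK_ORDER=1 (interleaved): round-robin across Transfers
  order.clear();
  for (int b = 0; ; b++) {
    bool any = false;
    for (int t = 0; t < (int)numBlocks.size(); t++)
      if (b < numBlocks[t]) { order.push_back({t, b}); any = true; }
    if (!any) break;                      // -> T0B0 T1B0 T0B1 T1B1 T0B2
  }

  // GFX_BLOCK_ORDER=2 (random): shuffle the full (transfer, block) list
  std::shuffle(order.begin(), order.end(),
               std::default_random_engine(std::random_device{}()));

  for (auto& p : order) printf("T%dB%d ", p.first, p.second);
  printf("\n");
  return 0;
}
```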
## v1.60.00
### Modified
- Reverted GFX_SINGLE_TEAM default back to 1
......
......@@ -57,17 +57,22 @@ set( CMAKE_CXX_FLAGS "${flags_str} ${CMAKE_CXX_FLAGS}")
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib")
include_directories(${ROCM_PATH}/include)
find_library(IBVERBS_LIBRARY ibverbs)
if (IBVERBS_LIBRARY)
if (DEFINED ENV{DISABLE_NIC_EXEC})
message(STATUS "Disabling NIC Executor support")
else()
message(STATUS "Found ibverbs: ${IBVERBS_LIBRARY}. Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable")
add_definitions(-DNIC_EXEC_ENABLED)
link_libraries(ibverbs)
endif()
find_path(IBVERBS_INCLUDE_DIR infiniband/verbs.h)
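# NIC executor support requires both the ibverbs library and its headers; the
# DISABLE_NIC_EXEC environment variable takes precedence over detection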
if (DEFINED ENV{DISABLE_NIC_EXEC})
message(STATUS "Disabling NIC Executor support")
elseif(IBVERBS_LIBRARY AND IBVERBS_INCLUDE_DIR)
message(STATUS "Found ibverbs: ${IBVERBS_LIBRARY}. Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable")
add_definitions(-DNIC_EXEC_ENABLED)
link_libraries(ibverbs)
else()
message(WARNING "IBVerbs library not found. Building without NIC executor support")
if (NOT IBVERBS_LIBRARY)
message(WARNING "IBVerbs library not found")
elseif (NOT IBVERBS_INCLUDE_DIR)
message(WARNING "infiniband/verbs.h not found")
endif()
message(WARNING "Building without NIC executor support. To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed")
endif()
link_libraries(numa hsa-runtime64 pthread)
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY .)
add_executable(TransferBench src/client/Client.cpp)
......
#
# Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Configuration options
......@@ -12,8 +12,10 @@ NVCC=$(CUDA_PATH)/bin/nvcc
# Compile TransferBenchCuda if nvcc detected
ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
EXE=TransferBenchCuda
CXX=$(NVCC)
else
EXE=TransferBench
CXX=$(HIPCC)
endif
CXXFLAGS = -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
......@@ -21,14 +23,24 @@ NVFLAGS = -x cu -lnuma -arch=native
COMMON_FLAGS = -O3 -I./src/header -I./src/client -I./src/client/Presets
LDFLAGS += -lpthread
# Compile RDMA executor if IBVerbs is found in the Dynamic Linker cache
# Compile RDMA executor if
# 1) DISABLE_NIC_EXEC is not set to 1
# 2) IBVerbs is found in the Dynamic Linker cache
# 3) infiniband/verbs.h is found in the default include path
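# Note: the header probe below pipes an #include line through the compiler's
# preprocessor; if infiniband/verbs.h cannot be resolved, the grep finds no match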
NIC_ENABLED = 0
ifneq ($(DISABLE_NIC_EXEC),1)
ifneq ("$(shell ldconfig -p | grep -c ibverbs)", "0")
ifeq ("$(shell ldconfig -p | grep -c ibverbs)", "0")
$(info lib IBVerbs not found)
else ifeq ("$(shell echo '#include <infiniband/verbs.h>' | $(CXX) -E - 2>/dev/null | grep -c 'infiniband/verbs.h')", "0")
$(info infiniband/verbs.h not found)
else
LDFLAGS += -libverbs -DNIC_EXEC_ENABLED
NVFLAGS += -libverbs -DNIC_EXEC_ENABLED
NIC_ENABLED = 1
endif
ifeq ($(NIC_ENABLED), 0)
$(info To use the TransferBench RDMA executor, check that your system has NICs, that the NIC drivers are installed, and that libibverbs-dev is installed)
endif
endif
all: $(EXE)
......
/*
Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......@@ -84,6 +84,7 @@ public:
int useHsaDma; // Use hsa_amd_async_copy instead of hipMemcpy for non-targeted DMA executions
// GFX options
int gfxBlockOrder; // How threadblocks for multiple Transfers are ordered 0=sequential 1=interleaved 2=random
int gfxBlockSize; // Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask; // Bit-vector representing the CU mask
vector<vector<int>> prefXccTable; // Specifies XCC to use for given exe->dst pair
......@@ -92,6 +93,7 @@ public:
int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer
int gfxSingleTeam; // Team all subExecutors across the data array
int gfxWaveOrder; // GFX-kernel wavefront ordering
int gfxWordSize; // GFX-kernel packed data size (4=DWORDx4, 2=DWORDx2, 1=DWORDx1)
// Client options
int hideEnv; // Skip printing the environment variable listing
......@@ -135,10 +137,12 @@ public:
alwaysValidate = GetEnvVar("ALWAYS_VALIDATE" , 0);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
gfxBlockOrder = GetEnvVar("GFX_BLOCK_ORDER" , 0);
gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256);
gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 1);
gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll);
gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER" , 0);
gfxWordSize = GetEnvVar("GFX_WORD_SIZE" , 4);
hideEnv = GetEnvVar("HIDE_ENV" , 0);
minNumVarSubExec = GetEnvVar("MIN_VAR_SUBEXEC" , 1);
maxNumVarSubExec = GetEnvVar("MAX_VAR_SUBEXEC" , 0);
......@@ -286,13 +290,23 @@ public:
}
}
static std::string ToStr(std::vector<int> const& values) {
std::string result = "";
bool isFirst = true;
for (int v : values) {
if (isFirst) isFirst = false;
else result += ",";
result += std::to_string(v);
}
return result;
}
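// e.g., ToStr({1, 2, 3}) returns "1,2,3"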
// Display info on the env vars that can be used
static void DisplayUsage()
{
printf("Environment variables:\n");
printf("======================\n");
printf(" ALWAYS_VALIDATE - Validate after each iteration instead of once after all iterations\n");
printf(" BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n");
printf(" BLOCK_BYTES - Controls granularity of how work is divided across subExecutors\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4\n");
#if NIC_EXEC_ENABLED
......@@ -300,9 +314,12 @@ public:
#endif
printf(" CU_MASK - CU mask for streams. Can specify ranges e.g '5,10-12,14'\n");
printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n");
printf(" GFX_BLOCK_ORDER - How blocks for transfers are ordered. 0=sequential, 1=interleaved\n");
printf(" GFX_BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n");
printf(" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d\n", TransferBench::GetIntAttribute(ATR_GFX_MAX_UNROLL));
printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays\n");
printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n");
printf(" GFX_WORD_SIZE - GFX kernel packed data size (4=DWORDx4, 2=DWORDx2, 1=DWORDx1)\n");
printf(" HIDE_ENV - Hide environment variable value listing\n");
#if NIC_EXEC_ENABLED
printf(" IB_GID_INDEX - Required for RoCE NICs (default=-1/auto)\n");
......@@ -383,6 +400,8 @@ public:
"%s", (cuMask.size() ? GetCuMaskDesc().c_str() : "All"));
Print("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
"%s", (fillPattern.size() ? getenv("FILL_PATTERN") : TransferBench::GetStrAttribute(ATR_SRC_PREP_DESCRIPTION).c_str()));
Print("GFX_BLOCK_ORDER", gfxBlockOrder,
"Thread block ordering: %s", gfxBlockOrder == 0 ? "Sequential" : "Interleaved");
Print("GFX_BLOCK_SIZE", gfxBlockSize,
"Threadblock size of %d", gfxBlockSize);
Print("GFX_SINGLE_TEAM", gfxSingleTeam,
......@@ -397,6 +416,9 @@ public:
gfxWaveOrder == 3 ? "Wavefront,CU,Unroll" :
gfxWaveOrder == 4 ? "CU,Unroll,Wavefront" :
"CU,Wavefront,Unroll"));
Print("GFX_WORD_SIZE", gfxWordSize,
"Using GFX word size of %d (DWORDx%d)", gfxWordSize, gfxWordSize);
#if NIC_EXEC_ENABLED
Print("IP_ADDRESS_FAMILY", ipAddressFamily,
"IP address family is set to IPv%d", ipAddressFamily);
......@@ -462,6 +484,31 @@ public:
return defaultValue;
}
static std::vector<int> GetEnvVarArray(std::string const& varname, std::vector<int> const& defaultValue)
{
if (getenv(varname.c_str())) {
char* rangeStr = getenv(varname.c_str());
std::set<int> values;
char* token = strtok(rangeStr, ",");
while (token) {
int start, end;
if (sscanf(token, "%d-%d", &start, &end) == 2) {
for (int i = start; i <= end; i++) values.insert(i);
} else if (sscanf(token, "%d", &start) == 1) {
values.insert(start);
} else {
printf("[ERROR] Unrecognized token [%s]\n", token);
exit(1);
}
token = strtok(NULL, ",");
}
std::vector<int> result;
for (auto v : values) result.push_back(v);
return result;
}
return defaultValue;
}
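// Example (hypothetical values): UNROLLS="1-4,8" parses to {1,2,3,4,8};
// the std::set de-duplicates entries and returns them in sorted order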
static std::string GetEnvVar(std::string const& varname, std::string const& defaultValue)
{
if (getenv(varname.c_str()))
......@@ -524,6 +571,7 @@ public:
cfg.dma.useHipEvents = useHipEvents;
cfg.dma.useHsaCopy = useHsaDma;
cfg.gfx.blockOrder = gfxBlockOrder;
cfg.gfx.blockSize = gfxBlockSize;
cfg.gfx.cuMask = cuMask;
cfg.gfx.prefXccTable = prefXccTable;
......@@ -532,12 +580,13 @@ public:
cfg.gfx.useMultiStream = !useSingleStream;
cfg.gfx.useSingleTeam = gfxSingleTeam;
cfg.gfx.waveOrder = gfxWaveOrder;
cfg.gfx.wordSize = gfxWordSize;
cfg.nic.ibGidIndex = ibGidIndex;
cfg.nic.ibPort = ibPort;
cfg.nic.ipAddressFamily = ipAddressFamily;
cfg.nic.useRelaxedOrder = nicRelaxedOrder;
cfg.nic.roceVersion = roceVersion;
cfg.nic.ibGidIndex = ibGidIndex;
cfg.nic.ibPort = ibPort;
cfg.nic.ipAddressFamily = ipAddressFamily;
cfg.nic.useRelaxedOrder = nicRelaxedOrder;
cfg.nic.roceVersion = roceVersion;
std::vector<int> closestNics;
if(closestNicStr != "") {
......
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "EnvVars.hpp"
void AllToAllRdmaPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
// Collect env vars for this preset
int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 1);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);
// Print out environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
if (!ev.outputToCsv) printf("[AllToAll Network Related]\n");
ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus);
ev.Print("NUM_QUEUE_PAIRS", numQueuePairs, "Using %d queue pairs for NIC transfers", numQueuePairs);
ev.Print("USE_FINE_GRAIN" , useFineGrain , "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
printf("\n");
}
// Validate env vars
if (numGpus < 0 || numGpus > numDetectedGpus) {
printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
exit(1);
}
MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
std::map<std::pair<int, int>, int> reIndex;
std::vector<Transfer> transfers;
for (int i = 0; i < numGpus; i++) {
for (int j = 0; j < numGpus; j++) {
// Build Transfer and add it to list
TransferBench::Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
transfer.srcs.push_back({memType, i});
transfer.dsts.push_back({memType, j});
transfer.exeDevice = {EXE_NIC_NEAREST, i};
transfer.exeSubIndex = j;
transfer.numSubExecs = numQueuePairs;
reIndex[std::make_pair(i,j)] = transfers.size();
transfers.push_back(transfer);
}
}
printf("GPU-RDMA All-To-All benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes between all pairs of GPUs using %d QPs per Transfer (%lu Transfers)\n",
numBytesPerTransfer, numQueuePairs, transfers.size());
if (transfers.size() == 0) return;
// Execute Transfers
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
TransferBench::TestResults results;
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
for (auto const& err : results.errResults)
printf("%s\n", err.errMsg.c_str());
exit(0);
} else {
PrintResults(ev, 1, transfers, results);
}
// Print results
char separator = (ev.outputToCsv ? ',' : ' ');
printf("\nSummary: [%lu bytes per Transfer]\n", numBytesPerTransfer);
printf("==========================================================\n");
printf("SRC\\DST ");
for (int dst = 0; dst < numGpus; dst++)
printf("%cGPU %02d ", separator, dst);
printf(" %cSTotal %cActual\n", separator, separator);
double totalBandwidthGpu = 0.0;
double minActualBandwidth = std::numeric_limits<double>::max();
double maxActualBandwidth = 0.0;
std::vector<double> colTotalBandwidth(numGpus+2, 0.0);
for (int src = 0; src < numGpus; src++) {
double rowTotalBandwidth = 0;
int transferCount = 0;
double minBandwidth = std::numeric_limits<double>::max();
printf("GPU %02d", src);
for (int dst = 0; dst < numGpus; dst++) {
if (reIndex.count(std::make_pair(src, dst))) {
int const transferIdx = reIndex[std::make_pair(src,dst)];
TransferBench::TransferResult const& r = results.tfrResults[transferIdx];
colTotalBandwidth[dst] += r.avgBandwidthGbPerSec;
rowTotalBandwidth += r.avgBandwidthGbPerSec;
totalBandwidthGpu += r.avgBandwidthGbPerSec;
minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
transferCount++;
printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
} else {
printf("%c%8s ", separator, "N/A");
}
}
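// "Actual" models the row as gated by its slowest Transfer: the minimum
// per-Transfer bandwidth scaled by the number of Transfers in the row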
double actualBandwidth = minBandwidth * transferCount;
printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator, actualBandwidth);
minActualBandwidth = std::min(minActualBandwidth, actualBandwidth);
maxActualBandwidth = std::max(maxActualBandwidth, actualBandwidth);
colTotalBandwidth[numGpus+1] += rowTotalBandwidth;
}
printf("\nRTotal");
for (int dst = 0; dst < numGpus; dst++) {
printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
}
printf(" %c%8.3f %c%8.3f %c%8.3f\n", separator, colTotalBandwidth[numGpus+1],
separator, minActualBandwidth, separator, maxActualBandwidth);
printf("\n");
printf("Average bandwidth (Tx Thread Timed): %8.3f GB/s\n", totalBandwidthGpu / transfers.size());
printf("Aggregate bandwidth (Tx Thread Timed): %8.3f GB/s\n", totalBandwidthGpu);
printf("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
PrintErrors(results.errResults);
}
/*
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "EnvVars.hpp"
void AllToAllSweepPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
enum
{
A2A_COPY = 0,
A2A_READ_ONLY = 1,
A2A_WRITE_ONLY = 2,
A2A_CUSTOM = 3,
};
char a2aModeStr[4][20] = {"Copy", "Read-Only", "Write-Only", "Custom"};
// Force single-stream mode for all-to-all benchmark
ev.useSingleStream = 1;
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
// Collect env vars for this preset
int a2aDirect = EnvVars::GetEnvVar("A2A_DIRECT" , 1);
int a2aLocal = EnvVars::GetEnvVar("A2A_LOCAL" , 0);
int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);
int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
int useSpray = EnvVars::GetEnvVar("USE_SPRAY", 0);
int verbose = EnvVars::GetEnvVar("VERBOSE", 0);
std::vector<int> unrollList = EnvVars::GetEnvVarArray("UNROLLS", {1,2,3,4,6,8});
std::vector<int> numCusList = EnvVars::GetEnvVarArray("NUM_CUS", {4,8,12,16,24,32});
// A2A_MODE may be 0,1,2 or else custom numSrcs:numDsts
int numSrcs, numDsts;
int a2aMode = 0;
if (getenv("A2A_MODE") && sscanf(getenv("A2A_MODE"), "%d:%d", &numSrcs, &numDsts) == 2) {
a2aMode = A2A_CUSTOM;
} else {
a2aMode = EnvVars::GetEnvVar("A2A_MODE", 0);
if (a2aMode < 0 || a2aMode > 2) {
printf("[ERROR] a2aMode must be between 0 and 2, or else numSrcs:numDsts\n");
exit(1);
}
numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1);
numDsts = (a2aMode == A2A_READ_ONLY ? 0 : 1);
}
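// e.g., A2A_MODE=1 selects read-only; A2A_MODE="2:1" (hypothetical) selects a
// custom mode with 2 source reads and 1 destination write per Transfer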
// Print out environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
if (!ev.outputToCsv) printf("[AllToAll Related]\n");
ev.Print("A2A_DIRECT" , a2aDirect , a2aDirect ? "Only using direct links" : "Full all-to-all");
ev.Print("A2A_LOCAL" , a2aLocal , "%s local transfers", a2aLocal ? "Include" : "Exclude");
ev.Print("A2A_MODE" , (a2aMode == A2A_CUSTOM) ? std::to_string(numSrcs) + ":" + std::to_string(numDsts) : std::to_string(a2aMode),
(a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " +
std::to_string(numDsts) + " write(s)").c_str(): a2aModeStr[a2aMode]);
ev.Print("NUM_CUS" , numCusList.size(), EnvVars::ToStr(numCusList).c_str());
ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus);
ev.Print("UNROLLS" , unrollList.size(), EnvVars::ToStr(unrollList).c_str());
ev.Print("USE_FINE_GRAIN" , useFineGrain , "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
ev.Print("USE_REMOTE_READ", useRemoteRead , "Using %s as executor", useRemoteRead ? "DST" : "SRC");
ev.Print("USE_SPRAY" , useSpray , "%s per CU", useSpray ? "All targets" : "One target");
ev.Print("VERBOSE" , verbose , verbose ? "Display test results" : "Display summary only");
printf("\n");
}
// Validate env vars
if (numGpus < 0 || numGpus > numDetectedGpus) {
printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
exit(1);
}
if (useSpray && numDsts > 1) {
printf("[ERROR] Cannot use USE_SPRAY with multiple destination buffers\n");
exit(1);
}
// Collect the number of GPU devices to use
MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
ExeType exeType = EXE_GPU_GFX;
std::vector<Transfer> transfers;
int targetCount = 0;
if (!useSpray) {
// Each CU will work on just one target
for (int i = 0; i < numGpus; i++) {
targetCount = 0;
for (int j = 0; j < numGpus; j++) {
// Check whether or not to execute this pair
if (i == j) {
if (!a2aLocal) continue;
} else if (a2aDirect) {
#if !defined(__NVCC__)
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
if (hopCount != 1) continue;
#endif
}
// Build Transfer and add it to list
TransferBench::Transfer transfer;
targetCount++;
transfer.numBytes = numBytesPerTransfer;
for (int x = 0; x < numSrcs; x++) transfer.srcs.push_back({memType, i});
// When using multiple destinations, the additional destinations are "local"
if (numDsts) transfer.dsts.push_back({memType, j});
for (int x = 1; x < numDsts; x++) transfer.dsts.push_back({memType, i});
transfer.exeDevice = {exeType, (useRemoteRead ? j : i)};
transfer.exeSubIndex = -1;
transfers.push_back(transfer);
}
}
} else {
// Each CU will work on all targets
for (int i = 0; i < numGpus; i++) {
TransferBench::Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
transfer.exeDevice = {exeType, i};
transfer.exeSubIndex = -1;
targetCount = 0;
for (int j = 0; j < numGpus; j++) {
// Check whether or not to transfer to this GPU
if (i == j) {
if (!a2aLocal) continue;
} else if (a2aDirect) {
#if !defined(__NVCC__)
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
if (hopCount != 1) continue;
#endif
}
targetCount++;
for (int x = 0; x < numSrcs; x++) transfer.srcs.push_back({memType, useRemoteRead ? j : i});
if (numDsts) transfer.dsts.push_back({memType, j});
for (int x = 1; x < numDsts; x++) transfer.dsts.push_back({memType, i});
}
transfers.push_back(transfer);
}
}
printf("GPU-GFX All-To-All Sweep benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes between %s pairs of GPUs\n", numBytesPerTransfer, a2aDirect ? "directly connected" : "all");
if (transfers.size() == 0) {
printf("[WARN} No transfers requested. Try adjusting A2A_DIRECT or A2A_LOCAL\n");
return;
}
// Execute Transfers
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
// Run tests
std::map<std::pair<int, int>, TransferBench::TestResults> results;
// Display summary
printf("#CUs\\Unroll");
for (int u : unrollList) {
printf(" %d(Min) ", u);
printf(" %d(Max) ", u);
}
printf("\n");
for (int c : numCusList) {
printf(" %5d ", c); fflush(stdout);
for (int u : unrollList) {
ev.gfxUnroll = cfg.gfx.unrollFactor = u;
for (auto& transfer : transfers)
transfer.numSubExecs = useSpray ? (c * targetCount) : c;
double minBandwidth = std::numeric_limits<double>::max();
double maxBandwidth = std::numeric_limits<double>::lowest();
TransferBench::TestResults result;
if (TransferBench::RunTransfers(cfg, transfers, result)) {
for (auto const& exeResult : result.exeResults) {
minBandwidth = std::min(minBandwidth, exeResult.second.avgBandwidthGbPerSec);
maxBandwidth = std::max(maxBandwidth, exeResult.second.avgBandwidthGbPerSec);
}
if (useSpray) {
minBandwidth *= targetCount;
maxBandwidth *= targetCount;
}
results[std::make_pair(c,u)] = result;
} else {
minBandwidth = 0.0;
maxBandwidth = 0.0;
}
printf(" %7.2f %7.2f ", minBandwidth, maxBandwidth); fflush(stdout);
}
printf("\n"); fflush(stdout);
}
if (verbose) {
int testNum = 0;
for (int c : numCusList) {
for (int u : unrollList) {
printf("CUs: %d Unroll %d\n", c, u);
PrintResults(ev, ++testNum, transfers, results[std::make_pair(c,u)]);
}
}
}
}
......@@ -24,6 +24,8 @@ THE SOFTWARE.
// Included after EnvVars and Executors
#include "AllToAll.hpp"
#include "AllToAllN.hpp"
#include "AllToAllSweep.hpp"
#include "HealthCheck.hpp"
#include "OneToAll.hpp"
#include "PeerToPeer.hpp"
......@@ -38,14 +40,16 @@ typedef void (*PresetFunc)(EnvVars& ev,
std::map<std::string, std::pair<PresetFunc, std::string>> presetFuncMap =
{
{"a2a", {AllToAllPreset, "Tests parallel transfers between all pairs of GPU devices"}},
{"healthcheck", {HealthCheckPreset,"Simple bandwidth health check (MI300X series only)"}},
{"one2all", {OneToAllPreset, "Test all subsets of parallel transfers from one GPU to all others"}},
{"p2p" , {PeerToPeerPreset, "Peer-to-peer device memory bandwidth test"}},
{"rsweep", {SweepPreset, "Randomly sweep through sets of Transfers"}},
{"scaling", {ScalingPreset, "Run scaling test from one GPU to other devices"}},
{"schmoo", {SchmooPreset, "Scaling tests for local/remote read/write/copy"}},
{"sweep", {SweepPreset, "Ordered sweep through sets of Transfers"}},
{"a2a", {AllToAllPreset, "Tests parallel transfers between all pairs of GPU devices"}},
{"a2a_n", {AllToAllRdmaPreset, "Tests parallel transfers between all pairs of GPU devices using Nearest NIC RDMA transfers"}},
{"a2asweep", {AllToAllSweepPreset, "Test GFX-based all-to-all transfers swept across different CU and GFX unroll counts"}},
{"healthcheck", {HealthCheckPreset, "Simple bandwidth health check (MI300X series only)"}},
{"one2all", {OneToAllPreset, "Test all subsets of parallel transfers from one GPU to all others"}},
{"p2p" , {PeerToPeerPreset, " Peer-to-peer device memory bandwidth test"}},
{"rsweep", {SweepPreset, "Randomly sweep through sets of Transfers"}},
{"scaling", {ScalingPreset, "Run scaling test from one GPU to other devices"}},
{"schmoo", {SchmooPreset, "Scaling tests for local/remote read/write/copy"}},
{"sweep", {SweepPreset, "Ordered sweep through sets of Transfers"}},
};
void DisplayPresets()
......
......@@ -41,9 +41,9 @@ static int RemappedCpuIndex(int origIdx)
static void PrintNicToGPUTopo(bool outputToCsv)
{
#ifdef NIC_EXEC_ENABLED
printf(" NIC | Device Name | Active | PCIe Bus ID | NUMA | Closest GPU(s)\n");
printf(" NIC | Device Name | Active | PCIe Bus ID | NUMA | Closest GPU(s) | GID Index | GID Descriptor\n");
if(!outputToCsv)
printf("-----+-------------+--------+--------------+------+---------------\n");
printf("-----+-------------+--------+--------------+------+----------------+-----------+-------------------\n");
int numGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
auto const& ibvDeviceList = GetIbvDeviceList();
......@@ -57,12 +57,15 @@ static void PrintNicToGPUTopo(bool outputToCsv)
}
}
printf(" %-3d | %-11s | %-6s | %-12s | %-4d | %-20s\n",
printf(" %-3d | %-11s | %-6s | %-12s | %-4d | %-14s | %-9s | %-20s\n",
i, ibvDeviceList[i].name.c_str(),
ibvDeviceList[i].hasActivePort ? "Yes" : "No",
ibvDeviceList[i].busId.c_str(),
ibvDeviceList[i].numaNode,
closestGpusStr.c_str());
closestGpusStr.c_str(),
ibvDeviceList[i].isRoce && ibvDeviceList[i].hasActivePort ? std::to_string(ibvDeviceList[i].gidIndex).c_str() : "N/A",
ibvDeviceList[i].isRoce && ibvDeviceList[i].hasActivePort ? ibvDeviceList[i].gidDescriptor.c_str() : "N/A"
);
}
printf("\n");
#endif
......
......@@ -22,11 +22,13 @@ THE SOFTWARE.
/// @cond
#pragma once
#include <algorithm>
#include <cstring>
#include <future>
#include <map>
#include <numa.h> // If not found, try installing libnuma-dev (e.g apt-get install libnuma-dev)
#include <numaif.h>
#include <random>
#include <set>
#include <sstream>
#include <stdarg.h>
......@@ -64,7 +66,7 @@ namespace TransferBench
using std::set;
using std::vector;
constexpr char VERSION[] = "1.60";
constexpr char VERSION[] = "1.61";
/**
* Enumeration of supported Executor types
......@@ -173,6 +175,7 @@ namespace TransferBench
*/
struct GfxOptions
{
int blockOrder = 0; ///< Determines how threadblocks are ordered (0=sequential, 1=interleaved, 2=random)
int blockSize = 256; ///< Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask = {}; ///< Bit-vector representing the CU mask
vector<vector<int>> prefXccTable = {}; ///< 2D table with preferred XCD to use for a specific [src][dst] GPU device
......@@ -181,6 +184,7 @@ namespace TransferBench
int useMultiStream = 0; ///< Use multiple streams for GFX
int useSingleTeam = 0; ///< Team all subExecutors across the data array
int waveOrder = 0; ///< GFX-kernel wavefront ordering
int wordSize = 4; ///< GFX-kernel packed data size (4=dwordx4, 2=dwordx2, 1=dwordx1)
};
/**
......@@ -233,6 +237,31 @@ namespace TransferBench
ERR_FATAL = 2, ///< Fatal error - results are invalid
};
/**
* Enumeration of GID priority
*
* @note These are the GID types ordered in priority from lowest (0) to highest
*/
enum GidPriority
{
UNKNOWN = -1, ///< Default
ROCEV1_LINK_LOCAL = 0, ///< RoCEv1 Link-local
ROCEV2_LINK_LOCAL = 1, ///< RoCEv2 Link-local fe80::/10
ROCEV1_IPV6 = 2, ///< RoCEv1 IPv6
ROCEV2_IPV6 = 3, ///< RoCEv2 IPv6
ROCEV1_IPV4 = 4, ///< RoCEv1 IPv4-mapped IPv6
ROCEV2_IPV4 = 5, ///< RoCEv2 IPv4-mapped IPv6 ::ffff:192.168.x.x
};
const char* GidPriorityStr[] = {
"RoCEv1 Link-local",
"RoCEv2 Link-local",
"RoCEv1 IPv6",
"RoCEv2 IPv6",
"RoCEv1 IPv4-mapped IPv6",
"RoCEv2 IPv4-mapped IPv6"
};
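// Indexed by GidPriority (>= 0); e.g., a RoCEv2 IPv4-mapped GID (priority 5)
// is preferred over a RoCEv1 link-local GID (priority 0)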
/**
* ErrResult consists of error type and error message
*/
......@@ -463,6 +492,14 @@ namespace TransferBench
#define hipStreamDestroy cudaStreamDestroy
#define hipStreamSynchronize cudaStreamSynchronize
// Define float2 addition operator for NVIDIA platform
__device__ inline float2& operator +=(float2& a, const float2& b)
{
a.x += b.x;
a.y += b.y;
return a;
}
// Define float4 addition operator for NVIDIA platform
__device__ inline float4& operator +=(float4& a, const float4& b)
{
......@@ -924,6 +961,13 @@ namespace {
errors.push_back({ERR_FATAL, "[data.byteOffset] must be positive multiple of %lu", sizeof(float)});
// Check GFX options
if (cfg.gfx.blockOrder < 0 || cfg.gfx.blockOrder > 2)
errors.push_back({ERR_FATAL,
"[gfx.blockOrder] must be 0 for sequential, 1 for interleaved, or 2 for random"});
if (cfg.gfx.useMultiStream && cfg.gfx.blockOrder > 0)
errors.push_back({ERR_WARN, "[gfx.blockOrder] will be ignored when running in multi-stream mode"});
int gfxMaxBlockSize = GetIntAttribute(ATR_GFX_MAX_BLOCKSIZE);
if (cfg.gfx.blockSize < 0 || cfg.gfx.blockSize % 64 || cfg.gfx.blockSize > gfxMaxBlockSize)
errors.push_back({ERR_FATAL,
......@@ -939,6 +983,9 @@ namespace {
errors.push_back({ERR_FATAL,
"[gfx.waveOrder] must be non-negative and less than 6"});
if (!(cfg.gfx.wordSize == 1 || cfg.gfx.wordSize == 2 || cfg.gfx.wordSize == 4))
errors.push_back({ERR_FATAL, "[gfx.wordSize] must be either 1, 2 or 4"});
int numGpus = GetNumExecutors(EXE_GPU_GFX);
int numXccs = GetNumExecutorSubIndices({EXE_GPU_GFX, 0});
vector<vector<int>> const& table = cfg.gfx.prefXccTable;
......@@ -1395,12 +1442,122 @@ namespace {
std::string busId;
bool hasActivePort;
int numaNode;
int gidIndex;
std::string gidDescriptor;
bool isRoce;
};
#endif
#ifdef NIC_EXEC_ENABLED
// Function to collect information about IBV devices
//========================================================================================
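// A GID is treated as unconfigured if it is all zeros, or if it is the bare
// link-local prefix fe80:: with a zero interface identifier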
static bool IsConfiguredGid(union ibv_gid const& gid)
{
const struct in6_addr *a = (struct in6_addr *) gid.raw;
int trailer = (a->s6_addr32[1] | a->s6_addr32[2] | a->s6_addr32[3]);
if (((a->s6_addr32[0] | trailer) == 0UL) ||
((a->s6_addr32[0] == htonl(0xfe800000)) && (trailer == 0UL))) {
return false;
}
return true;
}
static bool LinkLocalGid(union ibv_gid const& gid)
{
const struct in6_addr *a = (struct in6_addr *) gid.raw;
if (a->s6_addr32[0] == htonl(0xfe800000) && a->s6_addr32[1] == 0UL) {
return true;
}
return false;
}
static ErrResult GetRoceVersionNumber(struct ibv_context* const& context,
int const& portNum,
int const& gidIndex,
int& version)
{
char const* deviceName = ibv_get_device_name(context->device);
char gidRoceVerStr[16] = {};
char roceTypePath[PATH_MAX] = {};
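// e.g., /sys/class/infiniband/mlx5_0/ports/1/gid_attrs/types/3 (device name hypothetical)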
sprintf(roceTypePath, "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d",
deviceName, portNum, gidIndex);
int fd = open(roceTypePath, O_RDONLY);
if (fd == -1)
return {ERR_FATAL, "Failed while opening RoCE file path (%s)", roceTypePath};
int ret = read(fd, gidRoceVerStr, 15);
close(fd);
if (ret == -1)
return {ERR_FATAL, "Failed while reading RoCE version"};
if (strlen(gidRoceVerStr)) {
if (strncmp(gidRoceVerStr, "IB/RoCE v1", strlen("IB/RoCE v1")) == 0
|| strncmp(gidRoceVerStr, "RoCE v1", strlen("RoCE v1")) == 0) {
version = 1;
}
else if (strncmp(gidRoceVerStr, "RoCE v2", strlen("RoCE v2")) == 0) {
version = 2;
}
}
return ERR_NONE;
}
static bool IsIPv4MappedIPv6(const union ibv_gid &gid)
{
// look for ::ffff:x.x.x.x format
// From Broadcom documentation
// https://techdocs.broadcom.com/us/en/storage-and-ethernet-connectivity/ethernet-nic-controllers/bcm957xxx/adapters/frequently-asked-questions1.html
// "The IPv4 address is really an IPv4 address mapped into the IPv6 address space.
// This can be identified by 80 “0” bits, followed by 16 “1” bits (“FFFF” in hexadecimal)
// followed by the original 32-bit IPv4 address."
return (gid.global.subnet_prefix == 0 &&
gid.raw[8] == 0 &&
gid.raw[9] == 0 &&
gid.raw[10] == 0xff &&
gid.raw[11] == 0xff);
}
static ErrResult GetGidIndex(struct ibv_context* context,
int const& gidTblLen,
int const& portNum,
std::pair<int, std::string>& gidInfo)
{
if(gidInfo.first >= 0) return ERR_NONE; // honor user choice
union ibv_gid gid;
GidPriority highestPriority = GidPriority::UNKNOWN;
int gidIndex = -1;
for (int i = 0; i < gidTblLen; ++i) {
IBV_CALL(ibv_query_gid, context, portNum, i, &gid);
if (!IsConfiguredGid(gid)) continue;
int gidCurrRoceVersion;
if(GetRoceVersionNumber(context, portNum, i, gidCurrRoceVersion).errType != ERR_NONE) continue;
GidPriority currPriority;
if (IsIPv4MappedIPv6(gid)) {
currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_IPV4 : GidPriority::ROCEV1_IPV4;
} else if (!LinkLocalGid(gid)) {
currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_IPV6 : GidPriority::ROCEV1_IPV6;
} else {
currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_LINK_LOCAL : GidPriority::ROCEV1_LINK_LOCAL;
}
if(currPriority > highestPriority) {
highestPriority = currPriority;
gidIndex = i;
}
}
if (highestPriority == GidPriority::UNKNOWN) {
gidInfo.first = -1;
return {ERR_FATAL, "Failed to auto-detect a valid GID index. Try setting it manually through IB_GID_INDEX"};
}
gidInfo.first = gidIndex;
gidInfo.second = GidPriorityStr[highestPriority];
return ERR_NONE;
}
static vector<IbvDevice>& GetIbvDeviceList()
{
static bool isInitialized = false;
......@@ -1425,12 +1582,25 @@ namespace {
if (context) {
struct ibv_device_attr deviceAttr;
if (!ibv_query_device(context, &deviceAttr)) {
int activePort;
ibvDevice.gidIndex = -1;
for (int port = 1; port <= deviceAttr.phys_port_cnt; ++port) {
struct ibv_port_attr portAttr;
if (ibv_query_port(context, port, &portAttr)) continue;
if (portAttr.state == IBV_PORT_ACTIVE)
if (portAttr.state == IBV_PORT_ACTIVE) {
activePort = port;
ibvDevice.hasActivePort = true;
break;
if(portAttr.link_layer == IBV_LINK_LAYER_ETHERNET) {
ibvDevice.isRoce = true;
std::pair<int, std::string> gidInfo (-1, "");
auto res = GetGidIndex(context, portAttr.gid_tbl_len, activePort, gidInfo);
if (res.errType == ERR_NONE) {
ibvDevice.gidIndex = gidInfo.first;
ibvDevice.gidDescriptor = gidInfo.second;
}
}
break;
}
}
}
ibv_close_device(context);
......@@ -1781,164 +1951,6 @@ namespace {
return ERR_NONE;
}
static bool IsConfiguredGid(union ibv_gid* gid)
{
const struct in6_addr *a = (struct in6_addr *)gid->raw;
int trailer = (a->s6_addr32[1] | a->s6_addr32[2] | a->s6_addr32[3]);
if (((a->s6_addr32[0] | trailer) == 0UL) ||
((a->s6_addr32[0] == htonl(0xfe800000)) && (trailer == 0UL))) {
return false;
}
return true;
}
static bool LinkLocalGid(union ibv_gid* gid)
{
const struct in6_addr *a = (struct in6_addr *)gid->raw;
if (a->s6_addr32[0] == htonl(0xfe800000) && a->s6_addr32[1] == 0UL) {
return true;
}
return false;
}
static bool IsValidGid(union ibv_gid* gid)
{
return (IsConfiguredGid(gid) && !LinkLocalGid(gid));
}
static sa_family_t GetGidAddressFamily(union ibv_gid* gid)
{
const struct in6_addr *a = (struct in6_addr *)gid->raw;
bool isIpV4Mapped = ((a->s6_addr32[0] | a->s6_addr32[1]) |
(a->s6_addr32[2] ^ htonl(0x0000ffff))) == 0UL;
bool isIpV4MappedMulticast = (a->s6_addr32[0] == htonl(0xff0e0000) &&
((a->s6_addr32[1] |
(a->s6_addr32[2] ^ htonl(0x0000ffff))) == 0UL));
return (isIpV4Mapped || isIpV4MappedMulticast) ? AF_INET : AF_INET6;
}
static bool MatchGidAddressFamily(sa_family_t const& af,
void* prefix,
int prefixLen,
union ibv_gid* gid)
{
struct in_addr *base = NULL;
struct in6_addr *base6 = NULL;
struct in6_addr *addr6 = NULL;;
if (af == AF_INET) {
base = (struct in_addr *)prefix;
} else {
base6 = (struct in6_addr *)prefix;
}
addr6 = (struct in6_addr *)gid->raw;
#define NETMASK(bits) (htonl(0xffffffff ^ ((1 << (32 - bits)) - 1)))
int i = 0;
while (prefixLen > 0 && i < 4) {
if (af == AF_INET) {
int mask = NETMASK(prefixLen);
if ((base->s_addr & mask) ^ (addr6->s6_addr32[3] & mask))
break;
prefixLen = 0;
break;
} else {
if (prefixLen >= 32) {
if (base6->s6_addr32[i] ^ addr6->s6_addr32[i])
break;
prefixLen -= 32;
++i;
} else {
int mask = NETMASK(prefixLen);
if ((base6->s6_addr32[i] & mask) ^ (addr6->s6_addr32[i] & mask))
break;
prefixLen = 0;
}
}
}
return (prefixLen == 0) ? true : false;
#undef NETMASK
}
static ErrResult GetRoceVersionNumber(struct ibv_context* const& context,
int const& portNum,
int const& gidIndex,
int* version)
{
char const* deviceName = ibv_get_device_name(context->device);
char gidRoceVerStr[16] = {};
char roceTypePath[PATH_MAX] = {};
sprintf(roceTypePath, "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d",
deviceName, portNum, gidIndex);
int fd = open(roceTypePath, O_RDONLY);
if (fd == -1)
return {ERR_FATAL, "Failed while opening RoCE file path (%s)", roceTypePath};
int ret = read(fd, gidRoceVerStr, 15);
close(fd);
if (ret == -1)
return {ERR_FATAL, "Failed while reading RoCE version"};
if (strlen(gidRoceVerStr)) {
if (strncmp(gidRoceVerStr, "IB/RoCE v1", strlen("IB/RoCE v1")) == 0
|| strncmp(gidRoceVerStr, "RoCE v1", strlen("RoCE v1")) == 0) {
*version = 1;
}
else if (strncmp(gidRoceVerStr, "RoCE v2", strlen("RoCE v2")) == 0) {
*version = 2;
}
}
return ERR_NONE;
}
static ErrResult GetGidIndex(ConfigOptions const& cfg,
struct ibv_context* context,
int const& gidTblLen,
int& gidIndex)
{
// Use GID index if user specified
if (gidIndex >= 0) return ERR_NONE;
// Try to find the best GID index
int port = cfg.nic.ibPort;
sa_family_t targetAddrFam = (cfg.nic.ipAddressFamily == 6)? AF_INET6 : AF_INET;
int targetRoCEVer = cfg.nic.roceVersion;
// Initially assume gidIndex = 0
int gidIndexCurr = 0;
union ibv_gid gidCurr;
IBV_CALL(ibv_query_gid, context, port, gidIndexCurr, &gidCurr);
sa_family_t gidCurrFam = GetGidAddressFamily(&gidCurr);
bool gidCurrIsValid = IsValidGid(&gidCurr);
int gidCurrRoceVersion;
ERR_CHECK(GetRoceVersionNumber(context, port, gidIndexCurr, &gidCurrRoceVersion));
// Loop over GID table to find the best match
for (int gidIndexTest = 1; gidIndexTest < gidTblLen; ++gidIndexTest) {
union ibv_gid gidTest;
IBV_CALL(ibv_query_gid, context, cfg.nic.ibPort, gidIndexTest, &gidTest);
if (!IsValidGid(&gidTest)) continue;
sa_family_t gidTestFam = GetGidAddressFamily(&gidTest);
bool gidTestMatchSubnet = MatchGidAddressFamily(targetAddrFam, NULL, 0, &gidTest);
int gidTestRoceVersion;
ERR_CHECK(GetRoceVersionNumber(context, port, gidIndexTest, &gidTestRoceVersion));
if (!gidCurrIsValid ||
(gidTestFam == targetAddrFam && gidTestMatchSubnet &&
(gidCurrFam != targetAddrFam || gidTestRoceVersion == targetRoCEVer))) {
// Switch to better match
gidIndexCurr = gidIndexTest;
gidCurrFam = gidTestFam;
gidCurrIsValid = true;
gidCurrRoceVersion = gidTestRoceVersion;
}
}
gidIndex = gidIndexCurr;
return ERR_NONE;
}
static ErrResult PrepareNicTransferResources(ConfigOptions const& cfg,
ExeDevice const& srcExeDevice,
Transfer const& t,
......@@ -2012,8 +2024,12 @@ namespace {
bool isRoCE = (rss.srcPortAttr.link_layer == IBV_LINK_LAYER_ETHERNET);
if (isRoCE) {
// Try to auto-detect the GID index
ERR_CHECK(GetGidIndex(cfg, rss.srcContext, rss.srcPortAttr.gid_tbl_len, srcGidIndex));
ERR_CHECK(GetGidIndex(cfg, rss.dstContext, rss.dstPortAttr.gid_tbl_len, dstGidIndex));
std::pair<int, std::string> srcGidInfo (srcGidIndex, "");
std::pair<int, std::string> dstGidInfo (dstGidIndex, "");
ERR_CHECK(GetGidIndex(rss.srcContext, rss.srcPortAttr.gid_tbl_len, cfg.nic.ibPort, srcGidInfo));
ERR_CHECK(GetGidIndex(rss.dstContext, rss.dstPortAttr.gid_tbl_len, cfg.nic.ibPort, dstGidInfo));
srcGidIndex = srcGidInfo.first;
dstGidIndex = dstGidInfo.first;
IBV_CALL(ibv_query_gid, rss.srcContext, port, srcGidIndex, &rss.srcGid);
IBV_CALL(ibv_query_gid, rss.dstContext, port, dstGidIndex, &rss.dstGid);
}
......@@ -2396,13 +2412,47 @@ namespace {
exeDevice.exeIndex));
#endif
int transferOffset = 0;
for (auto& rss : exeInfo.resources) {
Transfer const& t = transfers[rss.transferIdx];
rss.subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset;
for (auto p : rss.subExecParamCpu) {
if (cfg.gfx.useMultiStream || cfg.gfx.blockOrder == 0) {
// Threadblocks are ordered sequentially one transfer at a time
for (auto& rss : exeInfo.resources) {
Transfer const& t = transfers[rss.transferIdx];
rss.subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset;
for (auto p : rss.subExecParamCpu) {
rss.subExecIdx.push_back(exeInfo.subExecParamCpu.size());
exeInfo.subExecParamCpu.push_back(p);
transferOffset++;
}
}
} else if (cfg.gfx.blockOrder == 1) {
// Interleave threadblocks of different Transfers
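// e.g., Transfer A with 3 blocks and Transfer B with 2 blocks are
// launched in the order A0 B0 A1 B1 A2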
for (int subExecIdx = 0; exeInfo.subExecParamCpu.size() < exeInfo.totalSubExecs; ++subExecIdx) {
for (auto& rss : exeInfo.resources) {
Transfer const& t = transfers[rss.transferIdx];
if (subExecIdx < t.numSubExecs) {
rss.subExecIdx.push_back(exeInfo.subExecParamCpu.size());
exeInfo.subExecParamCpu.push_back(rss.subExecParamCpu[subExecIdx]);
}
}
}
} else if (cfg.gfx.blockOrder == 2) {
// Build randomized threadblock list
std::vector<std::pair<int,int>> indices;
for (int i = 0; i < exeInfo.resources.size(); i++) {
auto const& rss = exeInfo.resources[i];
Transfer const& t = transfers[rss.transferIdx];
for (int j = 0; j < t.numSubExecs; j++)
indices.push_back(std::make_pair(i,j));
}
std::random_device rd;
std::default_random_engine gen(rd());
std::shuffle(indices.begin(), indices.end(), gen);
// Append the threadblocks in the shuffled order
for (auto p : indices) {
auto& rss = exeInfo.resources[p.first];
rss.subExecIdx.push_back(exeInfo.subExecParamCpu.size());
exeInfo.subExecParamCpu.push_back(p);
transferOffset++;
exeInfo.subExecParamCpu.push_back(rss.subExecParamCpu[p.second]);
}
}
......@@ -2595,50 +2645,15 @@ namespace {
int const exeIndex,
TransferResources& rss)
{
auto cpuStart = std::chrono::high_resolution_clock::now();
// Switch to the closest NUMA node to this NIC
if (cfg.nic.useNuma) {
int numaNode = GetIbvDeviceList()[exeIndex].numaNode;
if (numaNode != -1)
numa_run_on_node(numaNode);
}
int subIteration = 0;
do {
// Loop over each of the queue pairs and post the send
ibv_send_wr* badWorkReq;
for (int qpIndex = 0; qpIndex < rss.qpCount; qpIndex++) {
int error = ibv_post_send(rss.srcQueuePairs[qpIndex], &rss.sendWorkRequests[qpIndex], &badWorkReq);
if (error)
return {ERR_FATAL, "Transfer %d: Error when calling ibv_post_send for QP %d Error code %d\n",
rss.transferIdx, qpIndex, error};
}
// Poll the completion queue until all queue pairs are complete
// The order of completion doesn't matter because this completion queue is dedicated to this Transfer
int numComplete = 0;
ibv_wc wc;
while (numComplete < rss.qpCount) {
int nc = ibv_poll_cq(rss.srcCompQueue, 1, &wc);
if (nc > 0) {
numComplete++;
if (wc.status != IBV_WC_SUCCESS) {
return {ERR_FATAL, "Transfer %d: Received unsuccessful work completion", rss.transferIdx};
}
} else if (nc < 0) {
return {ERR_FATAL, "Transfer %d: Received negative work completion", rss.transferIdx};
}
}
} while (++subIteration != cfg.general.numSubIterations);
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0) {
rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration)
rss.perIterMsec.push_back(deltaMsec);
// Loop over each of the queue pairs and post the send
ibv_send_wr* badWorkReq;
for (int qpIndex = 0; qpIndex < rss.qpCount; qpIndex++) {
int error = ibv_post_send(rss.srcQueuePairs[qpIndex], &rss.sendWorkRequests[qpIndex], &badWorkReq);
if (error)
return {ERR_FATAL, "Transfer %d: Error when calling ibv_post_send for QP %d Error code %d\n",
rss.transferIdx, qpIndex, error};
}
return ERR_NONE;
}
......@@ -2649,26 +2664,59 @@ namespace {
int const exeIndex,
ExeInfo& exeInfo)
{
vector<std::future<ErrResult>> asyncTransfers;
auto cpuStart = std::chrono::high_resolution_clock::now();
for (int i = 0; i < exeInfo.resources.size(); i++) {
asyncTransfers.emplace_back(std::async(std::launch::async,
ExecuteNicTransfer,
iteration,
std::cref(cfg),
exeIndex,
std::ref(exeInfo.resources[i])));
// Switch to the closest NUMA node to this NIC
if (cfg.nic.useNuma) {
int numaNode = GetIbvDeviceList()[exeIndex].numaNode;
if (numaNode != -1)
numa_run_on_node(numaNode);
}
for (auto& asyncTransfer : asyncTransfers)
ERR_CHECK(asyncTransfer.get());
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0)
exeInfo.totalDurationMsec += deltaMsec;
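// Each sub-iteration posts all sends up front, then polls every Transfer's
// dedicated completion queue, timing each Transfer from post to last completion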
int subIterations = 0;
do {
auto cpuStart = std::chrono::high_resolution_clock::now();
size_t completedTransfers = 0;
auto transferCount = exeInfo.resources.size();
std::vector<int> receivedQPs(transferCount); // completions received per Transfer
std::vector<std::chrono::high_resolution_clock::time_point> transferTimers(transferCount);
// post the sends
for (auto i = 0; i < transferCount; i++) {
transferTimers[i] = std::chrono::high_resolution_clock::now();
ERR_CHECK(ExecuteNicTransfer(iteration, cfg, exeIndex, exeInfo.resources[i]));
}
// poll for completions
do {
for (auto i = 0; i < transferCount; i++) {
if(receivedQPs[i] < exeInfo.resources[i].qpCount) {
auto& rss = exeInfo.resources[i];
// Poll the completion queue until all queue pairs are complete
// The order of completion doesn't matter because this completion queue is dedicated to this Transfer
ibv_wc wc;
int nc = ibv_poll_cq(rss.srcCompQueue, 1, &wc);
if (nc > 0) {
receivedQPs[i]++;
if (wc.status != IBV_WC_SUCCESS) {
return {ERR_FATAL, "Transfer %d: Received unsuccessful work completion", rss.transferIdx};
}
} else if (nc < 0) {
return {ERR_FATAL, "Transfer %d: Received negative work completion", rss.transferIdx};
}
if(receivedQPs[i] == rss.qpCount) {
auto cpuDelta = std::chrono::high_resolution_clock::now() - transferTimers[i];
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0) {
rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration)
rss.perIterMsec.push_back(deltaMsec);
}
completedTransfers++;
}
}
}
} while(completedTransfers < transferCount);
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0)
exeInfo.totalDurationMsec += deltaMsec;
} while(++subIterations < cfg.general.numSubIterations);
return ERR_NONE;
}
#endif
......@@ -2704,13 +2752,16 @@ namespace {
// Helper function for memset
template <typename T> __device__ __forceinline__ T MemsetVal();
template <> __device__ __forceinline__ float MemsetVal(){ return MEMSET_VAL; };
template <> __device__ __forceinline__ float2 MemsetVal(){ return make_float2(MEMSET_VAL,
MEMSET_VAL); };
template <> __device__ __forceinline__ float4 MemsetVal(){ return make_float4(MEMSET_VAL,
MEMSET_VAL,
MEMSET_VAL,
MEMSET_VAL); }
// Kernel for GFX execution
template <int BLOCKSIZE, int UNROLL>
// Kernel for GFX execution
template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL>
__global__ void __launch_bounds__(BLOCKSIZE)
GpuReduceKernel(SubExecParam* params, int waveOrder, int numSubIterations)
{
......@@ -2729,10 +2780,10 @@ namespace {
// Collect data information
int32_t const numSrcs = p.numSrcs;
int32_t const numDsts = p.numDsts;
float4 const* __restrict__ srcFloat4[MAX_SRCS];
float4* __restrict__ dstFloat4[MAX_DSTS];
for (int i = 0; i < numSrcs; i++) srcFloat4[i] = (float4*)p.src[i];
for (int i = 0; i < numDsts; i++) dstFloat4[i] = (float4*)p.dst[i];
PACKED_FLOAT const* __restrict__ srcFloatPacked[MAX_SRCS];
PACKED_FLOAT* __restrict__ dstFloatPacked[MAX_DSTS];
for (int i = 0; i < numSrcs; i++) srcFloatPacked[i] = (PACKED_FLOAT const*)p.src[i];
for (int i = 0; i < numDsts; i++) dstFloatPacked[i] = (PACKED_FLOAT*)p.dst[i];
// Operate on wavefront granularity
int32_t const nTeams = p.teamSize; // Number of threadblocks working together on this subarray
......@@ -2741,7 +2792,7 @@ namespace {
int32_t const waveIdx = threadIdx.x / warpSize; // Index of this wavefront within the threadblock
int32_t const tIdx = threadIdx.x % warpSize; // Thread index within wavefront
size_t const numFloat4 = p.N / 4;
size_t const numPackedFloat = p.N / (sizeof(PACKED_FLOAT)/sizeof(float));
int32_t teamStride, waveStride, unrlStride, teamStride2, waveStride2;
switch (waveOrder) {
......@@ -2755,64 +2806,64 @@ namespace {
int subIterations = 0;
while (1) {
// First loop: Each wavefront in the team works on UNROLL float4s per thread
// First loop: Each wavefront in the team works on UNROLL PACKED_FLOAT per thread
size_t const loop1Stride = nTeams * nWaves * UNROLL * warpSize;
size_t const loop1Limit = numFloat4 / loop1Stride * loop1Stride;
size_t const loop1Limit = numPackedFloat / loop1Stride * loop1Stride;
{
float4 val[UNROLL];
PACKED_FLOAT val[UNROLL];
if (numSrcs == 0) {
#pragma unroll
for (int u = 0; u < UNROLL; u++)
val[u] = MemsetVal<float4>();
val[u] = MemsetVal<PACKED_FLOAT>();
}
for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx; idx < loop1Limit; idx += loop1Stride) {
// Read sources into memory and accumulate in registers
if (numSrcs) {
for (int u = 0; u < UNROLL; u++)
val[u] = srcFloat4[0][idx + u * unrlStride * warpSize];
val[u] = srcFloatPacked[0][idx + u * unrlStride * warpSize];
for (int s = 1; s < numSrcs; s++)
for (int u = 0; u < UNROLL; u++)
val[u] += srcFloat4[s][idx + u * unrlStride * warpSize];
val[u] += srcFloatPacked[s][idx + u * unrlStride * warpSize];
}
// Write accumulation to all outputs
for (int d = 0; d < numDsts; d++) {
#pragma unroll
for (int u = 0; u < UNROLL; u++)
dstFloat4[d][idx + u * unrlStride * warpSize] = val[u];
dstFloatPacked[d][idx + u * unrlStride * warpSize] = val[u];
}
}
}
// Second loop: Deal with remaining float4s
// Second loop: Deal with remaining PACKED_FLOAT
{
if (loop1Limit < numFloat4) {
float4 val;
if (numSrcs == 0) val = MemsetVal<float4>();
if (loop1Limit < numPackedFloat) {
PACKED_FLOAT val;
if (numSrcs == 0) val = MemsetVal<PACKED_FLOAT>();
size_t const loop2Stride = nTeams * nWaves * warpSize;
for (size_t idx = loop1Limit + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx;
idx < numFloat4; idx += loop2Stride) {
idx < numPackedFloat; idx += loop2Stride) {
if (numSrcs) {
val = srcFloat4[0][idx];
val = srcFloatPacked[0][idx];
for (int s = 1; s < numSrcs; s++)
val += srcFloat4[s][idx];
val += srcFloatPacked[s][idx];
}
for (int d = 0; d < numDsts; d++)
dstFloat4[d][idx] = val;
dstFloatPacked[d][idx] = val;
}
}
}
// Third loop; Deal with remaining floats
{
if (numFloat4 * 4 < p.N) {
if (numPackedFloat * (sizeof(PACKED_FLOAT)/sizeof(float)) < p.N) {
float val;
if (numSrcs == 0) val = MemsetVal<float>();
size_t const loop3Stride = nTeams * nWaves * warpSize;
for ( size_t idx = numFloat4 * 4 + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride) {
for (size_t idx = numPackedFloat * (sizeof(PACKED_FLOAT)/sizeof(float)) + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride) {
if (numSrcs) {
val = p.src[0][idx];
for (int s = 1; s < numSrcs; s++)
......@@ -2839,19 +2890,24 @@ namespace {
}
}
#define GPU_KERNEL_UNROLL_DECL(BLOCKSIZE) \
{GpuReduceKernel<BLOCKSIZE, 1>, \
GpuReduceKernel<BLOCKSIZE, 2>, \
GpuReduceKernel<BLOCKSIZE, 3>, \
GpuReduceKernel<BLOCKSIZE, 4>, \
GpuReduceKernel<BLOCKSIZE, 5>, \
GpuReduceKernel<BLOCKSIZE, 6>, \
GpuReduceKernel<BLOCKSIZE, 7>, \
GpuReduceKernel<BLOCKSIZE, 8>}
#define GPU_KERNEL_DWORD_DECL(BLOCKSIZE, UNROLL) \
{GpuReduceKernel<float, BLOCKSIZE, UNROLL>, \
GpuReduceKernel<float2, BLOCKSIZE, UNROLL>, \
GpuReduceKernel<float4, BLOCKSIZE, UNROLL>}
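// Word-size index within each entry: 0 = float (DWORDx1), 1 = float2 (DWORDx2), 2 = float4 (DWORDx4)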
// Table of all GPU Reduction kernel functions (templated blocksize / unroll)
#define GPU_KERNEL_UNROLL_DECL(BLOCKSIZE) \
{GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 1), \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 2), \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 3), \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 4), \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 5), \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 6), \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 7), \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 8)}
// Table of all GPU Reduction kernel functions (templated blocksize / unroll / dword size)
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int);
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL] =
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL][3] =
{
GPU_KERNEL_UNROLL_DECL(64),
GPU_KERNEL_UNROLL_DECL(128),
......@@ -2879,18 +2935,19 @@ namespace {
dim3 const gridSize(xccDim, numSubExecs, 1);
dim3 const blockSize(cfg.gfx.blockSize, 1);
int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 :
cfg.gfx.wordSize == 2 ? 1 :
2;
auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx];
#if defined(__NVCC__)
if (startEvent != NULL)
ERR_CHECK(hipEventRecord(startEvent, stream));
GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1]
<<<gridSize, blockSize, 0, stream>>>
(rss.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations);
gpuKernel<<<gridSize, blockSize, 0, stream>>>(rss.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations);
if (stopEvent != NULL)
ERR_CHECK(hipEventRecord(stopEvent, stream));
#else
hipExtLaunchKernelGGL(GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1],
gridSize, blockSize, 0, stream, startEvent, stopEvent,
hipExtLaunchKernelGGL(gpuKernel, gridSize, blockSize, 0, stream, startEvent, stopEvent,
0, rss.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations);
#endif
......@@ -2954,19 +3011,19 @@ namespace {
dim3 const blockSize(cfg.gfx.blockSize, 1);
hipStream_t stream = exeInfo.streams[0];
int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 :
cfg.gfx.wordSize == 2 ? 1 :
2;
auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx];
#if defined(__NVCC__)
if (cfg.gfx.useHipEvents)
ERR_CHECK(hipEventRecord(exeInfo.startEvents[0], stream));
GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1]
<<<gridSize, blockSize, 0 , stream>>>
(exeInfo.subExecParamGpu, cfg.gfx.waveOrder, cfg.general.numSubIterations);
gpuKernel<<<gridSize, blockSize, 0 , stream>>>(exeInfo.subExecParamGpu, cfg.gfx.waveOrder, cfg.general.numSubIterations);
if (cfg.gfx.useHipEvents)
ERR_CHECK(hipEventRecord(exeInfo.stopEvents[0], stream));
#else
hipExtLaunchKernelGGL(GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1],
gridSize, blockSize, 0, stream,
hipExtLaunchKernelGGL(gpuKernel, gridSize, blockSize, 0, stream,
cfg.gfx.useHipEvents ? exeInfo.startEvents[0] : NULL,
cfg.gfx.useHipEvents ? exeInfo.stopEvents[0] : NULL, 0,
exeInfo.subExecParamGpu, cfg.gfx.waveOrder, cfg.general.numSubIterations);
......