Unverified Commit cd80b3a3 authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

TransferBench v1.61 (#174)


Co-authored-by: default avatarMustafa Abduljabbar <mustafa.abduljabbar@amd.com>
parent 856e3445
...@@ -3,6 +3,22 @@ ...@@ -3,6 +3,22 @@
Documentation for TransferBench is available at Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench). [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.61.00
### Added
- Added a2a_n preset which conducts all-to-all GPU-to-GPU transfers over nearest NIC executors
- Re-implemented GFX_BLOCK_ORDER which allows for control over how threadblocks of multiple transfers are ordered
- 0 = sequential, 1 = interleaved, 2 = random
- Added a2asweep preset which tries various CU/unroll options for GFX-executed all-to-all
- Rewrite main GID index detection logic
- Show the GID index and description in the topology table. It is helpful for debugging purposes
- Added GFX_WORD_SIZE to allow for different packed float sizes to use for GFX kernel. Must be either 4 (default), 2 or 1
### Fixed
- Avoid build errors for CMake and Makefile if infiniband/verbs.h header is not present and disable NIC executor in such case
- Have a priority list of which GID entry to go for instead of hardcoding choices based on underdocumented user input (such as RoCE version and IP address family)
- Use link-local when it is the only choice (i.e. when routing information is not available beyond local link)
## v1.60.00 ## v1.60.00
### Modified ### Modified
- Reverted GFX_SINGLE_TEAM default back to 1 - Reverted GFX_SINGLE_TEAM default back to 1
......
...@@ -57,17 +57,22 @@ set( CMAKE_CXX_FLAGS "${flags_str} ${CMAKE_CXX_FLAGS}") ...@@ -57,17 +57,22 @@ set( CMAKE_CXX_FLAGS "${flags_str} ${CMAKE_CXX_FLAGS}")
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib") set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib")
include_directories(${ROCM_PATH}/include) include_directories(${ROCM_PATH}/include)
find_library(IBVERBS_LIBRARY ibverbs) find_library(IBVERBS_LIBRARY ibverbs)
if (IBVERBS_LIBRARY) find_path(IBVERBS_INCLUDE_DIR infiniband/verbs.h)
if (DEFINED ENV{DISABLE_NIC_EXEC}) if (DEFINED ENV{DISABLE_NIC_EXEC})
message(STATUS "Disabling NIC Executor support") message(STATUS "Disabling NIC Executor support")
else() elseif(IBVERBS_LIBRARY AND IBVERBS_INCLUDE_DIR)
message(STATUS "Found ibverbs: ${IBVERBS_LIBRARY}. Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable") message(STATUS "Found ibverbs: ${IBVERBS_LIBRARY}. Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable")
add_definitions(-DNIC_EXEC_ENABLED) add_definitions(-DNIC_EXEC_ENABLED)
link_libraries(ibverbs) link_libraries(ibverbs)
endif()
else() else()
message(WARNING "IBVerbs library not found. Building without NIC executor support") if (NOT IBVERBS_LIBRARY)
message(WARNING "IBVerbs library not found")
elseif (NOT IBVERBS_INCLUDE_DIR)
message(WARNING "infiniband/verbs.h not found")
endif()
message(WARNING "Building without NIC executor support. To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed")
endif() endif()
link_libraries(numa hsa-runtime64 pthread) link_libraries(numa hsa-runtime64 pthread)
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY .) set (CMAKE_RUNTIME_OUTPUT_DIRECTORY .)
add_executable(TransferBench src/client/Client.cpp) add_executable(TransferBench src/client/Client.cpp)
......
# #
# Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. # Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
# #
# Configuration options # Configuration options
...@@ -12,8 +12,10 @@ NVCC=$(CUDA_PATH)/bin/nvcc ...@@ -12,8 +12,10 @@ NVCC=$(CUDA_PATH)/bin/nvcc
# Compile TransferBenchCuda if nvcc detected # Compile TransferBenchCuda if nvcc detected
ifeq ("$(shell test -e $(NVCC) && echo found)", "found") ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
EXE=TransferBenchCuda EXE=TransferBenchCuda
CXX=$(NVCC)
else else
EXE=TransferBench EXE=TransferBench
CXX=$(HIPCC)
endif endif
CXXFLAGS = -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64 CXXFLAGS = -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
...@@ -21,14 +23,24 @@ NVFLAGS = -x cu -lnuma -arch=native ...@@ -21,14 +23,24 @@ NVFLAGS = -x cu -lnuma -arch=native
COMMON_FLAGS = -O3 -I./src/header -I./src/client -I./src/client/Presets COMMON_FLAGS = -O3 -I./src/header -I./src/client -I./src/client/Presets
LDFLAGS += -lpthread LDFLAGS += -lpthread
# Compile RDMA executor if IBVerbs is found in the Dynamic Linker cache # Compile RDMA executor if
# 1) DISABLE_NIC_EXEC is not set to 1
# 2) IBVerbs is found in the Dynamic Linker cache
# 3) infiniband/verbs.h is found in the default include path
NIC_ENABLED = 0 NIC_ENABLED = 0
ifneq ($(DISABLE_NIC_EXEC),1) ifneq ($(DISABLE_NIC_EXEC),1)
ifneq ("$(shell ldconfig -p | grep -c ibverbs)", "0") ifeq ("$(shell ldconfig -p | grep -c ibverbs)", "0")
$(info lib IBVerbs not found)
else ifeq ("$(shell echo '#include <infiniband/verbs.h>' | $(CXX) -E - 2>/dev/null | grep -c 'infiniband/verbs.h')", "0")
$(info infiniband/verbs.h not found)
else
LDFLAGS += -libverbs -DNIC_EXEC_ENABLED LDFLAGS += -libverbs -DNIC_EXEC_ENABLED
NVFLAGS += -libverbs -DNIC_EXEC_ENABLED NVFLAGS += -libverbs -DNIC_EXEC_ENABLED
NIC_ENABLED = 1 NIC_ENABLED = 1
endif endif
ifeq ($(NIC_ENABLED), 0)
$(info To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed)
endif
endif endif
all: $(EXE) all: $(EXE)
......
/* /*
Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved. Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal of this software and associated documentation files (the "Software"), to deal
...@@ -84,6 +84,7 @@ public: ...@@ -84,6 +84,7 @@ public:
int useHsaDma; // Use hsa_amd_async_copy instead of hipMemcpy for non-targetted DMA executions int useHsaDma; // Use hsa_amd_async_copy instead of hipMemcpy for non-targetted DMA executions
// GFX options // GFX options
int gfxBlockOrder; // How threadblocks for multiple Transfers are ordered 0=sequential 1=interleaved
int gfxBlockSize; // Size of each threadblock (must be multiple of 64) int gfxBlockSize; // Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask; // Bit-vector representing the CU mask vector<uint32_t> cuMask; // Bit-vector representing the CU mask
vector<vector<int>> prefXccTable; // Specifies XCC to use for given exe->dst pair vector<vector<int>> prefXccTable; // Specifies XCC to use for given exe->dst pair
...@@ -92,6 +93,7 @@ public: ...@@ -92,6 +93,7 @@ public:
int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer
int gfxSingleTeam; // Team all subExecutors across the data array int gfxSingleTeam; // Team all subExecutors across the data array
int gfxWaveOrder; // GFX-kernel wavefront ordering int gfxWaveOrder; // GFX-kernel wavefront ordering
int gfxWordSize; // GFX-kernel packed data size (4=DWORDx4, 2=DWORDx2, 1=DWORDx1)
// Client options // Client options
int hideEnv; // Skip printing environment variable int hideEnv; // Skip printing environment variable
...@@ -135,10 +137,12 @@ public: ...@@ -135,10 +137,12 @@ public:
alwaysValidate = GetEnvVar("ALWAYS_VALIDATE" , 0); alwaysValidate = GetEnvVar("ALWAYS_VALIDATE" , 0);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256); blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0); byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
gfxBlockOrder = GetEnvVar("GFX_BLOCK_ORDER" , 0);
gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256); gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256);
gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 1); gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 1);
gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll); gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll);
gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER" , 0); gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER" , 0);
gfxWordSize = GetEnvVar("GFX_WORD_SIZE" , 4);
hideEnv = GetEnvVar("HIDE_ENV" , 0); hideEnv = GetEnvVar("HIDE_ENV" , 0);
minNumVarSubExec = GetEnvVar("MIN_VAR_SUBEXEC" , 1); minNumVarSubExec = GetEnvVar("MIN_VAR_SUBEXEC" , 1);
maxNumVarSubExec = GetEnvVar("MAX_VAR_SUBEXEC" , 0); maxNumVarSubExec = GetEnvVar("MAX_VAR_SUBEXEC" , 0);
...@@ -286,13 +290,23 @@ public: ...@@ -286,13 +290,23 @@ public:
} }
} }
// Joins a list of integers into a single comma-separated string
// (e.g. {1,2,3} -> "1,2,3"); an empty list yields an empty string.
static std::string ToStr(std::vector<int> const& values) {
  std::string joined;
  for (size_t idx = 0; idx < values.size(); ++idx) {
    if (idx > 0) joined += ",";
    joined += std::to_string(values[idx]);
  }
  return joined;
}
// Display info on the env vars that can be used // Display info on the env vars that can be used
static void DisplayUsage() static void DisplayUsage()
{ {
printf("Environment variables:\n"); printf("Environment variables:\n");
printf("======================\n"); printf("======================\n");
printf(" ALWAYS_VALIDATE - Validate after each iteration instead of once after all iterations\n"); printf(" ALWAYS_VALIDATE - Validate after each iteration instead of once after all iterations\n");
printf(" BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n");
printf(" BLOCK_BYTES - Controls granularity of how work is divided across subExecutors\n"); printf(" BLOCK_BYTES - Controls granularity of how work is divided across subExecutors\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4\n"); printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4\n");
#if NIC_EXEC_ENABLED #if NIC_EXEC_ENABLED
...@@ -300,9 +314,12 @@ public: ...@@ -300,9 +314,12 @@ public:
#endif #endif
printf(" CU_MASK - CU mask for streams. Can specify ranges e.g '5,10-12,14'\n"); printf(" CU_MASK - CU mask for streams. Can specify ranges e.g '5,10-12,14'\n");
printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n"); printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n");
printf(" GFX_BLOCK_ORDER - How blocks for transfers are ordered. 0=sequential, 1=interleaved\n");
printf(" GFX_BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n");
printf(" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d\n", TransferBench::GetIntAttribute(ATR_GFX_MAX_UNROLL)); printf(" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d\n", TransferBench::GetIntAttribute(ATR_GFX_MAX_UNROLL));
printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays\n"); printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays\n");
printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n"); printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n");
printf(" GFX_WORD_SIZE - GFX kernel packed data size (4=DWORDx4, 2=DWORDx2, 1=DWORDx1)\n");
printf(" HIDE_ENV - Hide environment variable value listing\n"); printf(" HIDE_ENV - Hide environment variable value listing\n");
#if NIC_EXEC_ENABLED #if NIC_EXEC_ENABLED
printf(" IB_GID_INDEX - Required for RoCE NICs (default=-1/auto)\n"); printf(" IB_GID_INDEX - Required for RoCE NICs (default=-1/auto)\n");
...@@ -383,6 +400,8 @@ public: ...@@ -383,6 +400,8 @@ public:
"%s", (cuMask.size() ? GetCuMaskDesc().c_str() : "All")); "%s", (cuMask.size() ? GetCuMaskDesc().c_str() : "All"));
Print("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0, Print("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
"%s", (fillPattern.size() ? getenv("FILL_PATTERN") : TransferBench::GetStrAttribute(ATR_SRC_PREP_DESCRIPTION).c_str())); "%s", (fillPattern.size() ? getenv("FILL_PATTERN") : TransferBench::GetStrAttribute(ATR_SRC_PREP_DESCRIPTION).c_str()));
Print("GFX_BLOCK_ORDER", gfxBlockOrder,
"Thread block ordering: %s", gfxBlockOrder == 0 ? "Sequential" : "Interleaved");
Print("GFX_BLOCK_SIZE", gfxBlockSize, Print("GFX_BLOCK_SIZE", gfxBlockSize,
"Threadblock size of %d", gfxBlockSize); "Threadblock size of %d", gfxBlockSize);
Print("GFX_SINGLE_TEAM", gfxSingleTeam, Print("GFX_SINGLE_TEAM", gfxSingleTeam,
...@@ -397,6 +416,9 @@ public: ...@@ -397,6 +416,9 @@ public:
gfxWaveOrder == 3 ? "Wavefront,CU,Unroll" : gfxWaveOrder == 3 ? "Wavefront,CU,Unroll" :
gfxWaveOrder == 4 ? "CU,Unroll,Wavefront" : gfxWaveOrder == 4 ? "CU,Unroll,Wavefront" :
"CU,Wavefront,Unroll")); "CU,Wavefront,Unroll"));
Print("GFX_WORD_SIZE", gfxWordSize,
"Using GFX word size of %d (DWORDx%d)", gfxWordSize, gfxWordSize);
#if NIC_EXEC_ENABLED #if NIC_EXEC_ENABLED
Print("IP_ADDRESS_FAMILY", ipAddressFamily, Print("IP_ADDRESS_FAMILY", ipAddressFamily,
"IP address family is set to IPv%d", ipAddressFamily); "IP address family is set to IPv%d", ipAddressFamily);
...@@ -462,6 +484,31 @@ public: ...@@ -462,6 +484,31 @@ public:
return defaultValue; return defaultValue;
} }
// Parses an environment variable as a sorted, de-duplicated list of ints.
// Accepts comma-separated tokens, where each token is either a single value
// ("5") or an inclusive range ("3-7"). Example: "5,1-3,2" -> {1,2,3,5}.
// Returns defaultValue when the variable is unset; exits with an error on
// any unparseable token.
// FIX: the original ran strtok() directly on the pointer returned by
// getenv(), which writes NUL bytes into the process environment block and
// corrupts the variable for any subsequent reader. Tokenize a local copy.
static std::vector<int> GetEnvVarArray(std::string const& varname, std::vector<int> const& defaultValue)
{
  char const* envValue = getenv(varname.c_str());
  if (!envValue) return defaultValue;

  std::string buffer(envValue);   // local, writable copy for strtok
  std::set<int> values;           // set => sorted + de-duplicated output
  char* token = strtok(&buffer[0], ",");
  while (token) {
    int start, end;
    // Try "start-end" range first; fall back to a single integer
    if (sscanf(token, "%d-%d", &start, &end) == 2) {
      for (int i = start; i <= end; i++) values.insert(i);
    } else if (sscanf(token, "%d", &start) == 1) {
      values.insert(start);
    } else {
      printf("[ERROR] Unrecognized token [%s]\n", token);
      exit(1);
    }
    token = strtok(NULL, ",");
  }
  return std::vector<int>(values.begin(), values.end());
}
static std::string GetEnvVar(std::string const& varname, std::string const& defaultValue) static std::string GetEnvVar(std::string const& varname, std::string const& defaultValue)
{ {
if (getenv(varname.c_str())) if (getenv(varname.c_str()))
...@@ -524,6 +571,7 @@ public: ...@@ -524,6 +571,7 @@ public:
cfg.dma.useHipEvents = useHipEvents; cfg.dma.useHipEvents = useHipEvents;
cfg.dma.useHsaCopy = useHsaDma; cfg.dma.useHsaCopy = useHsaDma;
cfg.gfx.blockOrder = gfxBlockOrder;
cfg.gfx.blockSize = gfxBlockSize; cfg.gfx.blockSize = gfxBlockSize;
cfg.gfx.cuMask = cuMask; cfg.gfx.cuMask = cuMask;
cfg.gfx.prefXccTable = prefXccTable; cfg.gfx.prefXccTable = prefXccTable;
...@@ -532,6 +580,7 @@ public: ...@@ -532,6 +580,7 @@ public:
cfg.gfx.useMultiStream = !useSingleStream; cfg.gfx.useMultiStream = !useSingleStream;
cfg.gfx.useSingleTeam = gfxSingleTeam; cfg.gfx.useSingleTeam = gfxSingleTeam;
cfg.gfx.waveOrder = gfxWaveOrder; cfg.gfx.waveOrder = gfxWaveOrder;
cfg.gfx.wordSize = gfxWordSize;
cfg.nic.ibGidIndex = ibGidIndex; cfg.nic.ibGidIndex = ibGidIndex;
cfg.nic.ibPort = ibPort; cfg.nic.ibPort = ibPort;
......
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "EnvVars.hpp"
// Preset "a2a_n": benchmarks all-to-all GPU-to-GPU transfers executed over
// each source GPU's nearest NIC (RDMA), then prints a per-src/dst bandwidth
// matrix with row/column totals.
// @param ev                  Global env-var configuration (also used for printing)
// @param numBytesPerTransfer Bytes copied by each Transfer
// @param presetName          Accepted for the common preset signature; not read here
void AllToAllRdmaPreset(EnvVars& ev,
                        size_t const numBytesPerTransfer,
                        std::string const presetName)
{
  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);

  // Collect env vars for this preset
  int numGpus       = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus); // GPUs to include (default: all detected)
  int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 1);               // subExecutors (QPs) per Transfer
  int useFineGrain  = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);               // fine- vs coarse-grained GPU memory

  // Print off environment variables
  ev.DisplayEnvVars();
  if (!ev.hideEnv) {
    if (!ev.outputToCsv) printf("[AllToAll Network Related]\n");
    ev.Print("NUM_GPU_DEVICES", numGpus      , "Using %d GPUs", numGpus);
    ev.Print("NUM_QUEUE_PAIRS", numQueuePairs, "Using %d queue pairs for NIC transfers", numQueuePairs);
    ev.Print("USE_FINE_GRAIN" , useFineGrain , "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
    printf("\n");
  }

  // Validate env vars
  if (numGpus < 0 || numGpus > numDetectedGpus) {
    printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
    exit(1);
  }

  MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;

  // Build one Transfer per ordered (src,dst) GPU pair (including src==dst).
  // reIndex maps (src,dst) -> index into 'transfers' so results can be
  // looked up when printing the summary matrix below.
  std::map<std::pair<int, int>, int> reIndex;
  std::vector<Transfer> transfers;
  for (int i = 0; i < numGpus; i++) {
    for (int j = 0; j < numGpus; j++) {
      // Build Transfer and add it to list
      TransferBench::Transfer transfer;
      transfer.numBytes = numBytesPerTransfer;
      transfer.srcs.push_back({memType, i});
      transfer.dsts.push_back({memType, j});
      transfer.exeDevice = {EXE_NIC_NEAREST, i}; // executed by the NIC nearest to the src GPU
      transfer.exeSubIndex = j;
      transfer.numSubExecs = numQueuePairs;
      reIndex[std::make_pair(i,j)] = transfers.size();
      transfers.push_back(transfer);
    }
  }

  printf("GPU-RDMA All-To-All benchmark:\n");
  printf("==========================\n");
  printf("- Copying %lu bytes between all pairs of GPUs using %d QPs per Transfer (%lu Transfers)\n",
         numBytesPerTransfer, numQueuePairs, transfers.size());
  if (transfers.size() == 0) return;

  // Execute Transfers
  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
  TransferBench::TestResults results;
  if (!TransferBench::RunTransfers(cfg, transfers, results)) {
    for (auto const& err : results.errResults)
      printf("%s\n", err.errMsg.c_str());
    // NOTE(review): exits with status 0 even on failure — confirm intended
    exit(0);
  } else {
    PrintResults(ev, 1, transfers, results);
  }

  // Print results: src-by-dst matrix of per-Transfer average bandwidth,
  // plus a row total (STotal) and "Actual" = slowest Transfer in the row
  // times the number of Transfers (effective bandwidth if all pairs were
  // limited by the slowest link).
  char separator = (ev.outputToCsv ? ',' : ' ');
  printf("\nSummary: [%lu bytes per Transfer]\n", numBytesPerTransfer);
  printf("==========================================================\n");
  printf("SRC\\DST ");
  for (int dst = 0; dst < numGpus; dst++)
    printf("%cGPU %02d ", separator, dst);
  printf("   %cSTotal  %cActual\n", separator, separator);

  double totalBandwidthGpu  = 0.0;
  double minActualBandwidth = std::numeric_limits<double>::max();
  double maxActualBandwidth = 0.0;
  // Per-destination column totals; slot numGpus+1 accumulates the grand
  // total (NOTE(review): slot numGpus appears unused)
  std::vector<double> colTotalBandwidth(numGpus+2, 0.0);
  for (int src = 0; src < numGpus; src++) {
    double rowTotalBandwidth = 0;
    int    transferCount = 0;
    double minBandwidth = std::numeric_limits<double>::max();
    printf("GPU %02d", src);
    for (int dst = 0; dst < numGpus; dst++) {
      if (reIndex.count(std::make_pair(src, dst))) {
        int const transferIdx = reIndex[std::make_pair(src,dst)];
        TransferBench::TransferResult const& r = results.tfrResults[transferIdx];
        colTotalBandwidth[dst] += r.avgBandwidthGbPerSec;
        rowTotalBandwidth      += r.avgBandwidthGbPerSec;
        totalBandwidthGpu      += r.avgBandwidthGbPerSec;
        minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
        transferCount++;
        printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
      } else {
        printf("%c%8s ", separator, "N/A");
      }
    }
    double actualBandwidth = minBandwidth * transferCount;
    printf("   %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator, actualBandwidth);
    minActualBandwidth = std::min(minActualBandwidth, actualBandwidth);
    maxActualBandwidth = std::max(maxActualBandwidth, actualBandwidth);
    colTotalBandwidth[numGpus+1] += rowTotalBandwidth;
  }
  printf("\nRTotal");
  for (int dst = 0; dst < numGpus; dst++) {
    printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
  }
  printf("   %c%8.3f %c%8.3f %c%8.3f\n", separator, colTotalBandwidth[numGpus+1],
         separator, minActualBandwidth, separator, maxActualBandwidth);
  printf("\n");
  printf("Average   bandwidth (Tx Thread Timed): %8.3f GB/s\n", totalBandwidthGpu / transfers.size());
  printf("Aggregate bandwidth (Tx Thread Timed): %8.3f GB/s\n", totalBandwidthGpu);
  printf("Aggregate bandwidth (CPU Timed):       %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
  PrintErrors(results.errResults);
}
/*
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "EnvVars.hpp"
// Preset "a2asweep": runs GFX-executed all-to-all transfers repeatedly,
// sweeping over combinations of CU count (NUM_CUS) and GFX unroll factor
// (UNROLLS), and prints a table of min/max executor bandwidth per combo.
// @param ev                  Global env-var configuration; mutated here
//                            (useSingleStream forced to 1, gfxUnroll swept)
// @param numBytesPerTransfer Bytes copied by each Transfer
// @param presetName          Accepted for the common preset signature; not read here
void AllToAllSweepPreset(EnvVars& ev,
                         size_t const numBytesPerTransfer,
                         std::string const presetName)
{
  // Transfer modes selectable via A2A_MODE (3 = custom "numSrcs:numDsts")
  enum
  {
    A2A_COPY       = 0,
    A2A_READ_ONLY  = 1,
    A2A_WRITE_ONLY = 2,
    A2A_CUSTOM     = 3,
  };
  char a2aModeStr[4][20] = {"Copy", "Read-Only", "Write-Only", "Custom"};

  // Force single-stream mode for all-to-all benchmark
  ev.useSingleStream = 1;

  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);

  // Collect env vars for this preset
  int a2aDirect     = EnvVars::GetEnvVar("A2A_DIRECT"     , 1); // only directly-linked pairs (1 hop)
  int a2aLocal      = EnvVars::GetEnvVar("A2A_LOCAL"      , 0); // include src==dst transfers
  int numGpus       = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
  int useFineGrain  = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);
  int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0); // executor on dst instead of src
  int useSpray      = EnvVars::GetEnvVar("USE_SPRAY", 0);       // one Transfer spanning all targets per GPU
  int verbose       = EnvVars::GetEnvVar("VERBOSE", 0);
  std::vector<int> unrollList = EnvVars::GetEnvVarArray("UNROLLS", {1,2,3,4,6,8});
  std::vector<int> numCusList = EnvVars::GetEnvVarArray("NUM_CUS", {4,8,12,16,24,32});

  // A2A_MODE may be 0,1,2 or else custom numSrcs:numDsts
  int numSrcs, numDsts;
  int a2aMode = 0;
  if (getenv("A2A_MODE") && sscanf(getenv("A2A_MODE"), "%d:%d", &numSrcs, &numDsts) == 2) {
    a2aMode = A2A_CUSTOM;
  } else {
    a2aMode = EnvVars::GetEnvVar("A2A_MODE", 0);
    if (a2aMode < 0 || a2aMode > 2) {
      printf("[ERROR] a2aMode must be between 0 and 2, or else numSrcs:numDsts\n");
      exit(1);
    }
    numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1);
    numDsts = (a2aMode == A2A_READ_ONLY ? 0 : 1);
  }

  // Print off environment variables
  ev.DisplayEnvVars();
  if (!ev.hideEnv) {
    if (!ev.outputToCsv) printf("[AllToAll Related]\n");
    ev.Print("A2A_DIRECT"     , a2aDirect     , a2aDirect ? "Only using direct links" : "Full all-to-all");
    ev.Print("A2A_LOCAL"      , a2aLocal      , "%s local transfers", a2aLocal ? "Include" : "Exclude");
    ev.Print("A2A_MODE"       , (a2aMode == A2A_CUSTOM) ? std::to_string(numSrcs) + ":" + std::to_string(numDsts) : std::to_string(a2aMode),
             (a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " +
                                        std::to_string(numDsts) + " write(s)").c_str(): a2aModeStr[a2aMode]);
    ev.Print("NUM_CUS"        , numCusList.size(), EnvVars::ToStr(numCusList).c_str());
    ev.Print("NUM_GPU_DEVICES", numGpus       , "Using %d GPUs", numGpus);
    ev.Print("UNROLLS"        , unrollList.size(), EnvVars::ToStr(unrollList).c_str());
    ev.Print("USE_FINE_GRAIN" , useFineGrain  , "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
    ev.Print("USE_REMOTE_READ", useRemoteRead , "Using %s as executor", useRemoteRead ? "DST" : "SRC");
    ev.Print("USE_SPRAY"      , useSpray      , "%s per CU", useSpray ? "All targets" : "One target");
    ev.Print("VERBOSE"        , verbose       , verbose ? "Display test results" : "Display summary only");
    printf("\n");
  }

  // Validate env vars
  if (numGpus < 0 || numGpus > numDetectedGpus) {
    printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
    exit(1);
  }
  if (useSpray && numDsts > 1) {
    printf("[ERROR] Cannot use USE_SPRAY with multiple destination buffers\n");
    exit(1);
  }

  // Collect the number of GPU devices to use
  MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
  ExeType exeType = EXE_GPU_GFX;

  std::vector<Transfer> transfers;
  int targetCount = 0; // number of peer GPUs each executor actually targets
  if (!useSpray) {
    // Each CU will work on just one target
    for (int i = 0; i < numGpus; i++) {
      targetCount = 0;
      for (int j = 0; j < numGpus; j++) {
        // Check whether or not to execute this pair
        if (i == j) {
          if (!a2aLocal) continue;
        } else if (a2aDirect) {
#if !defined(__NVCC__)
          uint32_t linkType, hopCount;
          HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
          if (hopCount != 1) continue; // skip pairs without a direct link
#endif
        }

        // Build Transfer and add it to list
        TransferBench::Transfer transfer;
        targetCount++;
        transfer.numBytes = numBytesPerTransfer;
        for (int x = 0; x < numSrcs; x++) transfer.srcs.push_back({memType, i});
        // When using multiple destinations, the additional destinations are "local"
        if (numDsts) transfer.dsts.push_back({memType, j});
        for (int x = 1; x < numDsts; x++) transfer.dsts.push_back({memType, i});
        transfer.exeDevice = {exeType, (useRemoteRead ? j : i)};
        transfer.exeSubIndex = -1;
        transfers.push_back(transfer);
      }
    }
  } else {
    // Each CU will work on all targets (one Transfer per source GPU)
    for (int i = 0; i < numGpus; i++) {
      TransferBench::Transfer transfer;
      transfer.numBytes = numBytesPerTransfer;
      transfer.exeDevice = {exeType, i};
      transfer.exeSubIndex = -1;
      targetCount = 0;
      for (int j = 0; j < numGpus; j++) {
        // Check whether or not to transfer to this GPU
        if (i == j) {
          if (!a2aLocal) continue;
        } else if (a2aDirect) {
#if !defined(__NVCC__)
          uint32_t linkType, hopCount;
          HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
          if (hopCount != 1) continue;
#endif
        }
        targetCount++;
        for (int x = 0; x < numSrcs; x++) transfer.srcs.push_back({memType, useRemoteRead ? j : i});
        if (numDsts) transfer.dsts.push_back({memType, j});
        for (int x = 1; x < numDsts; x++) transfer.dsts.push_back({memType, i});
      }
      transfers.push_back(transfer);
    }
  }

  printf("GPU-GFX All-To-All Sweep benchmark:\n");
  printf("==========================\n");
  printf("- Copying %lu bytes between %s pairs of GPUs\n", numBytesPerTransfer, a2aDirect ? "directly connected" : "all");
  if (transfers.size() == 0) {
    // FIX: was "[WARN}" (mismatched bracket in the warning tag)
    printf("[WARN] No transfers requested. Try adjusting A2A_DIRECT or A2A_LOCAL\n");
    return;
  }

  // Execute Transfers
  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();

  // Run tests, keyed by (numCus, unroll) for the optional verbose dump below
  std::map<std::pair<int, int>, TransferBench::TestResults> results;

  // Display summary
  printf("#CUs\\Unroll");
  for (int u : unrollList) {
    printf("   %d(Min)  ", u);
    printf("   %d(Max)  ", u);
  }
  printf("\n");
  for (int c : numCusList) {
    printf("   %5d   ", c); fflush(stdout);
    for (int u : unrollList) {
      ev.gfxUnroll = cfg.gfx.unrollFactor = u;
      for (auto& transfer : transfers)
        transfer.numSubExecs = useSpray ? (c * targetCount) : c;
      double minBandwidth = std::numeric_limits<double>::max();
      // FIX: was numeric_limits<double>::min(), which is the smallest
      // POSITIVE double, not a valid seed for a max accumulator.
      // Bandwidths are non-negative, so 0.0 is the correct identity.
      double maxBandwidth = 0.0;
      TransferBench::TestResults result;
      if (TransferBench::RunTransfers(cfg, transfers, result)) {
        for (auto const& exeResult : result.exeResults) {
          minBandwidth = std::min(minBandwidth, exeResult.second.avgBandwidthGbPerSec);
          maxBandwidth = std::max(maxBandwidth, exeResult.second.avgBandwidthGbPerSec);
        }
        if (useSpray) {
          // Each executor moves data to targetCount peers in one Transfer
          minBandwidth *= targetCount;
          maxBandwidth *= targetCount;
        }
        results[std::make_pair(c,u)] = result;
      } else {
        minBandwidth = 0.0;
      }
      printf(" %7.2f %7.2f ", minBandwidth, maxBandwidth); fflush(stdout);
    }
    printf("\n"); fflush(stdout);
  }

  // Optionally dump the full per-test results for every (CU, unroll) combo
  if (verbose) {
    int testNum = 0;
    for (int c : numCusList) {
      for (int u : unrollList) {
        printf("CUs: %d Unroll %d\n", c, u);
        PrintResults(ev, ++testNum, transfers, results[std::make_pair(c,u)]);
      }
    }
  }
}
...@@ -24,6 +24,8 @@ THE SOFTWARE. ...@@ -24,6 +24,8 @@ THE SOFTWARE.
// Included after EnvVars and Executors // Included after EnvVars and Executors
#include "AllToAll.hpp" #include "AllToAll.hpp"
#include "AllToAllN.hpp"
#include "AllToAllSweep.hpp"
#include "HealthCheck.hpp" #include "HealthCheck.hpp"
#include "OneToAll.hpp" #include "OneToAll.hpp"
#include "PeerToPeer.hpp" #include "PeerToPeer.hpp"
...@@ -39,9 +41,11 @@ typedef void (*PresetFunc)(EnvVars& ev, ...@@ -39,9 +41,11 @@ typedef void (*PresetFunc)(EnvVars& ev,
std::map<std::string, std::pair<PresetFunc, std::string>> presetFuncMap = std::map<std::string, std::pair<PresetFunc, std::string>> presetFuncMap =
{ {
{"a2a", {AllToAllPreset, "Tests parallel transfers between all pairs of GPU devices"}}, {"a2a", {AllToAllPreset, "Tests parallel transfers between all pairs of GPU devices"}},
{"healthcheck", {HealthCheckPreset,"Simple bandwidth health check (MI300X series only)"}}, {"a2a_n", {AllToAllRdmaPreset, "Tests parallel transfers between all pairs of GPU devices using Nearest NIC RDMA transfers"}},
{"a2asweep", {AllToAllSweepPreset, "Test GFX-based all-to-all transfers swept across different CU and GFX unroll counts"}},
{"healthcheck", {HealthCheckPreset, "Simple bandwidth health check (MI300X series only)"}},
{"one2all", {OneToAllPreset, "Test all subsets of parallel transfers from one GPU to all others"}}, {"one2all", {OneToAllPreset, "Test all subsets of parallel transfers from one GPU to all others"}},
{"p2p" , {PeerToPeerPreset, "Peer-to-peer device memory bandwidth test"}}, {"p2p" , {PeerToPeerPreset, " Peer-to-peer device memory bandwidth test"}},
{"rsweep", {SweepPreset, "Randomly sweep through sets of Transfers"}}, {"rsweep", {SweepPreset, "Randomly sweep through sets of Transfers"}},
{"scaling", {ScalingPreset, "Run scaling test from one GPU to other devices"}}, {"scaling", {ScalingPreset, "Run scaling test from one GPU to other devices"}},
{"schmoo", {SchmooPreset, "Scaling tests for local/remote read/write/copy"}}, {"schmoo", {SchmooPreset, "Scaling tests for local/remote read/write/copy"}},
......
...@@ -41,9 +41,9 @@ static int RemappedCpuIndex(int origIdx) ...@@ -41,9 +41,9 @@ static int RemappedCpuIndex(int origIdx)
static void PrintNicToGPUTopo(bool outputToCsv) static void PrintNicToGPUTopo(bool outputToCsv)
{ {
#ifdef NIC_EXEC_ENABLED #ifdef NIC_EXEC_ENABLED
printf(" NIC | Device Name | Active | PCIe Bus ID | NUMA | Closest GPU(s)\n"); printf(" NIC | Device Name | Active | PCIe Bus ID | NUMA | Closest GPU(s) | GID Index | GID Descriptor\n");
if(!outputToCsv) if(!outputToCsv)
printf("-----+-------------+--------+--------------+------+---------------\n"); printf("-----+-------------+--------+--------------+------+----------------+-----------+-------------------\n");
int numGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); int numGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
auto const& ibvDeviceList = GetIbvDeviceList(); auto const& ibvDeviceList = GetIbvDeviceList();
...@@ -57,12 +57,15 @@ static void PrintNicToGPUTopo(bool outputToCsv) ...@@ -57,12 +57,15 @@ static void PrintNicToGPUTopo(bool outputToCsv)
} }
} }
printf(" %-3d | %-11s | %-6s | %-12s | %-4d | %-20s\n", printf(" %-3d | %-11s | %-6s | %-12s | %-4d | %-14s | %-9s | %-20s\n",
i, ibvDeviceList[i].name.c_str(), i, ibvDeviceList[i].name.c_str(),
ibvDeviceList[i].hasActivePort ? "Yes" : "No", ibvDeviceList[i].hasActivePort ? "Yes" : "No",
ibvDeviceList[i].busId.c_str(), ibvDeviceList[i].busId.c_str(),
ibvDeviceList[i].numaNode, ibvDeviceList[i].numaNode,
closestGpusStr.c_str()); closestGpusStr.c_str(),
ibvDeviceList[i].isRoce && ibvDeviceList[i].hasActivePort? std::to_string(ibvDeviceList[i].gidIndex).c_str() : "N/A",
ibvDeviceList[i].isRoce && ibvDeviceList[i].hasActivePort? ibvDeviceList[i].gidDescriptor.c_str() : "N/A"
);
} }
printf("\n"); printf("\n");
#endif #endif
......
...@@ -22,11 +22,13 @@ THE SOFTWARE. ...@@ -22,11 +22,13 @@ THE SOFTWARE.
/// @cond /// @cond
#pragma once #pragma once
#include <algorithm>
#include <cstring> #include <cstring>
#include <future> #include <future>
#include <map> #include <map>
#include <numa.h> // If not found, try installing libnuma-dev (e.g apt-get install libnuma-dev) #include <numa.h> // If not found, try installing libnuma-dev (e.g apt-get install libnuma-dev)
#include <numaif.h> #include <numaif.h>
#include <random>
#include <set> #include <set>
#include <sstream> #include <sstream>
#include <stdarg.h> #include <stdarg.h>
...@@ -64,7 +66,7 @@ namespace TransferBench ...@@ -64,7 +66,7 @@ namespace TransferBench
using std::set; using std::set;
using std::vector; using std::vector;
constexpr char VERSION[] = "1.60"; constexpr char VERSION[] = "1.61";
/** /**
* Enumeration of supported Executor types * Enumeration of supported Executor types
...@@ -173,6 +175,7 @@ namespace TransferBench ...@@ -173,6 +175,7 @@ namespace TransferBench
*/ */
struct GfxOptions struct GfxOptions
{ {
int blockOrder = 0; ///< Determines how threadblocks are ordered (0=sequential, 1=interleaved, 2=random)
int blockSize = 256; ///< Size of each threadblock (must be multiple of 64) int blockSize = 256; ///< Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask = {}; ///< Bit-vector representing the CU mask vector<uint32_t> cuMask = {}; ///< Bit-vector representing the CU mask
vector<vector<int>> prefXccTable = {}; ///< 2D table with preferred XCD to use for a specific [src][dst] GPU device vector<vector<int>> prefXccTable = {}; ///< 2D table with preferred XCD to use for a specific [src][dst] GPU device
...@@ -181,6 +184,7 @@ namespace TransferBench ...@@ -181,6 +184,7 @@ namespace TransferBench
int useMultiStream = 0; ///< Use multiple streams for GFX int useMultiStream = 0; ///< Use multiple streams for GFX
int useSingleTeam = 0; ///< Team all subExecutors across the data array int useSingleTeam = 0; ///< Team all subExecutors across the data array
int waveOrder = 0; ///< GFX-kernel wavefront ordering int waveOrder = 0; ///< GFX-kernel wavefront ordering
int wordSize = 4; ///< GFX-kernel packed data size (4=dwordx4, 2=dwordx2, 1=dwordx1)
}; };
/** /**
...@@ -233,6 +237,31 @@ namespace TransferBench ...@@ -233,6 +237,31 @@ namespace TransferBench
ERR_FATAL = 2, ///< Fatal error - results are invalid ERR_FATAL = 2, ///< Fatal error - results are invalid
}; };
/**
* Enumeration of GID priority
*
* @note These are the GID types ordered in priority from lowest (0) to highest
*/
enum GidPriority
{
UNKNOWN = -1, ///< Default
ROCEV1_LINK_LOCAL = 0, ///< RoCEv1 Link-local
ROCEV2_LINK_LOCAL = 1, ///< RoCEv2 Link-local fe80::/10
ROCEV1_IPV6 = 2, ///< RoCEv1 IPv6
ROCEV2_IPV6 = 3, ///< RoCEv2 IPv6
ROCEV1_IPV4 = 4, ///< RoCEv1 IPv4-mapped IPv6
ROCEV2_IPV4 = 5, ///< RoCEv2 IPv4-mapped IPv6 ::ffff:192.168.x.x
};
const char* GidPriorityStr[] = {
"RoCEv1 Link-local",
"RoCEv2 Link-local",
"RoCEv1 IPv6",
"RoCEv2 IPv6",
"RoCEv1 IPv4-mapped IPv6",
"RoCEv2 IPv4-mapped IPv6"
};
/** /**
* ErrResult consists of error type and error message * ErrResult consists of error type and error message
*/ */
...@@ -463,6 +492,14 @@ namespace TransferBench ...@@ -463,6 +492,14 @@ namespace TransferBench
#define hipStreamDestroy cudaStreamDestroy #define hipStreamDestroy cudaStreamDestroy
#define hipStreamSynchronize cudaStreamSynchronize #define hipStreamSynchronize cudaStreamSynchronize
// Define float2 addition operator for NVIDIA platform
__device__ inline float2& operator +=(float2& a, const float2& b)
{
a.x += b.x;
a.y += b.y;
return a;
}
// Define float4 addition operator for NVIDIA platform // Define float4 addition operator for NVIDIA platform
__device__ inline float4& operator +=(float4& a, const float4& b) __device__ inline float4& operator +=(float4& a, const float4& b)
{ {
...@@ -924,6 +961,13 @@ namespace { ...@@ -924,6 +961,13 @@ namespace {
errors.push_back({ERR_FATAL, "[data.byteOffset] must be positive multiple of %lu", sizeof(float)}); errors.push_back({ERR_FATAL, "[data.byteOffset] must be positive multiple of %lu", sizeof(float)});
// Check GFX options // Check GFX options
if (cfg.gfx.blockOrder < 0 || cfg.gfx.blockOrder > 2)
errors.push_back({ERR_FATAL,
"[gfx.blockOrder] must be 0 for sequential, 1 for interleaved, or 2 for random"});
if (cfg.gfx.useMultiStream && cfg.gfx.blockOrder > 0)
errors.push_back({ERR_WARN, "[gfx.blockOrder] will be ignored when running in multi-stream mode"});
int gfxMaxBlockSize = GetIntAttribute(ATR_GFX_MAX_BLOCKSIZE); int gfxMaxBlockSize = GetIntAttribute(ATR_GFX_MAX_BLOCKSIZE);
if (cfg.gfx.blockSize < 0 || cfg.gfx.blockSize % 64 || cfg.gfx.blockSize > gfxMaxBlockSize) if (cfg.gfx.blockSize < 0 || cfg.gfx.blockSize % 64 || cfg.gfx.blockSize > gfxMaxBlockSize)
errors.push_back({ERR_FATAL, errors.push_back({ERR_FATAL,
...@@ -939,6 +983,9 @@ namespace { ...@@ -939,6 +983,9 @@ namespace {
errors.push_back({ERR_FATAL, errors.push_back({ERR_FATAL,
"[gfx.waveOrder] must be non-negative and less than 6"}); "[gfx.waveOrder] must be non-negative and less than 6"});
if (!(cfg.gfx.wordSize == 1 || cfg.gfx.wordSize == 2 || cfg.gfx.wordSize == 4))
errors.push_back({ERR_FATAL, "[gfx.wordSize] must be either 1, 2 or 4"});
int numGpus = GetNumExecutors(EXE_GPU_GFX); int numGpus = GetNumExecutors(EXE_GPU_GFX);
int numXccs = GetNumExecutorSubIndices({EXE_GPU_GFX, 0}); int numXccs = GetNumExecutorSubIndices({EXE_GPU_GFX, 0});
vector<vector<int>> const& table = cfg.gfx.prefXccTable; vector<vector<int>> const& table = cfg.gfx.prefXccTable;
...@@ -1395,12 +1442,122 @@ namespace { ...@@ -1395,12 +1442,122 @@ namespace {
std::string busId; std::string busId;
bool hasActivePort; bool hasActivePort;
int numaNode; int numaNode;
int gidIndex;
std::string gidDescriptor;
bool isRoce;
}; };
#endif #endif
#ifdef NIC_EXEC_ENABLED #ifdef NIC_EXEC_ENABLED
// Function to collect information about IBV devices // Function to collect information about IBV devices
//======================================================================================== //========================================================================================
static bool IsConfiguredGid(union ibv_gid const& gid)
{
const struct in6_addr *a = (struct in6_addr *) gid.raw;
int trailer = (a->s6_addr32[1] | a->s6_addr32[2] | a->s6_addr32[3]);
if (((a->s6_addr32[0] | trailer) == 0UL) ||
((a->s6_addr32[0] == htonl(0xfe800000)) && (trailer == 0UL))) {
return false;
}
return true;
}
static bool LinkLocalGid(union ibv_gid const& gid)
{
const struct in6_addr *a = (struct in6_addr *) gid.raw;
if (a->s6_addr32[0] == htonl(0xfe800000) && a->s6_addr32[1] == 0UL) {
return true;
}
return false;
}
static ErrResult GetRoceVersionNumber(struct ibv_context* const& context,
int const& portNum,
int const& gidIndex,
int& version)
{
char const* deviceName = ibv_get_device_name(context->device);
char gidRoceVerStr[16] = {};
char roceTypePath[PATH_MAX] = {};
sprintf(roceTypePath, "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d",
deviceName, portNum, gidIndex);
int fd = open(roceTypePath, O_RDONLY);
if (fd == -1)
return {ERR_FATAL, "Failed while opening RoCE file path (%s)", roceTypePath};
int ret = read(fd, gidRoceVerStr, 15);
close(fd);
if (ret == -1)
return {ERR_FATAL, "Failed while reading RoCE version"};
if (strlen(gidRoceVerStr)) {
if (strncmp(gidRoceVerStr, "IB/RoCE v1", strlen("IB/RoCE v1")) == 0
|| strncmp(gidRoceVerStr, "RoCE v1", strlen("RoCE v1")) == 0) {
version = 1;
}
else if (strncmp(gidRoceVerStr, "RoCE v2", strlen("RoCE v2")) == 0) {
version = 2;
}
}
return ERR_NONE;
}
static bool IsIPv4MappedIPv6(const union ibv_gid &gid)
{
// look for ::ffff:x.x.x.x format
// From Broadcom documentation
// https://techdocs.broadcom.com/us/en/storage-and-ethernet-connectivity/ethernet-nic-controllers/bcm957xxx/adapters/frequently-asked-questions1.html
// "The IPv4 address is really an IPv4 address mapped into the IPv6 address space.
// This can be identified by 80 “0” bits, followed by 16 “1” bits (“FFFF” in hexadecimal)
// followed by the original 32-bit IPv4 address."
return (gid.global.subnet_prefix == 0 &&
gid.raw[8] == 0 &&
gid.raw[9] == 0 &&
gid.raw[10] == 0xff &&
gid.raw[11] == 0xff);
}
static ErrResult GetGidIndex(struct ibv_context* context,
int const& gidTblLen,
int const& portNum,
std::pair<int, std::string>& gidInfo)
{
if(gidInfo.first >= 0) return ERR_NONE; // honor user choice
union ibv_gid gid;
GidPriority highestPriority = GidPriority::UNKNOWN;
int gidIndex = -1;
for (int i = 0; i < gidTblLen; ++i) {
IBV_CALL(ibv_query_gid, context, portNum, i, &gid);
if (!IsConfiguredGid(gid)) continue;
int gidCurrRoceVersion;
if(GetRoceVersionNumber(context, portNum, i, gidCurrRoceVersion).errType != ERR_NONE) continue;
GidPriority currPriority;
if (IsIPv4MappedIPv6(gid)) {
currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_IPV4 : GidPriority::ROCEV1_IPV4;
} else if (!LinkLocalGid(gid)) {
currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_IPV6 : GidPriority::ROCEV1_IPV6;
} else {
currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_LINK_LOCAL : GidPriority::ROCEV1_LINK_LOCAL;
}
if(currPriority > highestPriority) {
highestPriority = currPriority;
gidIndex = i;
}
}
if (highestPriority == GidPriority::UNKNOWN) {
gidInfo.first = -1;
return {ERR_FATAL, "Failed to auto-detect a valid GID index. Try setting it manually through IB_GID_INDEX"};
}
gidInfo.first = gidIndex;
gidInfo.second = GidPriorityStr[highestPriority];
return ERR_NONE;
}
static vector<IbvDevice>& GetIbvDeviceList() static vector<IbvDevice>& GetIbvDeviceList()
{ {
static bool isInitialized = false; static bool isInitialized = false;
...@@ -1425,14 +1582,27 @@ namespace { ...@@ -1425,14 +1582,27 @@ namespace {
if (context) { if (context) {
struct ibv_device_attr deviceAttr; struct ibv_device_attr deviceAttr;
if (!ibv_query_device(context, &deviceAttr)) { if (!ibv_query_device(context, &deviceAttr)) {
int activePort;
ibvDevice.gidIndex = -1;
for (int port = 1; port <= deviceAttr.phys_port_cnt; ++port) { for (int port = 1; port <= deviceAttr.phys_port_cnt; ++port) {
struct ibv_port_attr portAttr; struct ibv_port_attr portAttr;
if (ibv_query_port(context, port, &portAttr)) continue; if (ibv_query_port(context, port, &portAttr)) continue;
if (portAttr.state == IBV_PORT_ACTIVE) if (portAttr.state == IBV_PORT_ACTIVE) {
activePort = port;
ibvDevice.hasActivePort = true; ibvDevice.hasActivePort = true;
if(portAttr.link_layer == IBV_LINK_LAYER_ETHERNET) {
ibvDevice.isRoce = true;
std::pair<int, std::string> gidInfo (-1, "");
auto res = GetGidIndex(context, portAttr.gid_tbl_len, activePort, gidInfo);
if (res.errType == ERR_NONE) {
ibvDevice.gidIndex = gidInfo.first;
ibvDevice.gidDescriptor = gidInfo.second;
}
}
break; break;
} }
} }
}
ibv_close_device(context); ibv_close_device(context);
} }
} }
...@@ -1781,164 +1951,6 @@ namespace { ...@@ -1781,164 +1951,6 @@ namespace {
return ERR_NONE; return ERR_NONE;
} }
static bool IsConfiguredGid(union ibv_gid* gid)
{
const struct in6_addr *a = (struct in6_addr *)gid->raw;
int trailer = (a->s6_addr32[1] | a->s6_addr32[2] | a->s6_addr32[3]);
if (((a->s6_addr32[0] | trailer) == 0UL) ||
((a->s6_addr32[0] == htonl(0xfe800000)) && (trailer == 0UL))) {
return false;
}
return true;
}
static bool LinkLocalGid(union ibv_gid* gid)
{
const struct in6_addr *a = (struct in6_addr *)gid->raw;
if (a->s6_addr32[0] == htonl(0xfe800000) && a->s6_addr32[1] == 0UL) {
return true;
}
return false;
}
static bool IsValidGid(union ibv_gid* gid)
{
return (IsConfiguredGid(gid) && !LinkLocalGid(gid));
}
static sa_family_t GetGidAddressFamily(union ibv_gid* gid)
{
const struct in6_addr *a = (struct in6_addr *)gid->raw;
bool isIpV4Mapped = ((a->s6_addr32[0] | a->s6_addr32[1]) |
(a->s6_addr32[2] ^ htonl(0x0000ffff))) == 0UL;
bool isIpV4MappedMulticast = (a->s6_addr32[0] == htonl(0xff0e0000) &&
((a->s6_addr32[1] |
(a->s6_addr32[2] ^ htonl(0x0000ffff))) == 0UL));
return (isIpV4Mapped || isIpV4MappedMulticast) ? AF_INET : AF_INET6;
}
static bool MatchGidAddressFamily(sa_family_t const& af,
void* prefix,
int prefixLen,
union ibv_gid* gid)
{
struct in_addr *base = NULL;
struct in6_addr *base6 = NULL;
struct in6_addr *addr6 = NULL;;
if (af == AF_INET) {
base = (struct in_addr *)prefix;
} else {
base6 = (struct in6_addr *)prefix;
}
addr6 = (struct in6_addr *)gid->raw;
#define NETMASK(bits) (htonl(0xffffffff ^ ((1 << (32 - bits)) - 1)))
int i = 0;
while (prefixLen > 0 && i < 4) {
if (af == AF_INET) {
int mask = NETMASK(prefixLen);
if ((base->s_addr & mask) ^ (addr6->s6_addr32[3] & mask))
break;
prefixLen = 0;
break;
} else {
if (prefixLen >= 32) {
if (base6->s6_addr32[i] ^ addr6->s6_addr32[i])
break;
prefixLen -= 32;
++i;
} else {
int mask = NETMASK(prefixLen);
if ((base6->s6_addr32[i] & mask) ^ (addr6->s6_addr32[i] & mask))
break;
prefixLen = 0;
}
}
}
return (prefixLen == 0) ? true : false;
#undef NETMASK
}
static ErrResult GetRoceVersionNumber(struct ibv_context* const& context,
int const& portNum,
int const& gidIndex,
int* version)
{
char const* deviceName = ibv_get_device_name(context->device);
char gidRoceVerStr[16] = {};
char roceTypePath[PATH_MAX] = {};
sprintf(roceTypePath, "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d",
deviceName, portNum, gidIndex);
int fd = open(roceTypePath, O_RDONLY);
if (fd == -1)
return {ERR_FATAL, "Failed while opening RoCE file path (%s)", roceTypePath};
int ret = read(fd, gidRoceVerStr, 15);
close(fd);
if (ret == -1)
return {ERR_FATAL, "Failed while reading RoCE version"};
if (strlen(gidRoceVerStr)) {
if (strncmp(gidRoceVerStr, "IB/RoCE v1", strlen("IB/RoCE v1")) == 0
|| strncmp(gidRoceVerStr, "RoCE v1", strlen("RoCE v1")) == 0) {
*version = 1;
}
else if (strncmp(gidRoceVerStr, "RoCE v2", strlen("RoCE v2")) == 0) {
*version = 2;
}
}
return ERR_NONE;
}
static ErrResult GetGidIndex(ConfigOptions const& cfg,
struct ibv_context* context,
int const& gidTblLen,
int& gidIndex)
{
// Use GID index if user specified
if (gidIndex >= 0) return ERR_NONE;
// Try to find the best GID index
int port = cfg.nic.ibPort;
sa_family_t targetAddrFam = (cfg.nic.ipAddressFamily == 6)? AF_INET6 : AF_INET;
int targetRoCEVer = cfg.nic.roceVersion;
// Initially assume gidIndex = 0
int gidIndexCurr = 0;
union ibv_gid gidCurr;
IBV_CALL(ibv_query_gid, context, port, gidIndexCurr, &gidCurr);
sa_family_t gidCurrFam = GetGidAddressFamily(&gidCurr);
bool gidCurrIsValid = IsValidGid(&gidCurr);
int gidCurrRoceVersion;
ERR_CHECK(GetRoceVersionNumber(context, port, gidIndexCurr, &gidCurrRoceVersion));
// Loop over GID table to find the best match
for (int gidIndexTest = 1; gidIndexTest < gidTblLen; ++gidIndexTest) {
union ibv_gid gidTest;
IBV_CALL(ibv_query_gid, context, cfg.nic.ibPort, gidIndexTest, &gidTest);
if (!IsValidGid(&gidTest)) continue;
sa_family_t gidTestFam = GetGidAddressFamily(&gidTest);
bool gidTestMatchSubnet = MatchGidAddressFamily(targetAddrFam, NULL, 0, &gidTest);
int gidTestRoceVersion;
ERR_CHECK(GetRoceVersionNumber(context, port, gidIndexTest, &gidTestRoceVersion));
if (!gidCurrIsValid ||
(gidTestFam == targetAddrFam && gidTestMatchSubnet &&
(gidCurrFam != targetAddrFam || gidTestRoceVersion == targetRoCEVer))) {
// Switch to better match
gidIndexCurr = gidIndexTest;
gidCurrFam = gidTestFam;
gidCurrIsValid = true;
gidCurrRoceVersion = gidTestRoceVersion;
}
}
gidIndex = gidIndexCurr;
return ERR_NONE;
}
static ErrResult PrepareNicTransferResources(ConfigOptions const& cfg, static ErrResult PrepareNicTransferResources(ConfigOptions const& cfg,
ExeDevice const& srcExeDevice, ExeDevice const& srcExeDevice,
Transfer const& t, Transfer const& t,
...@@ -2012,8 +2024,12 @@ namespace { ...@@ -2012,8 +2024,12 @@ namespace {
bool isRoCE = (rss.srcPortAttr.link_layer == IBV_LINK_LAYER_ETHERNET); bool isRoCE = (rss.srcPortAttr.link_layer == IBV_LINK_LAYER_ETHERNET);
if (isRoCE) { if (isRoCE) {
// Try to auto-detect the GID index // Try to auto-detect the GID index
ERR_CHECK(GetGidIndex(cfg, rss.srcContext, rss.srcPortAttr.gid_tbl_len, srcGidIndex)); std::pair<int, std::string> srcGidInfo (srcGidIndex, "");
ERR_CHECK(GetGidIndex(cfg, rss.dstContext, rss.dstPortAttr.gid_tbl_len, dstGidIndex)); std::pair<int, std::string> dstGidInfo (dstGidIndex, "");
ERR_CHECK(GetGidIndex(rss.srcContext, rss.srcPortAttr.gid_tbl_len, cfg.nic.ibPort, srcGidInfo));
ERR_CHECK(GetGidIndex(rss.dstContext, rss.dstPortAttr.gid_tbl_len, cfg.nic.ibPort, dstGidInfo));
srcGidIndex = srcGidInfo.first;
dstGidIndex = dstGidInfo.first;
IBV_CALL(ibv_query_gid, rss.srcContext, port, srcGidIndex, &rss.srcGid); IBV_CALL(ibv_query_gid, rss.srcContext, port, srcGidIndex, &rss.srcGid);
IBV_CALL(ibv_query_gid, rss.dstContext, port, dstGidIndex, &rss.dstGid); IBV_CALL(ibv_query_gid, rss.dstContext, port, dstGidIndex, &rss.dstGid);
} }
...@@ -2396,6 +2412,8 @@ namespace { ...@@ -2396,6 +2412,8 @@ namespace {
exeDevice.exeIndex)); exeDevice.exeIndex));
#endif #endif
int transferOffset = 0; int transferOffset = 0;
if (cfg.gfx.useMultiStream || cfg.gfx.blockOrder == 0) {
// Threadblocks are ordered sequentially one transfer at a time
for (auto& rss : exeInfo.resources) { for (auto& rss : exeInfo.resources) {
Transfer const& t = transfers[rss.transferIdx]; Transfer const& t = transfers[rss.transferIdx];
rss.subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset; rss.subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset;
...@@ -2405,6 +2423,38 @@ namespace { ...@@ -2405,6 +2423,38 @@ namespace {
transferOffset++; transferOffset++;
} }
} }
} else if (cfg.gfx.blockOrder == 1) {
// Interleave threadblocks of different Transfers
for (int subExecIdx = 0; exeInfo.subExecParamCpu.size() < exeInfo.totalSubExecs; ++subExecIdx) {
for (auto& rss : exeInfo.resources) {
Transfer const& t = transfers[rss.transferIdx];
if (subExecIdx < t.numSubExecs) {
rss.subExecIdx.push_back(exeInfo.subExecParamCpu.size());
exeInfo.subExecParamCpu.push_back(rss.subExecParamCpu[subExecIdx]);
}
}
}
} else if (cfg.gfx.blockOrder == 2) {
// Build randomized threadblock list
std::vector<std::pair<int,int>> indices;
for (int i = 0; i < exeInfo.resources.size(); i++) {
auto const& rss = exeInfo.resources[i];
Transfer const& t = transfers[rss.transferIdx];
for (int j = 0; j < t.numSubExecs; j++)
indices.push_back(std::make_pair(i,j));
}
std::random_device rd;
std::default_random_engine gen(rd());
std::shuffle(indices.begin(), indices.end(), gen);
// Build randomized threadblock list
for (auto p : indices) {
auto& rss = exeInfo.resources[p.first];
rss.subExecIdx.push_back(exeInfo.subExecParamCpu.size());
exeInfo.subExecParamCpu.push_back(rss.subExecParamCpu[p.second]);
}
}
// Copy sub executor parameters to GPU // Copy sub executor parameters to GPU
ERR_CHECK(hipSetDevice(exeDevice.exeIndex)); ERR_CHECK(hipSetDevice(exeDevice.exeIndex));
...@@ -2595,17 +2645,8 @@ namespace { ...@@ -2595,17 +2645,8 @@ namespace {
int const exeIndex, int const exeIndex,
TransferResources& rss) TransferResources& rss)
{ {
auto cpuStart = std::chrono::high_resolution_clock::now();
// Switch to the closest NUMA node to this NIC
if (cfg.nic.useNuma) {
int numaNode = GetIbvDeviceList()[exeIndex].numaNode;
if (numaNode != -1)
numa_run_on_node(numaNode);
}
int subIteration = 0;
do {
// Loop over each of the queue pairs and post the send // Loop over each of the queue pairs and post the send
ibv_send_wr* badWorkReq; ibv_send_wr* badWorkReq;
for (int qpIndex = 0; qpIndex < rss.qpCount; qpIndex++) { for (int qpIndex = 0; qpIndex < rss.qpCount; qpIndex++) {
...@@ -2614,61 +2655,68 @@ namespace { ...@@ -2614,61 +2655,68 @@ namespace {
return {ERR_FATAL, "Transfer %d: Error when calling ibv_post_send for QP %d Error code %d\n", return {ERR_FATAL, "Transfer %d: Error when calling ibv_post_send for QP %d Error code %d\n",
rss.transferIdx, qpIndex, error}; rss.transferIdx, qpIndex, error};
} }
return ERR_NONE;
}
// Execution of a single NIC executor
static ErrResult RunNicExecutor(int const iteration,
ConfigOptions const& cfg,
int const exeIndex,
ExeInfo& exeInfo)
{
// Switch to the closest NUMA node to this NIC
if (cfg.nic.useNuma) {
int numaNode = GetIbvDeviceList()[exeIndex].numaNode;
if (numaNode != -1)
numa_run_on_node(numaNode);
}
int subIterations = 0;
do {
auto cpuStart = std::chrono::high_resolution_clock::now();
size_t completedTransfers = 0;
auto transferCount = exeInfo.resources.size();
std::vector<uint8_t> receivedQPs(transferCount);
std::vector<std::chrono::high_resolution_clock::time_point> transferTimers(transferCount);
// post the sends
for (auto i = 0; i < transferCount; i++) {
transferTimers[i] = std::chrono::high_resolution_clock::now();
ERR_CHECK(ExecuteNicTransfer(iteration, cfg, exeIndex, exeInfo.resources[i]));
}
// poll for completions
do {
for (auto i = 0; i < transferCount; i++) {
if(receivedQPs[i] < exeInfo.resources[i].qpCount) {
auto& rss = exeInfo.resources[i];
// Poll the completion queue until all queue pairs are complete // Poll the completion queue until all queue pairs are complete
// The order of completion doesn't matter because this completion queue is dedicated to this Transfer // The order of completion doesn't matter because this completion queue is dedicated to this Transfer
int numComplete = 0;
ibv_wc wc; ibv_wc wc;
while (numComplete < rss.qpCount) {
int nc = ibv_poll_cq(rss.srcCompQueue, 1, &wc); int nc = ibv_poll_cq(rss.srcCompQueue, 1, &wc);
if (nc > 0) { if (nc > 0) {
numComplete++; receivedQPs[i]++;
if (wc.status != IBV_WC_SUCCESS) { if (wc.status != IBV_WC_SUCCESS) {
return {ERR_FATAL, "Transfer %d: Received unsuccessful work completion", rss.transferIdx}; return {ERR_FATAL, "Transfer %d: Received unsuccessful work completion", rss.transferIdx};
} }
} else if (nc < 0) { } else if (nc < 0) {
return {ERR_FATAL, "Transfer %d: Received negative work completion", rss.transferIdx}; return {ERR_FATAL, "Transfer %d: Received negative work completion", rss.transferIdx};
} }
} if(receivedQPs[i] == rss.qpCount) {
} while (++subIteration != cfg.general.numSubIterations); auto cpuDelta = std::chrono::high_resolution_clock::now() - transferTimers[i];
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0; double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0) { if (iteration >= 0) {
rss.totalDurationMsec += deltaMsec; rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) if (cfg.general.recordPerIteration)
rss.perIterMsec.push_back(deltaMsec); rss.perIterMsec.push_back(deltaMsec);
} }
return ERR_NONE; completedTransfers++;
} }
// Execution of a single NIC executor
static ErrResult RunNicExecutor(int const iteration,
ConfigOptions const& cfg,
int const exeIndex,
ExeInfo& exeInfo)
{
vector<std::future<ErrResult>> asyncTransfers;
auto cpuStart = std::chrono::high_resolution_clock::now();
for (int i = 0; i < exeInfo.resources.size(); i++) {
asyncTransfers.emplace_back(std::async(std::launch::async,
ExecuteNicTransfer,
iteration,
std::cref(cfg),
exeIndex,
std::ref(exeInfo.resources[i])));
} }
for (auto& asyncTransfer : asyncTransfers) }
ERR_CHECK(asyncTransfer.get()); } while(completedTransfers < transferCount);
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0; double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0) if (iteration >= 0)
exeInfo.totalDurationMsec += deltaMsec; exeInfo.totalDurationMsec += deltaMsec;
} while(++subIterations < cfg.general.numSubIterations);
return ERR_NONE; return ERR_NONE;
} }
#endif #endif
...@@ -2704,13 +2752,16 @@ namespace { ...@@ -2704,13 +2752,16 @@ namespace {
// Helper function for memset // Helper function for memset
template <typename T> __device__ __forceinline__ T MemsetVal(); template <typename T> __device__ __forceinline__ T MemsetVal();
template <> __device__ __forceinline__ float MemsetVal(){ return MEMSET_VAL; }; template <> __device__ __forceinline__ float MemsetVal(){ return MEMSET_VAL; };
template <> __device__ __forceinline__ float2 MemsetVal(){ return make_float2(MEMSET_VAL,
MEMSET_VAL); };
template <> __device__ __forceinline__ float4 MemsetVal(){ return make_float4(MEMSET_VAL, template <> __device__ __forceinline__ float4 MemsetVal(){ return make_float4(MEMSET_VAL,
MEMSET_VAL, MEMSET_VAL,
MEMSET_VAL, MEMSET_VAL,
MEMSET_VAL); } MEMSET_VAL); }
// Kernel for GFX execution
template <int BLOCKSIZE, int UNROLL> // Kernel for GFX execution
template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL>
__global__ void __launch_bounds__(BLOCKSIZE) __global__ void __launch_bounds__(BLOCKSIZE)
GpuReduceKernel(SubExecParam* params, int waveOrder, int numSubIterations) GpuReduceKernel(SubExecParam* params, int waveOrder, int numSubIterations)
{ {
...@@ -2729,10 +2780,10 @@ namespace { ...@@ -2729,10 +2780,10 @@ namespace {
// Collect data information // Collect data information
int32_t const numSrcs = p.numSrcs; int32_t const numSrcs = p.numSrcs;
int32_t const numDsts = p.numDsts; int32_t const numDsts = p.numDsts;
float4 const* __restrict__ srcFloat4[MAX_SRCS]; PACKED_FLOAT const* __restrict__ srcFloatPacked[MAX_SRCS];
float4* __restrict__ dstFloat4[MAX_DSTS]; PACKED_FLOAT* __restrict__ dstFloatPacked[MAX_DSTS];
for (int i = 0; i < numSrcs; i++) srcFloat4[i] = (float4*)p.src[i]; for (int i = 0; i < numSrcs; i++) srcFloatPacked[i] = (PACKED_FLOAT const*)p.src[i];
for (int i = 0; i < numDsts; i++) dstFloat4[i] = (float4*)p.dst[i]; for (int i = 0; i < numDsts; i++) dstFloatPacked[i] = (PACKED_FLOAT*)p.dst[i];
// Operate on wavefront granularity // Operate on wavefront granularity
int32_t const nTeams = p.teamSize; // Number of threadblocks working together on this subarray int32_t const nTeams = p.teamSize; // Number of threadblocks working together on this subarray
...@@ -2741,7 +2792,7 @@ namespace { ...@@ -2741,7 +2792,7 @@ namespace {
int32_t const waveIdx = threadIdx.x / warpSize; // Index of this wavefront within the threadblock int32_t const waveIdx = threadIdx.x / warpSize; // Index of this wavefront within the threadblock
int32_t const tIdx = threadIdx.x % warpSize; // Thread index within wavefront int32_t const tIdx = threadIdx.x % warpSize; // Thread index within wavefront
size_t const numFloat4 = p.N / 4; size_t const numPackedFloat = p.N / (sizeof(PACKED_FLOAT)/sizeof(float));
int32_t teamStride, waveStride, unrlStride, teamStride2, waveStride2; int32_t teamStride, waveStride, unrlStride, teamStride2, waveStride2;
switch (waveOrder) { switch (waveOrder) {
...@@ -2755,64 +2806,64 @@ namespace { ...@@ -2755,64 +2806,64 @@ namespace {
int subIterations = 0; int subIterations = 0;
while (1) { while (1) {
// First loop: Each wavefront in the team works on UNROLL float4s per thread // First loop: Each wavefront in the team works on UNROLL PACKED_FLOAT per thread
size_t const loop1Stride = nTeams * nWaves * UNROLL * warpSize; size_t const loop1Stride = nTeams * nWaves * UNROLL * warpSize;
size_t const loop1Limit = numFloat4 / loop1Stride * loop1Stride; size_t const loop1Limit = numPackedFloat / loop1Stride * loop1Stride;
{ {
float4 val[UNROLL]; PACKED_FLOAT val[UNROLL];
if (numSrcs == 0) { if (numSrcs == 0) {
#pragma unroll #pragma unroll
for (int u = 0; u < UNROLL; u++) for (int u = 0; u < UNROLL; u++)
val[u] = MemsetVal<float4>(); val[u] = MemsetVal<PACKED_FLOAT>();
} }
for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx; idx < loop1Limit; idx += loop1Stride) { for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx; idx < loop1Limit; idx += loop1Stride) {
// Read sources into memory and accumulate in registers // Read sources into memory and accumulate in registers
if (numSrcs) { if (numSrcs) {
for (int u = 0; u < UNROLL; u++) for (int u = 0; u < UNROLL; u++)
val[u] = srcFloat4[0][idx + u * unrlStride * warpSize]; val[u] = srcFloatPacked[0][idx + u * unrlStride * warpSize];
for (int s = 1; s < numSrcs; s++) for (int s = 1; s < numSrcs; s++)
for (int u = 0; u < UNROLL; u++) for (int u = 0; u < UNROLL; u++)
val[u] += srcFloat4[s][idx + u * unrlStride * warpSize]; val[u] += srcFloatPacked[s][idx + u * unrlStride * warpSize];
} }
// Write accumulation to all outputs // Write accumulation to all outputs
for (int d = 0; d < numDsts; d++) { for (int d = 0; d < numDsts; d++) {
#pragma unroll #pragma unroll
for (int u = 0; u < UNROLL; u++) for (int u = 0; u < UNROLL; u++)
dstFloat4[d][idx + u * unrlStride * warpSize] = val[u]; dstFloatPacked[d][idx + u * unrlStride * warpSize] = val[u];
} }
} }
} }
// Second loop: Deal with remaining float4s // Second loop: Deal with remaining PACKED_FLOAT
{ {
if (loop1Limit < numFloat4) { if (loop1Limit < numPackedFloat) {
float4 val; PACKED_FLOAT val;
if (numSrcs == 0) val = MemsetVal<float4>(); if (numSrcs == 0) val = MemsetVal<PACKED_FLOAT>();
size_t const loop2Stride = nTeams * nWaves * warpSize; size_t const loop2Stride = nTeams * nWaves * warpSize;
for (size_t idx = loop1Limit + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; for (size_t idx = loop1Limit + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx;
idx < numFloat4; idx += loop2Stride) { idx < numPackedFloat; idx += loop2Stride) {
if (numSrcs) { if (numSrcs) {
val = srcFloat4[0][idx]; val = srcFloatPacked[0][idx];
for (int s = 1; s < numSrcs; s++) for (int s = 1; s < numSrcs; s++)
val += srcFloat4[s][idx]; val += srcFloatPacked[s][idx];
} }
for (int d = 0; d < numDsts; d++) for (int d = 0; d < numDsts; d++)
dstFloat4[d][idx] = val; dstFloatPacked[d][idx] = val;
} }
} }
} }
// Third loop; Deal with remaining floats // Third loop; Deal with remaining floats
{ {
if (numFloat4 * 4 < p.N) { if (numPackedFloat * (sizeof(PACKED_FLOAT)/sizeof(float)) < p.N) {
float val; float val;
if (numSrcs == 0) val = MemsetVal<float>(); if (numSrcs == 0) val = MemsetVal<float>();
size_t const loop3Stride = nTeams * nWaves * warpSize; size_t const loop3Stride = nTeams * nWaves * warpSize;
for ( size_t idx = numFloat4 * 4 + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride) { for (size_t idx = numPackedFloat * (sizeof(PACKED_FLOAT)/sizeof(float)) + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride) {
if (numSrcs) { if (numSrcs) {
val = p.src[0][idx]; val = p.src[0][idx];
for (int s = 1; s < numSrcs; s++) for (int s = 1; s < numSrcs; s++)
...@@ -2839,19 +2890,24 @@ namespace { ...@@ -2839,19 +2890,24 @@ namespace {
} }
} }
#define GPU_KERNEL_DWORD_DECL(BLOCKSIZE, UNROLL) \
{GpuReduceKernel<float, BLOCKSIZE, UNROLL>, \
GpuReduceKernel<float2, BLOCKSIZE, UNROLL>, \
GpuReduceKernel<float4, BLOCKSIZE, UNROLL>}
#define GPU_KERNEL_UNROLL_DECL(BLOCKSIZE) \ #define GPU_KERNEL_UNROLL_DECL(BLOCKSIZE) \
{GpuReduceKernel<BLOCKSIZE, 1>, \ {GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 1), \
GpuReduceKernel<BLOCKSIZE, 2>, \ GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 2), \
GpuReduceKernel<BLOCKSIZE, 3>, \ GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 3), \
GpuReduceKernel<BLOCKSIZE, 4>, \ GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 4), \
GpuReduceKernel<BLOCKSIZE, 5>, \ GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 5), \
GpuReduceKernel<BLOCKSIZE, 6>, \ GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 6), \
GpuReduceKernel<BLOCKSIZE, 7>, \ GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 7), \
GpuReduceKernel<BLOCKSIZE, 8>} GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 8)}
// Table of all GPU Reduction kernel functions (templated blocksize / unroll) // Table of all GPU Reduction kernel functions (templated blocksize / unroll / dword size)
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int); typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int);
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL] = GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL][3] =
{ {
GPU_KERNEL_UNROLL_DECL(64), GPU_KERNEL_UNROLL_DECL(64),
GPU_KERNEL_UNROLL_DECL(128), GPU_KERNEL_UNROLL_DECL(128),
...@@ -2879,18 +2935,19 @@ namespace { ...@@ -2879,18 +2935,19 @@ namespace {
dim3 const gridSize(xccDim, numSubExecs, 1); dim3 const gridSize(xccDim, numSubExecs, 1);
dim3 const blockSize(cfg.gfx.blockSize, 1); dim3 const blockSize(cfg.gfx.blockSize, 1);
int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 :
cfg.gfx.wordSize == 2 ? 1 :
2;
auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx];
#if defined(__NVCC__) #if defined(__NVCC__)
if (startEvent != NULL) if (startEvent != NULL)
ERR_CHECK(hipEventRecord(startEvent, stream)); ERR_CHECK(hipEventRecord(startEvent, stream));
gpuKernel<<<gridSize, blockSize, 0, stream>>>(rss.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations);
GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1]
<<<gridSize, blockSize, 0, stream>>>
(rss.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations);
if (stopEvent != NULL) if (stopEvent != NULL)
ERR_CHECK(hipEventRecord(stopEvent, stream)); ERR_CHECK(hipEventRecord(stopEvent, stream));
#else #else
hipExtLaunchKernelGGL(GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1], hipExtLaunchKernelGGL(gpuKernel, gridSize, blockSize, 0, stream, startEvent, stopEvent,
gridSize, blockSize, 0, stream, startEvent, stopEvent,
0, rss.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations); 0, rss.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations);
#endif #endif
...@@ -2954,19 +3011,19 @@ namespace { ...@@ -2954,19 +3011,19 @@ namespace {
dim3 const blockSize(cfg.gfx.blockSize, 1); dim3 const blockSize(cfg.gfx.blockSize, 1);
hipStream_t stream = exeInfo.streams[0]; hipStream_t stream = exeInfo.streams[0];
int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 :
cfg.gfx.wordSize == 2 ? 1 :
2;
auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx];
#if defined(__NVCC__) #if defined(__NVCC__)
if (cfg.gfx.useHipEvents) if (cfg.gfx.useHipEvents)
ERR_CHECK(hipEventRecord(exeInfo.startEvents[0], stream)); ERR_CHECK(hipEventRecord(exeInfo.startEvents[0], stream));
gpuKernel<<<gridSize, blockSize, 0 , stream>>>(exeInfo.subExecParamGpu, cfg.gfx.waveOrder, cfg.general.numSubIterations);
GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1]
<<<gridSize, blockSize, 0 , stream>>>
(exeInfo.subExecParamGpu, cfg.gfx.waveOrder, cfg.general.numSubIterations);
if (cfg.gfx.useHipEvents) if (cfg.gfx.useHipEvents)
ERR_CHECK(hipEventRecord(exeInfo.stopEvents[0], stream)); ERR_CHECK(hipEventRecord(exeInfo.stopEvents[0], stream));
#else #else
hipExtLaunchKernelGGL(GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1], hipExtLaunchKernelGGL(gpuKernel, gridSize, blockSize, 0, stream,
gridSize, blockSize, 0, stream,
cfg.gfx.useHipEvents ? exeInfo.startEvents[0] : NULL, cfg.gfx.useHipEvents ? exeInfo.startEvents[0] : NULL,
cfg.gfx.useHipEvents ? exeInfo.stopEvents[0] : NULL, 0, cfg.gfx.useHipEvents ? exeInfo.stopEvents[0] : NULL, 0,
exeInfo.subExecParamGpu, cfg.gfx.waveOrder, cfg.general.numSubIterations); exeInfo.subExecParamGpu, cfg.gfx.waveOrder, cfg.general.numSubIterations);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment