Unverified Commit 5984f49e authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

TransferBench V1.59 (#162)



Adding NIC execution capabilities, various bug fixes introduced by header-only-library refactor
---------
Co-authored-by: default avatarMustafa Abduljabbar <mustafa.abduljabbar@amd.com>
parent fcac6d92
...@@ -3,6 +3,30 @@ ...@@ -3,6 +3,30 @@
Documentation for TransferBench is available at Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench). [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.59.00
### Added
- Adding in support for NIC executor, which allows for RDMA copies on NICs that support IBVerbs
By default, NIC executor will be enabled if IBVerbs is found in the dynamic linker cache
- NIC executor can be indexed in two methods
- "I" Ix.y will use NIC x as the source and NIC y as the destination.
E.g. (G0 I0.5 G4)
- "N" Nx.y will use NIC closest to GPU x as source, and NIC closest to GPU y as destination
E.g. (G0 N0.4 N4)
- The closest NIC can be overridden by the environment variable CLOSEST_NIC, which should be a comma-separated
list of NIC indices to use for the corresponding GPU
- This feature can be explicitly disabled at compile time by specifying DISABLE_NIC_EXEC=1
### Modified
- Changing default data size to 256M from 64M
- Adding NUM_QUEUE_PAIRS which enables NIC traffic in A2A. Each GPU will talk to the next GPU via the closest NIC
- Sweep preset now saves last sweep run configuration to /tmp/lastSweep.cfg and can be changed via SWEEP_FILE
### Fixed
- Fixed bug with reporting when using subiterations
- Fixed bug with per-Transfer data size specification
- Fixed bug when using XCC prefered table
## v1.58.00 ## v1.58.00
### Fixed ### Fixed
- Fixed broken specific DMA-engine copies - Fixed broken specific DMA-engine copies
......
...@@ -7,7 +7,7 @@ else() ...@@ -7,7 +7,7 @@ else()
endif() endif()
cmake_minimum_required(VERSION 3.5) cmake_minimum_required(VERSION 3.5)
project(TransferBench VERSION 1.58.00 LANGUAGES CXX) project(TransferBench VERSION 1.59.00 LANGUAGES CXX)
# Default GPU architectures to build # Default GPU architectures to build
#================================================================================================== #==================================================================================================
...@@ -56,6 +56,18 @@ set( CMAKE_CXX_FLAGS "${flags_str} ${CMAKE_CXX_FLAGS}") ...@@ -56,6 +56,18 @@ set( CMAKE_CXX_FLAGS "${flags_str} ${CMAKE_CXX_FLAGS}")
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib") set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib")
include_directories(${ROCM_PATH}/include) include_directories(${ROCM_PATH}/include)
find_library(IBVERBS_LIBRARY ibverbs)
if (IBVERBS_LIBRARY)
if (DEFINED ENV{DISABLE_NIC_EXEC})
message(STATUS "Disabling NIC Executor support")
else()
message(STATUS "Found ibverbs: ${IBVERBS_LIBRARY}. Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable")
add_definitions(-DNIC_EXEC_ENABLED)
link_libraries(ibverbs)
endif()
else()
message(WARNING "IBVerbs library not found. Building without NIC executor support")
endif()
link_libraries(numa hsa-runtime64 pthread) link_libraries(numa hsa-runtime64 pthread)
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY .) set (CMAKE_RUNTIME_OUTPUT_DIRECTORY .)
add_executable(TransferBench src/client/Client.cpp) add_executable(TransferBench src/client/Client.cpp)
......
...@@ -11,9 +11,9 @@ NVCC=$(CUDA_PATH)/bin/nvcc ...@@ -11,9 +11,9 @@ NVCC=$(CUDA_PATH)/bin/nvcc
# Compile TransferBenchCuda if nvcc detected # Compile TransferBenchCuda if nvcc detected
ifeq ("$(shell test -e $(NVCC) && echo found)", "found") ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
EXE=TransferBenchCuda EXE=TransferBenchCuda
else else
EXE=TransferBench EXE=TransferBench
endif endif
CXXFLAGS = -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64 CXXFLAGS = -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
...@@ -21,13 +21,30 @@ NVFLAGS = -x cu -lnuma -arch=native ...@@ -21,13 +21,30 @@ NVFLAGS = -x cu -lnuma -arch=native
COMMON_FLAGS = -O3 -I./src/header -I./src/client -I./src/client/Presets COMMON_FLAGS = -O3 -I./src/header -I./src/client -I./src/client/Presets
LDFLAGS += -lpthread LDFLAGS += -lpthread
# Compile RDMA executor if IBVerbs is found in the Dynamic Linker cache
NIC_ENABLED = 0
ifneq ($(DISABLE_NIC_EXEC),1)
ifneq ("$(shell ldconfig -p | grep -c ibverbs)", "0")
LDFLAGS += -libverbs -DNIC_EXEC_ENABLED
NVFLAGS += -libverbs -DNIC_EXEC_ENABLED
NIC_ENABLED = 1
endif
endif
all: $(EXE) all: $(EXE)
TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp") TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp") NicStatus
$(HIPCC) $(CXXFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS) $(HIPCC) $(CXXFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)
TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp") TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp") NicStatus
$(NVCC) $(NVFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS) $(NVCC) $(NVFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)
clean: clean:
rm -f *.o ./TransferBench ./TransferBenchCuda rm -f *.o ./TransferBench ./TransferBenchCuda
NicStatus:
ifeq ($(NIC_ENABLED), 1)
$(info Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable)
else
$(info Building without NIC executor support)
endif
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# 1) CPU CPU thread # 1) CPU CPU thread
# 2) GPU GPU threadblock/Compute Unit (CU) # 2) GPU GPU threadblock/Compute Unit (CU)
# 3) DMA N/A. (May only be used for copies (single SRC/DST) # 3) DMA N/A. (May only be used for copies (single SRC/DST)
# 4) NIC Queue Pair
# Each single line in the configuration file defines a set of Transfers (a Test) to run in parallel # Each single line in the configuration file defines a set of Transfers (a Test) to run in parallel
...@@ -34,9 +35,11 @@ ...@@ -34,9 +35,11 @@
# #SEs : Number of SubExectors to use (CPU threads/ GPU threadblocks) # #SEs : Number of SubExectors to use (CPU threads/ GPU threadblocks)
# srcMemL : Source memory locations (Where the data is to be read from) # srcMemL : Source memory locations (Where the data is to be read from)
# Executor : Executor is specified by a character indicating type, followed by device index (0-indexed) # Executor : Executor is specified by a character indicating type, followed by device index (0-indexed)
# - C: CPU-executed (Indexed from 0 to # NUMA nodes - 1) # - C: CPU-executed (Indexed from 0 to # NUMA nodes - 1)
# - G: GPU-executed (Indexed from 0 to # GPUs - 1) # - G: GPU-executed (Indexed from 0 to # GPUs - 1)
# - D: DMA-executor (Indexed from 0 to # GPUs - 1) # - D: DMA-executor (Indexed from 0 to # GPUs - 1)
# - I#.#: NIC executor (Indexed from 0 to # NICs - 1)
# - N#.#: Nearest NIC executor (Indexed from 0 to # GPUs - 1)
# dstMemL : Destination memory locations (Where the data is to be written to) # dstMemL : Destination memory locations (Where the data is to be written to)
# bytesL : Number of bytes to copy (0 means use command-line specified size) # bytesL : Number of bytes to copy (0 means use command-line specified size)
# Must be a multiple of 4 and may be suffixed with ('K','M', or 'G') # Must be a multiple of 4 and may be suffixed with ('K','M', or 'G')
...@@ -56,7 +59,10 @@ ...@@ -56,7 +59,10 @@
# 1 4 (C1->G2->G0) Uses 4 CUs on GPU2 to copy from CPU1 to GPU0 # 1 4 (C1->G2->G0) Uses 4 CUs on GPU2 to copy from CPU1 to GPU0
# 2 4 G0->G0->G1 G1->G1->G0 Copes from GPU0 to GPU1, and GPU1 to GPU0, each with 4 SEs # 2 4 G0->G0->G1 G1->G1->G0 Copes from GPU0 to GPU1, and GPU1 to GPU0, each with 4 SEs
# -2 (G0 G0 G1 4 1M) (G1 G1 G0 2 2M) Copies 1Mb from GPU0 to GPU1 with 4 SEs, and 2Mb from GPU1 to GPU0 with 2 SEs # -2 (G0 G0 G1 4 1M) (G1 G1 G0 2 2M) Copies 1Mb from GPU0 to GPU1 with 4 SEs, and 2Mb from GPU1 to GPU0 with 2 SEs
# 1 2 (F0->I0.2->F1) Uses 2 QPs to transfer data from GPU0 via NIC0 to GPU1 via NIC2
# 1 1 (F0->N0.1->F1) Uses 1 QP to transfer data from GPU0 via GPU0's closest NIC to GPU1 via GPU1's closest NIC
# -2 (G0->N0.1->G1 2 128M) (G1->N1.0->G0 1 256M) Uses Nearest NIC executor to copy 128Mb from GPU0 to GPU1 with 2 QPs,
# and 256Mb from GPU1 to GPU0 with 1 QP
# Round brackets and arrows' ->' may be included for human clarity, but will be ignored and are unnecessary # Round brackets and arrows' ->' may be included for human clarity, but will be ignored and are unnecessary
# Lines starting with # will be ignored. Lines starting with ## will be echoed to output # Lines starting with # will be ignored. Lines starting with ## will be echoed to output
......
...@@ -121,13 +121,23 @@ int main(int argc, char **argv) { ...@@ -121,13 +121,23 @@ int main(int argc, char **argv) {
} }
} }
// Track which transfers have already numBytes specified
std::vector<bool> bytesSpecified(transfers.size());
int hasUnspecified = false;
for (int i = 0; i < transfers.size(); i++) {
bytesSpecified[i] = (transfers[i].numBytes != 0);
if (transfers[i].numBytes == 0) hasUnspecified = true;
}
// Run the specified numbers of bytes otherwise generate a range of values // Run the specified numbers of bytes otherwise generate a range of values
for (size_t bytes = (1<<10); bytes <= (1<<29); bytes *= 2) { for (size_t bytes = (1<<10); bytes <= (1<<29); bytes *= 2) {
size_t deltaBytes = std::max(1UL, bytes / ev.samplingFactor); size_t deltaBytes = std::max(1UL, bytes / ev.samplingFactor);
size_t currBytes = (numBytesPerTransfer == 0) ? bytes : numBytesPerTransfer; size_t currBytes = (numBytesPerTransfer == 0) ? bytes : numBytesPerTransfer;
do { do {
for (auto& t : transfers) for (int i = 0; i < transfers.size(); i++) {
t.numBytes = currBytes; if (!bytesSpecified[i])
transfers[i].numBytes = currBytes;
}
if (maxVarCount == 0) { if (maxVarCount == 0) {
if (TransferBench::RunTransfers(cfgOptions, transfers, results)) { if (TransferBench::RunTransfers(cfgOptions, transfers, results)) {
...@@ -162,17 +172,21 @@ int main(int argc, char **argv) { ...@@ -162,17 +172,21 @@ int main(int argc, char **argv) {
PrintResults(ev, ++testNum, bestTransfers, bestResults); PrintResults(ev, ++testNum, bestTransfers, bestResults);
PrintErrors(bestResults.errResults); PrintErrors(bestResults.errResults);
} }
if (numBytesPerTransfer != 0) break; if (numBytesPerTransfer != 0 || !hasUnspecified) break;
currBytes += deltaBytes; currBytes += deltaBytes;
} while (currBytes < bytes * 2); } while (currBytes < bytes * 2);
if (numBytesPerTransfer != 0) break; if (numBytesPerTransfer != 0 || !hasUnspecified) break;
} }
} }
} }
void DisplayUsage(char const* cmdName) void DisplayUsage(char const* cmdName)
{ {
printf("TransferBench v%s.%s\n", TransferBench::VERSION, CLIENT_VERSION); std::string nicSupport = "";
#if NIC_EXEC_ENABLED
nicSupport = " (with NIC support)";
#endif
printf("TransferBench v%s.%s%s\n", TransferBench::VERSION, CLIENT_VERSION, nicSupport.c_str());
printf("========================================\n"); printf("========================================\n");
if (numa_available() == -1) { if (numa_available() == -1) {
...@@ -218,7 +232,7 @@ void PrintResults(EnvVars const& ev, int const testNum, ...@@ -218,7 +232,7 @@ void PrintResults(EnvVars const& ev, int const testNum,
ExeType const exeType = exeDevice.exeType; ExeType const exeType = exeDevice.exeType;
int32_t const exeIndex = exeDevice.exeIndex; int32_t const exeIndex = exeDevice.exeIndex;
printf(" Executor: %3s %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %-7.3f GB/s (sum)\n", printf(" Executor: %3s %02d %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c %-7.3f GB/s (sum)\n",
ExeTypeName[exeType], exeIndex, sep, exeResult.avgBandwidthGbPerSec, sep, ExeTypeName[exeType], exeIndex, sep, exeResult.avgBandwidthGbPerSec, sep,
exeResult.avgDurationMsec, sep, exeResult.numBytes, sep, exeResult.sumBandwidthGbPerSec); exeResult.avgDurationMsec, sep, exeResult.numBytes, sep, exeResult.sumBandwidthGbPerSec);
...@@ -230,14 +244,15 @@ void PrintResults(EnvVars const& ev, int const testNum, ...@@ -230,14 +244,15 @@ void PrintResults(EnvVars const& ev, int const testNum,
char exeSubIndexStr[32] = ""; char exeSubIndexStr[32] = "";
if (t.exeSubIndex != -1) if (t.exeSubIndex != -1)
sprintf(exeSubIndexStr, ".%d", t.exeSubIndex); sprintf(exeSubIndexStr, ".%d", t.exeSubIndex);
printf(" Transfer %02d %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c %s -> %c%03d%s:%03d -> %s\n",
printf(" Transfer %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %s -> %s%02d%s:%03d -> %s\n",
idx, sep, idx, sep,
r.avgBandwidthGbPerSec, sep, r.avgBandwidthGbPerSec, sep,
r.avgDurationMsec, sep, r.avgDurationMsec, sep,
r.numBytes, sep, r.numBytes, sep,
MemDevicesToStr(t.srcs).c_str(), ExeTypeName[exeType], exeIndex, MemDevicesToStr(t.srcs).c_str(),
exeSubIndexStr, t.numSubExecs, MemDevicesToStr(t.dsts).c_str()); TransferBench::ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
exeSubIndexStr, t.numSubExecs,
MemDevicesToStr(t.dsts).c_str());
// Show per-iteration timing information // Show per-iteration timing information
if (ev.showIterations) { if (ev.showIterations) {
...@@ -269,7 +284,7 @@ void PrintResults(EnvVars const& ev, int const testNum, ...@@ -269,7 +284,7 @@ void PrintResults(EnvVars const& ev, int const testNum,
for (auto& time : times) { for (auto& time : times) {
double iterDurationMsec = time.first; double iterDurationMsec = time.first;
double iterBandwidthGbs = (t.numBytes / 1.0E9) / iterDurationMsec * 1000.0f; double iterBandwidthGbs = (t.numBytes / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d %c %7.3f GB/s %c %8.3f ms %c", time.second, sep, iterBandwidthGbs, sep, iterDurationMsec, sep); printf(" Iter %03d %c %8.3f GB/s %c %8.3f ms %c", time.second, sep, iterBandwidthGbs, sep, iterDurationMsec, sep);
std::set<int> usedXccs; std::set<int> usedXccs;
if (time.second - 1 < r.perIterCUs.size()) { if (time.second - 1 < r.perIterCUs.size()) {
...@@ -285,11 +300,11 @@ void PrintResults(EnvVars const& ev, int const testNum, ...@@ -285,11 +300,11 @@ void PrintResults(EnvVars const& ev, int const testNum,
printf(" %02d", x); printf(" %02d", x);
printf("\n"); printf("\n");
} }
printf(" StandardDev %c %7.3f GB/s %c %8.3f ms %c\n", sep, stdDevBw, sep, stdDevTime, sep); printf(" StandardDev %c %8.3f GB/s %c %8.3f ms %c\n", sep, stdDevBw, sep, stdDevTime, sep);
} }
} }
} }
printf(" Aggregate (CPU) %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c Overhead: %.3f ms\n", printf(" Aggregate (CPU) %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c Overhead: %.3f ms\n",
sep, results.avgTotalBandwidthGbPerSec, sep, results.avgTotalBandwidthGbPerSec,
sep, results.avgTotalDurationMsec, sep, results.avgTotalDurationMsec,
sep, results.totalBytesTransferred, sep, results.totalBytesTransferred,
......
...@@ -28,9 +28,9 @@ THE SOFTWARE. ...@@ -28,9 +28,9 @@ THE SOFTWARE.
#include "TransferBench.hpp" #include "TransferBench.hpp"
#include "EnvVars.hpp" #include "EnvVars.hpp"
size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<26); size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<28);
char const ExeTypeName[4][4] = {"CPU", "GPU", "DMA", "IBV"}; char const ExeTypeName[5][4] = {"CPU", "GPU", "DMA", "NIC", "NIC"};
// Display detected hardware // Display detected hardware
void DisplayTopology(bool outputToCsv); void DisplayTopology(bool outputToCsv);
......
...@@ -100,6 +100,14 @@ public: ...@@ -100,6 +100,14 @@ public:
int outputToCsv; // Output in CSV format int outputToCsv; // Output in CSV format
int samplingFactor; // Affects how many different values of N are generated (when N set to 0) int samplingFactor; // Affects how many different values of N are generated (when N set to 0)
// NIC options
int ibGidIndex; // GID Index for RoCE NICs
int roceVersion; // RoCE version number
int ipAddressFamily; // IP Address Famliy
uint8_t ibPort; // NIC port number to be used
int nicRelaxedOrder; // Use relaxed ordering for RDMA
std::string closestNicStr; // Holds the user-specified list of closest NICs
// Developer features // Developer features
int gpuMaxHwQueues; // Tracks GPU_MAX_HW_QUEUES environment variable int gpuMaxHwQueues; // Tracks GPU_MAX_HW_QUEUES environment variable
...@@ -147,8 +155,16 @@ public: ...@@ -147,8 +155,16 @@ public:
validateDirect = GetEnvVar("VALIDATE_DIRECT" , 0); validateDirect = GetEnvVar("VALIDATE_DIRECT" , 0);
validateSource = GetEnvVar("VALIDATE_SOURCE" , 0); validateSource = GetEnvVar("VALIDATE_SOURCE" , 0);
ibGidIndex = GetEnvVar("IB_GID_INDEX" ,-1);
ibPort = GetEnvVar("IB_PORT_NUMBER" , 1);
roceVersion = GetEnvVar("ROCE_VERSION" , 2);
ipAddressFamily = GetEnvVar("IP_ADDRESS_FAMILY" , 4);
nicRelaxedOrder = GetEnvVar("NIC_RELAX_ORDER" , 1);
closestNicStr = GetEnvVar("CLOSEST_NIC" , "");
gpuMaxHwQueues = GetEnvVar("GPU_MAX_HW_QUEUES" , 4); gpuMaxHwQueues = GetEnvVar("GPU_MAX_HW_QUEUES" , 4);
// Check for fill pattern // Check for fill pattern
char* pattern = getenv("FILL_PATTERN"); char* pattern = getenv("FILL_PATTERN");
if (pattern != NULL) { if (pattern != NULL) {
...@@ -279,18 +295,32 @@ public: ...@@ -279,18 +295,32 @@ public:
printf(" BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n"); printf(" BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n");
printf(" BLOCK_BYTES - Controls granularity of how work is divided across subExecutors\n"); printf(" BLOCK_BYTES - Controls granularity of how work is divided across subExecutors\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4\n"); printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4\n");
#if NIC_EXEC_ENABLED
printf(" CLOSEST_NIC - Comma-separated list of per-GPU closest NIC (default=auto)\n");
#endif
printf(" CU_MASK - CU mask for streams. Can specify ranges e.g '5,10-12,14'\n"); printf(" CU_MASK - CU mask for streams. Can specify ranges e.g '5,10-12,14'\n");
printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n"); printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n");
printf(" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d\n", TransferBench::GetIntAttribute(ATR_GFX_MAX_UNROLL)); printf(" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d\n", TransferBench::GetIntAttribute(ATR_GFX_MAX_UNROLL));
printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays\n"); printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays\n");
printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n"); printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n");
printf(" HIDE_ENV - Hide environment variable value listing\n"); printf(" HIDE_ENV - Hide environment variable value listing\n");
#if NIC_EXEC_ENABLED
printf(" IB_GID_INDEX - Required for RoCE NICs (default=-1/auto)\n");
printf(" IB_PORT_NUMBER - RDMA port count for RDMA NIC (default=1)\n");
printf(" IP_ADDRESS_FAMILY - IP address family (4=v4, 6=v6, default=v4)\n");
#endif
printf(" MIN_VAR_SUBEXEC - Minumum # of subexecutors to use for variable subExec Transfers\n"); printf(" MIN_VAR_SUBEXEC - Minumum # of subexecutors to use for variable subExec Transfers\n");
printf(" MAX_VAR_SUBEXEC - Maximum # of subexecutors to use for variable subExec Transfers (0 for device limits)\n"); printf(" MAX_VAR_SUBEXEC - Maximum # of subexecutors to use for variable subExec Transfers (0 for device limits)\n");
#if NIC_EXEC_ENABLED
printf(" NIC_RELAX_ORDER - Set to non-zero to use relaxed ordering");
#endif
printf(" NUM_ITERATIONS - # of timed iterations per test. If negative, run for this many seconds instead\n"); printf(" NUM_ITERATIONS - # of timed iterations per test. If negative, run for this many seconds instead\n");
printf(" NUM_SUBITERATIONS - # of sub-iterations to run per iteration. Must be non-negative\n"); printf(" NUM_SUBITERATIONS - # of sub-iterations to run per iteration. Must be non-negative\n");
printf(" NUM_WARMUPS - # of untimed warmup iterations per test\n"); printf(" NUM_WARMUPS - # of untimed warmup iterations per test\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n"); printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
#if NIC_EXEC_ENABLED
printf(" ROCE_VERSION - RoCE version (default=2)\n");
#endif
printf(" SAMPLING_FACTOR - Add this many samples (when possible) between powers of 2 when auto-generating data sizes\n"); printf(" SAMPLING_FACTOR - Add this many samples (when possible) between powers of 2 when auto-generating data sizes\n");
printf(" SHOW_ITERATIONS - Show per-iteration timing info\n"); printf(" SHOW_ITERATIONS - Show per-iteration timing info\n");
printf(" USE_HIP_EVENTS - Use HIP events for GFX executor timing\n"); printf(" USE_HIP_EVENTS - Use HIP events for GFX executor timing\n");
...@@ -301,6 +331,7 @@ public: ...@@ -301,6 +331,7 @@ public:
printf(" VALIDATE_SOURCE - Validate GPU src memory immediately after preparation\n"); printf(" VALIDATE_SOURCE - Validate GPU src memory immediately after preparation\n");
} }
void Print(std::string const& name, int32_t const value, const char* format, ...) const void Print(std::string const& name, int32_t const value, const char* format, ...) const
{ {
printf("%-20s%s%12d%s", name.c_str(), outputToCsv ? "," : " = ", value, outputToCsv ? "," : " : "); printf("%-20s%s%12d%s", name.c_str(), outputToCsv ? "," : " = ", value, outputToCsv ? "," : " : ");
...@@ -325,9 +356,12 @@ public: ...@@ -325,9 +356,12 @@ public:
void DisplayEnvVars() const void DisplayEnvVars() const
{ {
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX); int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
std::string nicSupport = "";
#if NIC_EXEC_ENABLED
nicSupport = " (with NIC support)";
#endif
if (!outputToCsv) { if (!outputToCsv) {
printf("TransferBench v%s.%s\n", TransferBench::VERSION, CLIENT_VERSION); printf("TransferBench v%s.%s%s\n", TransferBench::VERSION, CLIENT_VERSION, nicSupport.c_str());
printf("===============================================================\n"); printf("===============================================================\n");
if (!hideEnv) printf("[Common] (Suppress by setting HIDE_ENV=1)\n"); if (!hideEnv) printf("[Common] (Suppress by setting HIDE_ENV=1)\n");
} }
...@@ -341,6 +375,10 @@ public: ...@@ -341,6 +375,10 @@ public:
"Each CU gets a mulitple of %d bytes to copy", blockBytes); "Each CU gets a mulitple of %d bytes to copy", blockBytes);
Print("BYTE_OFFSET", byteOffset, Print("BYTE_OFFSET", byteOffset,
"Using byte offset of %d", byteOffset); "Using byte offset of %d", byteOffset);
#if NIC_EXEC_ENABLED
Print("CLOSEST_NIC", (closestNicStr == "" ? "auto" : "user-input"),
"Per-GPU closest NIC is set as %s", (closestNicStr == "" ? "auto" : closestNicStr.c_str()));
#endif
Print("CU_MASK", getenv("CU_MASK") ? 1 : 0, Print("CU_MASK", getenv("CU_MASK") ? 1 : 0,
"%s", (cuMask.size() ? GetCuMaskDesc().c_str() : "All")); "%s", (cuMask.size() ? GetCuMaskDesc().c_str() : "All"));
Print("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0, Print("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
...@@ -359,11 +397,24 @@ public: ...@@ -359,11 +397,24 @@ public:
gfxWaveOrder == 3 ? "Wavefront,CU,Unroll" : gfxWaveOrder == 3 ? "Wavefront,CU,Unroll" :
gfxWaveOrder == 4 ? "CU,Unroll,Wavefront" : gfxWaveOrder == 4 ? "CU,Unroll,Wavefront" :
"CU,Wavefront,Unroll")); "CU,Wavefront,Unroll"));
#if NIC_EXEC_ENABLED
Print("IP_ADDRESS_FAMILY", ipAddressFamily,
"IP address family is set to IPv%d", ipAddressFamily);
Print("IB_GID_INDEX", ibGidIndex,
"RoCE GID index is set to %s", (ibGidIndex < 0 ? "auto" : std::to_string(ibGidIndex).c_str()));
Print("IB_PORT_NUMBER", ibPort,
"IB port number is set to %d", ibPort);
#endif
Print("MIN_VAR_SUBEXEC", minNumVarSubExec, Print("MIN_VAR_SUBEXEC", minNumVarSubExec,
"Using at least %d subexecutor(s) for variable subExec tranfers", minNumVarSubExec); "Using at least %d subexecutor(s) for variable subExec tranfers", minNumVarSubExec);
Print("MAX_VAR_SUBEXEC", maxNumVarSubExec, Print("MAX_VAR_SUBEXEC", maxNumVarSubExec,
"Using up to %s subexecutors for variable subExec transfers", "Using up to %s subexecutors for variable subExec transfers",
maxNumVarSubExec ? std::to_string(maxNumVarSubExec).c_str() : "all available"); maxNumVarSubExec ? std::to_string(maxNumVarSubExec).c_str() : "all available");
#if NIC_EXEC_ENABLED
Print("NIC_RELAX_ORDER", nicRelaxedOrder,
"Using %s ordering for NIC RDMA", nicRelaxedOrder ? "relaxed" : "strict");
#endif
Print("NUM_ITERATIONS", numIterations, Print("NUM_ITERATIONS", numIterations,
(numIterations == 0) ? "Running infinitely" : (numIterations == 0) ? "Running infinitely" :
"Running %d %s", abs(numIterations), (numIterations > 0 ? " timed iteration(s)" : "seconds(s) per Test")); "Running %d %s", abs(numIterations), (numIterations > 0 ? " timed iteration(s)" : "seconds(s) per Test"));
...@@ -371,6 +422,10 @@ public: ...@@ -371,6 +422,10 @@ public:
"Running %s subiterations", (numSubIterations == 0 ? "infinite" : std::to_string(numSubIterations)).c_str()); "Running %s subiterations", (numSubIterations == 0 ? "infinite" : std::to_string(numSubIterations)).c_str());
Print("NUM_WARMUPS", numWarmups, Print("NUM_WARMUPS", numWarmups,
"Running %d warmup iteration(s) per Test", numWarmups); "Running %d warmup iteration(s) per Test", numWarmups);
#if NIC_EXEC_ENABLED
Print("ROCE_VERSION", roceVersion,
"RoCE version is set to %d", roceVersion);
#endif
Print("SHOW_ITERATIONS", showIterations, Print("SHOW_ITERATIONS", showIterations,
"%s per-iteration timing", showIterations ? "Showing" : "Hiding"); "%s per-iteration timing", showIterations ? "Showing" : "Hiding");
Print("USE_HIP_EVENTS", useHipEvents, Print("USE_HIP_EVENTS", useHipEvents,
...@@ -381,7 +436,6 @@ public: ...@@ -381,7 +436,6 @@ public:
"Running in %s mode", useInteractive ? "interactive" : "non-interactive"); "Running in %s mode", useInteractive ? "interactive" : "non-interactive");
Print("USE_SINGLE_STREAM", useSingleStream, Print("USE_SINGLE_STREAM", useSingleStream,
"Using single stream per GFX %s", useSingleStream ? "device" : "Transfer"); "Using single stream per GFX %s", useSingleStream ? "device" : "Transfer");
if (getenv("XCC_PREF_TABLE")) { if (getenv("XCC_PREF_TABLE")) {
printf("%36s: Preferred XCC Table (XCC_PREF_TABLE)\n", ""); printf("%36s: Preferred XCC Table (XCC_PREF_TABLE)\n", "");
printf("%36s: ", ""); printf("%36s: ", "");
...@@ -479,6 +533,27 @@ public: ...@@ -479,6 +533,27 @@ public:
cfg.gfx.useSingleTeam = gfxSingleTeam; cfg.gfx.useSingleTeam = gfxSingleTeam;
cfg.gfx.waveOrder = gfxWaveOrder; cfg.gfx.waveOrder = gfxWaveOrder;
cfg.nic.ibGidIndex = ibGidIndex;
cfg.nic.ibPort = ibPort;
cfg.nic.ipAddressFamily = ipAddressFamily;
cfg.nic.useRelaxedOrder = nicRelaxedOrder;
cfg.nic.roceVersion = roceVersion;
std::vector<int> closestNics;
if(closestNicStr != "") {
std::stringstream ss(closestNicStr);
std::string item;
while (std::getline(ss, item, ',')) {
try {
int nic = std::stoi(item);
closestNics.push_back(nic);
} catch (const std::invalid_argument& e) {
printf("[ERROR] Invalid NIC index (%s) by user in %s\n", item.c_str(), closestNicStr.c_str());
exit(1);
}
}
cfg.nic.closestNics = closestNics;
}
return cfg; return cfg;
} }
}; };
......
...@@ -47,6 +47,7 @@ void AllToAllPreset(EnvVars& ev, ...@@ -47,6 +47,7 @@ void AllToAllPreset(EnvVars& ev,
int a2aLocal = EnvVars::GetEnvVar("A2A_LOCAL" , 0); int a2aLocal = EnvVars::GetEnvVar("A2A_LOCAL" , 0);
int a2aMode = EnvVars::GetEnvVar("A2A_MODE" , 0); int a2aMode = EnvVars::GetEnvVar("A2A_MODE" , 0);
int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus); int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 0);
int numSubExecs = EnvVars::GetEnvVar("NUM_SUB_EXEC" , 8); int numSubExecs = EnvVars::GetEnvVar("NUM_SUB_EXEC" , 8);
int useDmaExec = EnvVars::GetEnvVar("USE_DMA_EXEC" , 0); int useDmaExec = EnvVars::GetEnvVar("USE_DMA_EXEC" , 0);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1); int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);
...@@ -60,6 +61,7 @@ void AllToAllPreset(EnvVars& ev, ...@@ -60,6 +61,7 @@ void AllToAllPreset(EnvVars& ev,
ev.Print("A2A_LOCAL" , a2aLocal , "%s local transfers", a2aLocal ? "Include" : "Exclude"); ev.Print("A2A_LOCAL" , a2aLocal , "%s local transfers", a2aLocal ? "Include" : "Exclude");
ev.Print("A2A_MODE" , a2aMode , a2aModeStr[a2aMode]); ev.Print("A2A_MODE" , a2aMode , a2aModeStr[a2aMode]);
ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus); ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus);
ev.Print("NUM_QUEUE_PAIRS", numQueuePairs, "Using %d queue pairs for NIC transfers", numQueuePairs);
ev.Print("NUM_SUB_EXEC" , numSubExecs , "Using %d subexecutors/CUs per Transfer", numSubExecs); ev.Print("NUM_SUB_EXEC" , numSubExecs , "Using %d subexecutors/CUs per Transfer", numSubExecs);
ev.Print("USE_DMA_EXEC" , useDmaExec , "Using %s executor", useDmaExec ? "DMA" : "GFX"); ev.Print("USE_DMA_EXEC" , useDmaExec , "Using %s executor", useDmaExec ? "DMA" : "GFX");
ev.Print("USE_FINE_GRAIN" , useFineGrain , "Using %s-grained memory", useFineGrain ? "fine" : "coarse"); ev.Print("USE_FINE_GRAIN" , useFineGrain , "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
...@@ -114,6 +116,23 @@ void AllToAllPreset(EnvVars& ev, ...@@ -114,6 +116,23 @@ void AllToAllPreset(EnvVars& ev,
} }
} }
// Create a ring using NICs
std::vector<int> nicTransferIdx(numGpus);
if (numQueuePairs > 0) {
int numNics = TransferBench::GetNumExecutors(EXE_NIC);
for (int i = 0; i < numGpus; i++) {
TransferBench::Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
transfer.srcs.push_back({memType, i});
transfer.dsts.push_back({memType, (i+1) % numGpus});
transfer.exeDevice = {TransferBench::EXE_NIC_NEAREST, i};
transfer.exeSubIndex = (i+1) % numGpus;
transfer.numSubExecs = numQueuePairs;
nicTransferIdx[i] = transfers.size();
transfers.push_back(transfer);
}
}
printf("GPU-GFX All-To-All benchmark:\n"); printf("GPU-GFX All-To-All benchmark:\n");
printf("==========================\n"); printf("==========================\n");
printf("- Copying %lu bytes between %s pairs of GPUs using %d CUs (%lu Transfers)\n", printf("- Copying %lu bytes between %s pairs of GPUs using %d CUs (%lu Transfers)\n",
...@@ -138,15 +157,18 @@ void AllToAllPreset(EnvVars& ev, ...@@ -138,15 +157,18 @@ void AllToAllPreset(EnvVars& ev,
printf("SRC\\DST "); printf("SRC\\DST ");
for (int dst = 0; dst < numGpus; dst++) for (int dst = 0; dst < numGpus; dst++)
printf("%cGPU %02d ", separator, dst); printf("%cGPU %02d ", separator, dst);
if (numQueuePairs > 0)
printf("%cNIC(%02d QP)", separator, numQueuePairs);
printf(" %cSTotal %cActual\n", separator, separator); printf(" %cSTotal %cActual\n", separator, separator);
double totalBandwidthGpu = 0.0; double totalBandwidthGpu = 0.0;
double minExecutorBandwidth = std::numeric_limits<double>::max(); double minActualBandwidth = std::numeric_limits<double>::max();
double maxExecutorBandwidth = 0.0; double maxActualBandwidth = 0.0;
std::vector<double> colTotalBandwidth(numGpus+1, 0.0); std::vector<double> colTotalBandwidth(numGpus+2, 0.0);
for (int src = 0; src < numGpus; src++) { for (int src = 0; src < numGpus; src++) {
double rowTotalBandwidth = 0; double rowTotalBandwidth = 0;
double executorBandwidth = 0; int transferCount = 0;
double minBandwidth = std::numeric_limits<double>::max();
printf("GPU %02d", src); printf("GPU %02d", src);
for (int dst = 0; dst < numGpus; dst++) { for (int dst = 0; dst < numGpus; dst++) {
if (reIndex.count(std::make_pair(src, dst))) { if (reIndex.count(std::make_pair(src, dst))) {
...@@ -155,24 +177,38 @@ void AllToAllPreset(EnvVars& ev, ...@@ -155,24 +177,38 @@ void AllToAllPreset(EnvVars& ev,
colTotalBandwidth[dst] += r.avgBandwidthGbPerSec; colTotalBandwidth[dst] += r.avgBandwidthGbPerSec;
rowTotalBandwidth += r.avgBandwidthGbPerSec; rowTotalBandwidth += r.avgBandwidthGbPerSec;
totalBandwidthGpu += r.avgBandwidthGbPerSec; totalBandwidthGpu += r.avgBandwidthGbPerSec;
executorBandwidth = std::max(executorBandwidth, minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
results.exeResults[transfers[transferIdx].exeDevice].avgBandwidthGbPerSec); transferCount++;
printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec); printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
} else { } else {
printf("%c%8s ", separator, "N/A"); printf("%c%8s ", separator, "N/A");
} }
} }
printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator, executorBandwidth);
minExecutorBandwidth = std::min(minExecutorBandwidth, executorBandwidth); if (numQueuePairs > 0) {
maxExecutorBandwidth = std::max(maxExecutorBandwidth, executorBandwidth); TransferBench::TransferResult const& r = results.tfrResults[nicTransferIdx[src]];
colTotalBandwidth[numGpus] += rowTotalBandwidth; colTotalBandwidth[numGpus] += r.avgBandwidthGbPerSec;
rowTotalBandwidth += r.avgBandwidthGbPerSec;
totalBandwidthGpu += r.avgBandwidthGbPerSec;
minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
transferCount++;
printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
}
double actualBandwidth = minBandwidth * transferCount;
printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator, actualBandwidth);
minActualBandwidth = std::min(minActualBandwidth, actualBandwidth);
maxActualBandwidth = std::max(maxActualBandwidth, actualBandwidth);
colTotalBandwidth[numGpus+1] += rowTotalBandwidth;
} }
printf("\nRTotal"); printf("\nRTotal");
for (int dst = 0; dst < numGpus; dst++) { for (int dst = 0; dst < numGpus; dst++) {
printf("%c%8.3f ", separator, colTotalBandwidth[dst]); printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
} }
printf(" %c%8.3f %c%8.3f %c%8.3f\n", separator, colTotalBandwidth[numGpus], if (numQueuePairs > 0) {
separator, minExecutorBandwidth, separator, maxExecutorBandwidth); printf("%c%8.3f ", separator, colTotalBandwidth[numGpus]);
}
printf(" %c%8.3f %c%8.3f %c%8.3f\n", separator, colTotalBandwidth[numGpus+1],
separator, minActualBandwidth, separator, maxActualBandwidth);
printf("\n"); printf("\n");
printf("Average bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu / transfers.size()); printf("Average bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu / transfers.size());
......
...@@ -22,19 +22,21 @@ THE SOFTWARE. ...@@ -22,19 +22,21 @@ THE SOFTWARE.
void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers) void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers)
{ {
fprintf(fp, "# Test %d\n", testNum); if (fp) {
fprintf(fp, "%d", -1 * (int)transfers.size()); fprintf(fp, "# Test %d\n", testNum);
for (auto const& transfer : transfers) fprintf(fp, "%d", -1 * (int)transfers.size());
{ for (auto const& transfer : transfers)
fprintf(fp, " (%s->%c%d->%s %d %lu)", {
MemDevicesToStr(transfer.srcs).c_str(), fprintf(fp, " (%s->%c%d->%s %d %lu)",
ExeTypeStr[transfer.exeDevice.exeType], transfer.exeDevice.exeIndex, MemDevicesToStr(transfer.srcs).c_str(),
MemDevicesToStr(transfer.dsts).c_str(), ExeTypeStr[transfer.exeDevice.exeType], transfer.exeDevice.exeIndex,
transfer.numSubExecs, MemDevicesToStr(transfer.dsts).c_str(),
transfer.numBytes); transfer.numSubExecs,
transfer.numBytes);
}
fprintf(fp, "\n");
fflush(fp);
} }
fprintf(fp, "\n");
fflush(fp);
} }
void SweepPreset(EnvVars& ev, void SweepPreset(EnvVars& ev,
...@@ -54,6 +56,7 @@ void SweepPreset(EnvVars& ev, ...@@ -54,6 +56,7 @@ void SweepPreset(EnvVars& ev,
int numGpuSubExecs = EnvVars::GetEnvVar("NUM_GPU_SE" , 4); int numGpuSubExecs = EnvVars::GetEnvVar("NUM_GPU_SE" , 4);
std::string sweepDst = EnvVars::GetEnvVar("SWEEP_DST" , "CG"); std::string sweepDst = EnvVars::GetEnvVar("SWEEP_DST" , "CG");
std::string sweepExe = EnvVars::GetEnvVar("SWEEP_EXE" , "CDG"); std::string sweepExe = EnvVars::GetEnvVar("SWEEP_EXE" , "CDG");
std::string sweepFile = EnvVars::GetEnvVar("SWEEP_FILE" , "/tmp/lastSweep.cfg");
int sweepMax = EnvVars::GetEnvVar("SWEEP_MAX" , 24); int sweepMax = EnvVars::GetEnvVar("SWEEP_MAX" , 24);
int sweepMin = EnvVars::GetEnvVar("SWEEP_MIN" , 1); int sweepMin = EnvVars::GetEnvVar("SWEEP_MIN" , 1);
int sweepRandBytes = EnvVars::GetEnvVar("SWEEP_RAND_BYTES" , 0); int sweepRandBytes = EnvVars::GetEnvVar("SWEEP_RAND_BYTES" , 0);
...@@ -78,6 +81,7 @@ void SweepPreset(EnvVars& ev, ...@@ -78,6 +81,7 @@ void SweepPreset(EnvVars& ev,
ev.Print("NUM_GPU_SE", numGpuSubExecs, "Using %d subExecutors/CUs per GPU executed Transfer", numGpuSubExecs); ev.Print("NUM_GPU_SE", numGpuSubExecs, "Using %d subExecutors/CUs per GPU executed Transfer", numGpuSubExecs);
ev.Print("SWEEP_DST", sweepDst.c_str(), "Destination Memory Types to sweep"); ev.Print("SWEEP_DST", sweepDst.c_str(), "Destination Memory Types to sweep");
ev.Print("SWEEP_EXE", sweepExe.c_str(), "Executor Types to sweep"); ev.Print("SWEEP_EXE", sweepExe.c_str(), "Executor Types to sweep");
ev.Print("SWEEP_FILE", sweepFile.c_str(),"File to store the executing sweep configuration");
ev.Print("SWEEP_MAX", sweepMax, "Max simultaneous transfers (0 = no limit)"); ev.Print("SWEEP_MAX", sweepMax, "Max simultaneous transfers (0 = no limit)");
ev.Print("SWEEP_MIN", sweepMin, "Min simultaenous transfers"); ev.Print("SWEEP_MIN", sweepMin, "Min simultaenous transfers");
ev.Print("SWEEP_RAND_BYTES", sweepRandBytes, "Using %s number of bytes per Transfer", (sweepRandBytes ? "random" : "constant")); ev.Print("SWEEP_RAND_BYTES", sweepRandBytes, "Using %s number of bytes per Transfer", (sweepRandBytes ? "random" : "constant"));
...@@ -283,10 +287,14 @@ void SweepPreset(EnvVars& ev, ...@@ -283,10 +287,14 @@ void SweepPreset(EnvVars& ev,
std::uniform_int_distribution<int> distribution(sweepMin, maxParallelTransfers); std::uniform_int_distribution<int> distribution(sweepMin, maxParallelTransfers);
// Log sweep to configuration file // Log sweep to configuration file
FILE *fp = fopen("lastSweep.cfg", "w"); char absPath[1024];
auto const res = realpath(sweepFile.c_str(), absPath);
FILE *fp = fopen(sweepFile.c_str(), "w");
if (!fp) { if (!fp) {
printf("[ERROR] Unable to open lastSweep.cfg. Check permissions\n"); printf("[WARN] Unable to open %s. Skipping output of sweep configuration file\n", res ? absPath : sweepFile.c_str());
exit(1); } else {
printf("Sweep configuration saved to: %s\n", res ? absPath : sweepFile.c_str());
} }
// Create bitmask of numPossible triplets, of which M will be chosen // Create bitmask of numPossible triplets, of which M will be chosen
...@@ -333,7 +341,7 @@ void SweepPreset(EnvVars& ev, ...@@ -333,7 +341,7 @@ void SweepPreset(EnvVars& ev,
// Check for test limit // Check for test limit
if (numTestsRun == sweepTestLimit) { if (numTestsRun == sweepTestLimit) {
printf("Test limit reached\n"); printf("Sweep Test limit reached\n");
break; break;
} }
...@@ -341,7 +349,7 @@ void SweepPreset(EnvVars& ev, ...@@ -341,7 +349,7 @@ void SweepPreset(EnvVars& ev,
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double totalCpuTime = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count(); double totalCpuTime = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
if (sweepTimeLimit && totalCpuTime > sweepTimeLimit) { if (sweepTimeLimit && totalCpuTime > sweepTimeLimit) {
printf("Time limit exceeded\n"); printf("Sweep Time limit exceeded\n");
break; break;
} }
...@@ -357,5 +365,5 @@ void SweepPreset(EnvVars& ev, ...@@ -357,5 +365,5 @@ void SweepPreset(EnvVars& ev,
bitmask[i] = (i < M) ? 1 : 0; bitmask[i] = (i < M) ? 1 : 0;
} }
} }
fclose(fp); if (fp) fclose(fp);
} }
...@@ -38,21 +38,53 @@ static int RemappedCpuIndex(int origIdx) ...@@ -38,21 +38,53 @@ static int RemappedCpuIndex(int origIdx)
return remappingCpu[origIdx]; return remappingCpu[origIdx];
} }
static void PrintNicToGPUTopo(bool outputToCsv)
{
#ifdef NIC_EXEC_ENABLED
printf(" NIC | Device Name | Active | PCIe Bus ID | NUMA | Closest GPU(s)\n");
if(!outputToCsv)
printf("-----+-------------+--------+--------------+------+---------------\n");
int numGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
auto const& ibvDeviceList = GetIbvDeviceList();
for (int i = 0; i < ibvDeviceList.size(); i++) {
std::string closestGpusStr = "";
for (int j = 0; j < numGpus; j++) {
if (TransferBench::GetClosestNicToGpu(j) == i) {
if (closestGpusStr != "") closestGpusStr += ",";
closestGpusStr += std::to_string(j);
}
}
printf(" %-3d | %-11s | %-6s | %-12s | %-4d | %-20s\n",
i, ibvDeviceList[i].name.c_str(),
ibvDeviceList[i].hasActivePort ? "Yes" : "No",
ibvDeviceList[i].busId.c_str(),
ibvDeviceList[i].numaNode,
closestGpusStr.c_str());
}
printf("\n");
#endif
}
void DisplayTopology(bool outputToCsv) void DisplayTopology(bool outputToCsv)
{ {
int numCpus = TransferBench::GetNumExecutors(EXE_CPU); int numCpus = TransferBench::GetNumExecutors(EXE_CPU);
int numGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); int numGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
int numNics = TransferBench::GetNumExecutors(EXE_NIC);
char sep = (outputToCsv ? ',' : '|'); char sep = (outputToCsv ? ',' : '|');
if (outputToCsv) { if (outputToCsv) {
printf("NumCpus,%d\n", numCpus); printf("NumCpus,%d\n", numCpus);
printf("NumGpus,%d\n", numGpus); printf("NumGpus,%d\n", numGpus);
printf("NumNics,%d\n", numNics);
} else { } else {
printf("\nDetected Topology:\n"); printf("\nDetected Topology:\n");
printf("==================\n"); printf("==================\n");
printf(" %d configured CPU NUMA node(s) [%d total]\n", numCpus, numa_max_node() + 1); printf(" %d configured CPU NUMA node(s) [%d total]\n", numCpus, numa_max_node() + 1);
printf(" %d GPU device(s)\n", numGpus); printf(" %d GPU device(s)\n", numGpus);
printf(" %d Supported NIC device(s)\n", numNics);
} }
// Print out detected CPU topology // Print out detected CPU topology
...@@ -91,8 +123,10 @@ void DisplayTopology(bool outputToCsv) ...@@ -91,8 +123,10 @@ void DisplayTopology(bool outputToCsv)
} }
printf("\n"); printf("\n");
// Print out detected GPU topology // Print out detected NIC topology
PrintNicToGPUTopo(outputToCsv);
// Print out detected GPU topology
#if defined(__NVCC__) #if defined(__NVCC__)
for (int i = 0; i < numGpus; i++) { for (int i = 0; i < numGpus; i++) {
hipDeviceProp_t prop; hipDeviceProp_t prop;
...@@ -118,12 +152,12 @@ void DisplayTopology(bool outputToCsv) ...@@ -118,12 +152,12 @@ void DisplayTopology(bool outputToCsv)
printf(" %c", sep); printf(" %c", sep);
for (int j = 0; j < numGpus; j++) for (int j = 0; j < numGpus; j++)
printf(" GPU %02d %c", j, sep); printf(" GPU %02d %c", j, sep);
printf(" PCIe Bus ID %c #CUs %c NUMA %c #DMA %c #XCC\n", sep, sep, sep, sep); printf(" PCIe Bus ID %c #CUs %c NUMA %c #DMA %c #XCC %c NIC\n", sep, sep, sep, sep, sep);
if (!outputToCsv) { if (!outputToCsv) {
for (int j = 0; j <= numGpus; j++) for (int j = 0; j <= numGpus; j++)
printf("--------+"); printf("--------+");
printf("--------------+------+------+------+------\n"); printf("--------------+------+------+------+------+------\n");
} }
// Loop over each GPU device // Loop over each GPU device
...@@ -149,12 +183,13 @@ void DisplayTopology(bool outputToCsv) ...@@ -149,12 +183,13 @@ void DisplayTopology(bool outputToCsv)
char pciBusId[20]; char pciBusId[20];
HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i)); HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i));
printf(" %11s %c %4d %c %4d %c %4d %c %4d\n", printf(" %-11s %c %-4d %c %-4d %c %-4d %c %-4d %c %-4d\n",
pciBusId, sep, pciBusId, sep,
TransferBench::GetNumSubExecutors({EXE_GPU_GFX, i}), sep, TransferBench::GetNumSubExecutors({EXE_GPU_GFX, i}), sep,
TransferBench::GetClosestCpuNumaToGpu(i), sep, TransferBench::GetClosestCpuNumaToGpu(i), sep,
TransferBench::GetNumExecutorSubIndices({EXE_GPU_DMA, i}), sep, TransferBench::GetNumExecutorSubIndices({EXE_GPU_DMA, i}), sep,
TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, i})); TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, i}), sep,
TransferBench::GetClosestNicToGpu(i));
} }
#endif #endif
} }
...@@ -34,6 +34,19 @@ THE SOFTWARE. ...@@ -34,6 +34,19 @@ THE SOFTWARE.
#include <unistd.h> #include <unistd.h>
#include <vector> #include <vector>
#ifdef NIC_EXEC_ENABLED
#include <infiniband/verbs.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
#include <arpa/inet.h>
#include <fcntl.h>
#include <unistd.h>
#include <filesystem>
#include <fstream>
#endif
#if defined(__NVCC__) #if defined(__NVCC__)
#include <cuda_runtime.h> #include <cuda_runtime.h>
#else #else
...@@ -51,7 +64,7 @@ namespace TransferBench ...@@ -51,7 +64,7 @@ namespace TransferBench
using std::set; using std::set;
using std::vector; using std::vector;
constexpr char VERSION[] = "1.58"; constexpr char VERSION[] = "1.59";
/** /**
* Enumeration of supported Executor types * Enumeration of supported Executor types
...@@ -64,11 +77,13 @@ namespace TransferBench ...@@ -64,11 +77,13 @@ namespace TransferBench
EXE_CPU = 0, ///< CPU executor (subExecutor = CPU thread) EXE_CPU = 0, ///< CPU executor (subExecutor = CPU thread)
EXE_GPU_GFX = 1, ///< GPU kernel-based executor (subExecutor = threadblock/CU) EXE_GPU_GFX = 1, ///< GPU kernel-based executor (subExecutor = threadblock/CU)
EXE_GPU_DMA = 2, ///< GPU SDMA executor (subExecutor = not supported) EXE_GPU_DMA = 2, ///< GPU SDMA executor (subExecutor = not supported)
EXE_IBV = 3, ///< IBVerbs executor (subExecutor = queue pair) EXE_NIC = 3, ///< NIC RDMA executor (subExecutor = queue pair)
EXE_NIC_NEAREST = 4 ///< NIC RDMA nearest executor (subExecutor = queue pair)
}; };
char const ExeTypeStr[5] = "CGDI"; char const ExeTypeStr[6] = "CGDIN";
inline bool IsCpuExeType(ExeType e){ return e == EXE_CPU; } inline bool IsCpuExeType(ExeType e){ return e == EXE_CPU; }
inline bool IsGpuExeType(ExeType e){ return e == EXE_GPU_GFX || e == EXE_GPU_DMA; } inline bool IsGpuExeType(ExeType e){ return e == EXE_GPU_GFX || e == EXE_GPU_DMA; }
inline bool IsNicExeType(ExeType e){ return e == EXE_NIC || e == EXE_NIC_NEAREST; }
/** /**
* A ExeDevice defines a specific Executor * A ExeDevice defines a specific Executor
...@@ -120,11 +135,10 @@ namespace TransferBench ...@@ -120,11 +135,10 @@ namespace TransferBench
*/ */
struct Transfer struct Transfer
{ {
size_t numBytes = (1<<26); ///< Number of bytes to Transfer size_t numBytes = 0; ///< Number of bytes to Transfer
vector<MemDevice> srcs = {}; ///< List of source memory devices vector<MemDevice> srcs = {}; ///< List of source memory devices
vector<MemDevice> dsts = {}; ///< List of destination memory devices vector<MemDevice> dsts = {}; ///< List of destination memory devices
ExeDevice exeDevice = {}; ///< Executor to use ExeDevice exeDevice = {}; ///< Executor to use
int32_t exeDstIndex = -1; ///< Destination executor index (for RDMA executor only)
int32_t exeSubIndex = -1; ///< Executor subindex int32_t exeSubIndex = -1; ///< Executor subindex
int numSubExecs = 0; ///< Number of subExecutors to use for this Transfer int numSubExecs = 0; ///< Number of subExecutors to use for this Transfer
}; };
...@@ -154,15 +168,6 @@ namespace TransferBench ...@@ -154,15 +168,6 @@ namespace TransferBench
int validateSource = 0; ///< Validate src GPU memory immediately after preparation int validateSource = 0; ///< Validate src GPU memory immediately after preparation
}; };
/**
* DMA Executor options
*/
struct DmaOptions
{
int useHipEvents = 1; ///< Use HIP events for timing DMA Executor
int useHsaCopy = 0; ///< Use HSA copy instead of HIP copy to perform DMA
};
/** /**
* GFX Executor options * GFX Executor options
*/ */
...@@ -178,6 +183,33 @@ namespace TransferBench ...@@ -178,6 +183,33 @@ namespace TransferBench
int waveOrder = 0; ///< GFX-kernel wavefront ordering int waveOrder = 0; ///< GFX-kernel wavefront ordering
}; };
/**
* DMA Executor options
*/
struct DmaOptions
{
int useHipEvents = 1; ///< Use HIP events for timing DMA Executor
int useHsaCopy = 0; ///< Use HSA copy instead of HIP copy to perform DMA
};
/**
* NIC Executor options
*/
struct NicOptions
{
vector<int> closestNics = {}; ///< Overrides the auto-detected closest NIC per GPU
int ibGidIndex = -1; ///< GID Index for RoCE NICs (-1 is auto)
uint8_t ibPort = 1; ///< NIC port number to be used
int ipAddressFamily = 4; ///< 4=IPv4, 6=IPv6 (used for auto GID detection)
int maxRecvWorkReq = 16; ///< Maximum number of recv work requests per queue pair
int maxSendWorkReq = 16; ///< Maximum number of send work requests per queue pair
int queueSize = 100; ///< Completion queue size
int roceVersion = 2; ///< RoCE version (used for auto GID detection)
int useRelaxedOrder = 1; ///< Use relaxed ordering
int useNuma = 0; ///< Switch to closest numa thread for execution
};
/** /**
* Configuration options for performing Transfers * Configuration options for performing Transfers
*/ */
...@@ -188,6 +220,7 @@ namespace TransferBench ...@@ -188,6 +220,7 @@ namespace TransferBench
GfxOptions gfx; ///< GFX executor options GfxOptions gfx; ///< GFX executor options
DmaOptions dma; ///< DMA executor options DmaOptions dma; ///< DMA executor options
NicOptions nic; ///< NIC executor options
}; };
/** /**
...@@ -243,6 +276,9 @@ namespace TransferBench ...@@ -243,6 +276,9 @@ namespace TransferBench
// Only filled in if recordPerIteration = 1 // Only filled in if recordPerIteration = 1
vector<double> perIterMsec; ///< Duration for each individual iteration vector<double> perIterMsec; ///< Duration for each individual iteration
vector<set<pair<int,int>>> perIterCUs; ///< GFX-Executor only. XCC:CU used per iteration vector<set<pair<int,int>>> perIterCUs; ///< GFX-Executor only. XCC:CU used per iteration
ExeDevice exeDevice; ///< Tracks which executor performed this Transfer (e.g. for EXE_NIC_NEAREST)
ExeDevice exeDstDevice; ///< Tracks actual destination executor (only valid for EXE_NIC/EXE_NIC_NEAREST)
}; };
/** /**
...@@ -344,6 +380,23 @@ namespace TransferBench ...@@ -344,6 +380,23 @@ namespace TransferBench
*/ */
int GetClosestCpuNumaToGpu(int gpuIndex); int GetClosestCpuNumaToGpu(int gpuIndex);
/**
* Returns the index of the NUMA node closest to the given NIC
*
* @param[in] nicIndex Index of the NIC to query
* @returns NUMA node index closest to the NIC nicIndex, or -1 if unable to detect
*/
int GetClosestCpuNumaToNic(int nicIndex);
/**
* Returns the index of the NIC closest to the given GPU
*
* @param[in] gpuIndex Index of the GPU to query
* @note This function is applicable when the IBV/RDMA executor is available
* @returns IB Verbs capable NIC index closest to GPU gpuIndex, or -1 if unable to detect
*/
int GetClosestNicToGpu(int gpuIndex);
/** /**
* Helper function to parse a line containing Transfers into a vector of Transfers * Helper function to parse a line containing Transfers into a vector of Transfers
* *
...@@ -353,7 +406,6 @@ namespace TransferBench ...@@ -353,7 +406,6 @@ namespace TransferBench
*/ */
ErrResult ParseTransfers(std::string str, ErrResult ParseTransfers(std::string str,
std::vector<Transfer>& transfers); std::vector<Transfer>& transfers);
}; };
//========================================================================================== //==========================================================================================
// End of TransferBench API // End of TransferBench API
...@@ -435,7 +487,7 @@ namespace TransferBench ...@@ -435,7 +487,7 @@ namespace TransferBench
#endif #endif
// Macro for collecting XCC GFX kernel is running on // Macro for collecting XCC GFX kernel is running on
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
#define GetXccId(val) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val)); #define GetXccId(val) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val));
#else #else
#define GetXccId(val) val = 0 #define GetXccId(val) val = 0
...@@ -459,6 +511,45 @@ namespace TransferBench ...@@ -459,6 +511,45 @@ namespace TransferBench
return false; \ return false; \
} while (0) } while (0)
// Helper macros for calling RDMA functions and reporting errors
#ifdef VERBS_DEBUG
#define IBV_CALL(__func__, ...) \
do { \
int error = __func__(__VA_ARGS__); \
if (error != 0) { \
return {ERR_FATAL, "Encountered IbVerbs error (%d) at line (%d) " \
"and function (%s)", (error), __LINE__, #__func__}; \
} \
} while (0)
#define IBV_PTR_CALL(__ptr__, __func__, ...) \
do { \
__ptr__ = __func__(__VA_ARGS__); \
if (__ptr__ == nullptr) { \
return {ERR_FATAL, "Encountered IbVerbs nullptr error at line (%d) " \
"and function (%s)", __LINE__, #__func__}; \
} \
} while (0)
#else
#define IBV_CALL(__func__, ...) \
do { \
int error = __func__(__VA_ARGS__); \
if (error != 0) { \
return {ERR_FATAL, "Encountered IbVerbs error (%d) in func (%s) " \
, error, #__func__}; \
} \
} while (0)
#define IBV_PTR_CALL(__ptr__, __func__, ...) \
do { \
__ptr__ = __func__(__VA_ARGS__); \
if (__ptr__ == nullptr) { \
return {ERR_FATAL, "Encountered IbVerbs nullptr error in func (%s) " \
, #__func__}; \
} \
} while (0)
#endif
namespace TransferBench namespace TransferBench
{ {
...@@ -469,13 +560,14 @@ namespace { ...@@ -469,13 +560,14 @@ namespace {
// Constants // Constants
//======================================================================================== //========================================================================================
int constexpr MAX_BLOCKSIZE = 512; // Max threadblock size
int constexpr MAX_WAVEGROUPS = MAX_BLOCKSIZE / 64; // Max wavegroups/warps int constexpr MAX_BLOCKSIZE = 512; // Max threadblock size
int constexpr MAX_UNROLL = 8; // Max unroll factor int constexpr MAX_WAVEGROUPS = MAX_BLOCKSIZE / 64; // Max wavegroups/warps
int constexpr MAX_SRCS = 8; // Max number of srcs per Transfer int constexpr MAX_UNROLL = 8; // Max unroll factor
int constexpr MAX_DSTS = 8; // Max number of dsts per Transfer int constexpr MAX_SRCS = 8; // Max srcs per Transfer
int constexpr MEMSET_CHAR = 75; // Value to memset (char) int constexpr MAX_DSTS = 8; // Max dsts per Transfer
float constexpr MEMSET_VAL = 13323083.0f; // Value to memset (double) int constexpr MEMSET_CHAR = 75; // Value to memset (char)
float constexpr MEMSET_VAL = 13323083.0f; // Value to memset (double)
// Parsing-related functions // Parsing-related functions
//======================================================================================== //========================================================================================
...@@ -594,7 +686,7 @@ namespace { ...@@ -594,7 +686,7 @@ namespace {
if (mistakeCount > 0) { if (mistakeCount > 0) {
return {ERR_FATAL, return {ERR_FATAL,
"%lu out of %lu pages for memory allocation were not on NUMA node %d." "%lu out of %lu pages for memory allocation were not on NUMA node %d."
" This could be due to hardware memory issues", " This could be due to hardware memory issues, or the use of numa-rebalancing daemons such as numad",
mistakeCount, numPages, targetId}; mistakeCount, numPages, targetId};
} }
return ERR_NONE; return ERR_NONE;
...@@ -744,6 +836,7 @@ namespace { ...@@ -744,6 +836,7 @@ namespace {
break; break;
case EXE_GPU_GFX: case EXE_GPU_DMA: case EXE_GPU_GFX: case EXE_GPU_DMA:
if (exeIndex < 0 || exeIndex >= numGpus) if (exeIndex < 0 || exeIndex >= numGpus)
return {ERR_FATAL, "GPU index must be between 0 and %d inclusively", numGpus - 1}; return {ERR_FATAL, "GPU index must be between 0 and %d inclusively", numGpus - 1};
agent = gpuAgents[exeIndex]; agent = gpuAgents[exeIndex];
break; break;
...@@ -769,6 +862,29 @@ namespace { ...@@ -769,6 +862,29 @@ namespace {
// Setup validation-related functions // Setup validation-related functions
//======================================================================================== //========================================================================================
static ErrResult GetActualExecutor(ConfigOptions const& cfg,
ExeDevice const& origExeDevice,
ExeDevice& actualExeDevice)
{
// By default, nothing needs to change
actualExeDevice = origExeDevice;
// When using NIC_NEAREST, remap to the closest NIC to the GPU
if (origExeDevice.exeType == EXE_NIC_NEAREST) {
actualExeDevice.exeType = EXE_NIC;
if (cfg.nic.closestNics.size() > 0) {
if (origExeDevice.exeIndex < 0 || origExeDevice.exeIndex >= cfg.nic.closestNics.size())
return {ERR_FATAL, "NIC index is out of range (%d)", origExeDevice.exeIndex};
actualExeDevice.exeIndex = cfg.nic.closestNics[origExeDevice.exeIndex];
} else {
actualExeDevice.exeIndex = GetClosestNicToGpu(origExeDevice.exeIndex);
}
}
return ERR_NONE;
}
// Validate that MemDevice exists // Validate that MemDevice exists
static ErrResult CheckMemDevice(MemDevice const& memDevice) static ErrResult CheckMemDevice(MemDevice const& memDevice)
{ {
...@@ -848,6 +964,20 @@ namespace { ...@@ -848,6 +964,20 @@ namespace {
} }
} }
// Check NIC options
#ifdef NIC_EXEC_ENABLED
int numNics = GetNumExecutors(EXE_NIC);
for (auto const& nic : cfg.nic.closestNics)
if (nic < 0 || nic >= numNics)
errors.push_back({ERR_FATAL, "NIC index (%d) in user-specified closest NIC list must be between 0 and %d",
nic, numNics - 1});
size_t closetNicsSize = cfg.nic.closestNics.size();
if (closetNicsSize > 0 && closetNicsSize < numGpus)
errors.push_back({ERR_FATAL, "User-specified closest NIC list must match GPU count of %d",
numGpus});
#endif
// NVIDIA specific // NVIDIA specific
#if defined(__NVCC__) #if defined(__NVCC__)
if (cfg.data.validateDirect) if (cfg.data.validateDirect)
...@@ -881,6 +1011,7 @@ namespace { ...@@ -881,6 +1011,7 @@ namespace {
{ {
int numCpus = GetNumExecutors(EXE_CPU); int numCpus = GetNumExecutors(EXE_CPU);
int numGpus = GetNumExecutors(EXE_GPU_GFX); int numGpus = GetNumExecutors(EXE_GPU_GFX);
int numNics = GetNumExecutors(EXE_NIC);
std::set<ExeDevice> executors; std::set<ExeDevice> executors;
std::map<ExeDevice, int> transferCount; std::map<ExeDevice, int> transferCount;
...@@ -1025,8 +1156,33 @@ namespace { ...@@ -1025,8 +1156,33 @@ namespace {
} }
} }
break; break;
case EXE_IBV: case EXE_NIC:
errors.push_back({ERR_FATAL, "Transfer %d: IBV executor currently not supported", i}); #ifdef NIC_EXEC_ENABLED
{
int srcIndex = t.exeDevice.exeIndex;
int dstIndex = t.exeSubIndex;
if (srcIndex < 0 || srcIndex >= numNics)
errors.push_back({ERR_FATAL, "Transfer %d: src NIC executor indexes an out-of-range NIC (%d)", i, srcIndex});
if (dstIndex < 0 || dstIndex >= numNics)
errors.push_back({ERR_FATAL, "Transfer %d: dst NIC executor indexes an out-of-range NIC (%d)", i, dstIndex});
}
#else
errors.push_back({ERR_FATAL, "Transfer %d: NIC executor is requested but is not available", i});
#endif
break;
case EXE_NIC_NEAREST:
#ifdef NIC_EXEC_ENABLED
{
ExeDevice srcExeDevice;
ErrResult errSrc = GetActualExecutor(cfg, t.exeDevice, srcExeDevice);
if (errSrc.errType != ERR_NONE) errors.push_back(errSrc);
ExeDevice dstExeDevice;
ErrResult errDst = GetActualExecutor(cfg, {t.exeDevice.exeType, t.exeSubIndex}, dstExeDevice);
if (errDst.errType != ERR_NONE) errors.push_back(errDst);
}
#else
errors.push_back({ERR_FATAL, "Transfer %d: NIC executor is requested but is not available", i});
#endif
break; break;
} }
...@@ -1106,7 +1262,6 @@ namespace { ...@@ -1106,7 +1262,6 @@ namespace {
} }
} }
// Check for fatal errors // Check for fatal errors
for (auto const& err : errors) for (auto const& err : errors)
if (err.errType == ERR_FATAL) return true; if (err.errType == ERR_FATAL) return true;
...@@ -1147,6 +1302,7 @@ namespace { ...@@ -1147,6 +1302,7 @@ namespace {
vector<float*> dstMem; ///< Destination memory vector<float*> dstMem; ///< Destination memory
vector<SubExecParam> subExecParamCpu; ///< Defines subarrays for each subexecutor vector<SubExecParam> subExecParamCpu; ///< Defines subarrays for each subexecutor
vector<int> subExecIdx; ///< Indices into subExecParamGpu vector<int> subExecIdx; ///< Indices into subExecParamGpu
int numaNode; ///< NUMA node to use for this Transfer
// For GFX executor // For GFX executor
SubExecParam* subExecParamGpuPtr; SubExecParam* subExecParamGpuPtr;
...@@ -1159,6 +1315,29 @@ namespace { ...@@ -1159,6 +1315,29 @@ namespace {
hsa_amd_sdma_engine_id_t sdmaEngineId; ///< DMA engine ID hsa_amd_sdma_engine_id_t sdmaEngineId; ///< DMA engine ID
#endif #endif
// For IBV executor
#ifdef NIC_EXEC_ENABLED
int srcNicIndex; ///< SRC NIC index
int dstNicIndex; ///< DST NIC index
ibv_context* srcContext; ///< Device context for SRC NIC
ibv_context* dstContext; ///< Device context for DST NIC
ibv_pd* srcProtect; ///< Protection domain for SRC NIC
ibv_pd* dstProtect; ///< Protection domain for DST NIC
ibv_cq* srcCompQueue; ///< Completion queue for SRC NIC
ibv_cq* dstCompQueue; ///< Completion queue for DST NIC
ibv_port_attr srcPortAttr; ///< Port attributes for SRC NIC
ibv_port_attr dstPortAttr; ///< Port attributes for DST NIC
ibv_gid srcGid; ///< GID handle for SRC NIC
ibv_gid dstGid; ///< GID handle for DST NIC
vector<ibv_qp*> srcQueuePairs; ///< Queue pairs for SRC NIC
vector<ibv_qp*> dstQueuePairs; ///< Queue pairs for DST NIC
ibv_mr* srcMemRegion; ///< Memory region for SRC
ibv_mr* dstMemRegion; ///< Memory region for DST
uint8_t qpCount; ///< Number of QPs to be used for transferring data
vector<ibv_sge> sgePerQueuePair; ///< Scatter-gather elements per queue pair
vector<ibv_send_wr> sendWorkRequests; ///< Send work requests per queue pair
#endif
// Counters // Counters
double totalDurationMsec; ///< Total duration for all iterations for this Transfer double totalDurationMsec; ///< Total duration for all iterations for this Transfer
vector<double> perIterMsec; ///< Duration for each individual iteration vector<double> perIterMsec; ///< Duration for each individual iteration
...@@ -1173,7 +1352,6 @@ namespace { ...@@ -1173,7 +1352,6 @@ namespace {
int totalSubExecs; ///< Total number of subExecutors to use int totalSubExecs; ///< Total number of subExecutors to use
bool useSubIndices; ///< Use subexecutor indicies bool useSubIndices; ///< Use subexecutor indicies
int numSubIndices; ///< Number of subindices this ExeDevice has int numSubIndices; ///< Number of subindices this ExeDevice has
int wallClockRate; ///< (GFX-only) Device wall clock rate
vector<SubExecParam> subExecParamCpu; ///< Subexecutor parameters for this executor vector<SubExecParam> subExecParamCpu; ///< Subexecutor parameters for this executor
vector<TransferResources> resources; ///< Per-Transfer resources vector<TransferResources> resources; ///< Per-Transfer resources
...@@ -1182,8 +1360,750 @@ namespace { ...@@ -1182,8 +1360,750 @@ namespace {
vector<hipStream_t> streams; ///< HIP streams to launch on vector<hipStream_t> streams; ///< HIP streams to launch on
vector<hipEvent_t> startEvents; ///< HIP start timing event vector<hipEvent_t> startEvents; ///< HIP start timing event
vector<hipEvent_t> stopEvents; ///< HIP stop timing event vector<hipEvent_t> stopEvents; ///< HIP stop timing event
int wallClockRate; ///< (GFX-only) Device wall clock rate
}; };
// Structure to track PCIe topology
struct PCIeNode
{
std::string address; ///< PCIe address for this PCIe node
std::string description; ///< Description for this PCIe node
std::set<PCIeNode> children; ///< Children PCIe nodes
// Default constructor
PCIeNode() : address(""), description("") {}
// Constructor
PCIeNode(std::string const& addr) : address(addr) {}
// Constructor
PCIeNode(std::string const& addr, std::string const& desc)
:address(addr), description(desc) {}
// Comparison operator for std::set
bool operator<(PCIeNode const& other) const {
return address < other.address;
}
};
#ifdef NIC_EXEC_ENABLED
// Structure to track information about IBV devices
struct IbvDevice
{
ibv_device* devicePtr;
std::string name;
std::string busId;
bool hasActivePort;
int numaNode;
};
#endif
#ifdef NIC_EXEC_ENABLED
// Function to collect information about IBV devices
//========================================================================================
static vector<IbvDevice>& GetIbvDeviceList()
{
static bool isInitialized = false;
static vector<IbvDevice> ibvDeviceList = {};
// Build list on first use
if (!isInitialized) {
// Query the number of IBV devices
int numIbvDevices = 0;
ibv_device** deviceList = ibv_get_device_list(&numIbvDevices);
if (deviceList && numIbvDevices > 0) {
// Loop over each device to collect information
for (int i = 0; i < numIbvDevices; i++) {
IbvDevice ibvDevice;
ibvDevice.devicePtr = deviceList[i];
ibvDevice.name = deviceList[i]->name;
ibvDevice.hasActivePort = false;
{
struct ibv_context *context = ibv_open_device(ibvDevice.devicePtr);
if (context) {
struct ibv_device_attr deviceAttr;
if (!ibv_query_device(context, &deviceAttr)) {
for (int port = 1; port <= deviceAttr.phys_port_cnt; ++port) {
struct ibv_port_attr portAttr;
if (ibv_query_port(context, port, &portAttr)) continue;
if (portAttr.state == IBV_PORT_ACTIVE)
ibvDevice.hasActivePort = true;
break;
}
}
ibv_close_device(context);
}
}
ibvDevice.busId = "";
{
std::string device_path(ibvDevice.devicePtr->dev_path);
if (std::filesystem::exists(device_path)) {
std::string pciPath = std::filesystem::canonical(device_path + "/device").string();
std::size_t pos = pciPath.find_last_of('/');
if (pos != std::string::npos) {
ibvDevice.busId = pciPath.substr(pos + 1);
}
}
}
// Get nearest numa node for this device
ibvDevice.numaNode = -1;
std::filesystem::path devicePath = "/sys/bus/pci/devices/" + ibvDevice.busId + "/numa_node";
std::string canonicalPath = std::filesystem::canonical(devicePath).string();
if (std::filesystem::exists(canonicalPath)) {
std::ifstream file(canonicalPath);
if (file.is_open()) {
std::string numaNodeStr;
std::getline(file, numaNodeStr);
int numaNodeVal;
if (sscanf(numaNodeStr.c_str(), "%d", &numaNodeVal) == 1)
ibvDevice.numaNode = numaNodeVal;
file.close();
}
}
ibvDeviceList.push_back(ibvDevice);
}
}
ibv_free_device_list(deviceList);
isInitialized = true;
}
return ibvDeviceList;
}
#endif // NIC_EXEC_ENABLED
#ifdef NIC_EXEC_ENABLED
// PCIe-related functions
//========================================================================================
// Prints off PCIe tree
static void PrintPCIeTree(PCIeNode const& node,
std::string const& prefix = "",
bool isLast = true)
{
if (!node.address.empty()) {
printf("%s%s%s", prefix.c_str(), (isLast ? "└── " : "├── "), node.address.c_str());
if (!node.description.empty()) {
printf("(%s)", node.description.c_str());
}
printf("\n");
}
auto const& children = node.children;
for (auto it = children.begin(); it != children.end(); ++it) {
PrintPCIeTree(*it, prefix + (isLast ? " " : "│ "), std::next(it) == children.end());
}
}
// Inserts nodes along pcieAddress down a tree starting from root
static ErrResult InsertPCIePathToTree(std::string const& pcieAddress,
std::string const& description,
PCIeNode& root)
{
std::filesystem::path devicePath = "/sys/bus/pci/devices/" + pcieAddress;
std::string canonicalPath = std::filesystem::canonical(devicePath).string();
if (!std::filesystem::exists(devicePath)) {
return {ERR_FATAL, "Device path %s does not exist", devicePath.c_str()};
}
std::istringstream iss(canonicalPath);
std::string token;
PCIeNode* currNode = &root;
while (std::getline(iss, token, '/')) {
auto it = (currNode->children.insert(PCIeNode(token))).first;
currNode = const_cast<PCIeNode*>(&(*it));
}
currNode->description = description;
return ERR_NONE;
}
// Returns root node for PCIe tree. Constructed on first use
static PCIeNode* GetPCIeTreeRoot()
{
static bool isInitialized = false;
static PCIeNode pcieRoot;
// Build PCIe tree on first use
if (!isInitialized) {
// Add NICs to the tree
int numNics = GetNumExecutors(EXE_NIC);
auto const& ibvDeviceList = GetIbvDeviceList();
for (IbvDevice const& ibvDevice : ibvDeviceList) {
if (!ibvDevice.hasActivePort || ibvDevice.busId == "") continue;
InsertPCIePathToTree(ibvDevice.busId, ibvDevice.name, pcieRoot);
}
// Add GPUs to the tree
int numGpus = GetNumExecutors(EXE_GPU_GFX);
for (int i = 0; i < numGpus; ++i) {
char hipPciBusId[64];
if (hipDeviceGetPCIBusId(hipPciBusId, sizeof(hipPciBusId), i) == hipSuccess) {
InsertPCIePathToTree(hipPciBusId, "GPU " + std::to_string(i), pcieRoot);
}
}
#ifdef VERBS_DEBUG
PrintPCIeTree(pcieRoot);
#endif
isInitialized = true;
}
return &pcieRoot;
}
// Finds the lowest common ancestor in PCIe tree between two nodes
static PCIeNode const* GetLcaBetweenNodes(PCIeNode const* root,
std::string const& node1Address,
std::string const& node2Address)
{
if (!root || root->address == node1Address || root->address == node2Address)
return root;
PCIeNode const* lcaFound1 = nullptr;
PCIeNode const* lcaFound2 = nullptr;
// Recursively iterate over children
for (auto const& child : root->children) {
PCIeNode const* lca = GetLcaBetweenNodes(&child, node1Address, node2Address);
if (!lca) continue;
if (!lcaFound1) {
// First time found
lcaFound1 = lca;
} else {
// Second time found
lcaFound2 = lca;
break;
}
}
// If two children were found, then current node is the lowest common ancestor
return (lcaFound1 && lcaFound2) ? root : lcaFound1;
}
// Gets the depth of an node in the PCIe tree
static int GetLcaDepth(std::string const& targetBusID,
PCIeNode const* const& node,
int depth = 0)
{
if (!node) return -1;
if (targetBusID == node->address) return depth;
for (auto const& child : node->children) {
int distance = GetLcaDepth(targetBusID, &child, depth + 1);
if (distance != -1)
return distance;
}
return -1;
}
// Function to extract the bus number from a PCIe address (domain:bus:device.function)
static int ExtractBusNumber(std::string const& pcieAddress)
{
int domain, bus, device, function;
char delimiter;
std::istringstream iss(pcieAddress);
iss >> std::hex >> domain >> delimiter >> bus >> delimiter >> device >> delimiter >> function;
if (iss.fail()) {
#ifdef VERBS_DEBUG
printf("Invalid PCIe address format: %s\n", pcieAddress.c_str());
#endif
return -1;
}
return bus;
}
// Function to compute the distance between two bus IDs
static int GetBusIdDistance(std::string const& pcieAddress1,
std::string const& pcieAddress2)
{
int bus1 = ExtractBusNumber(pcieAddress1);
int bus2 = ExtractBusNumber(pcieAddress2);
return (bus1 < 0 || bus2 < 0) ? -1 : std::abs(bus1 - bus2);
}
// Given a target busID and a set of candidate devices, returns a set of indices
// that is "closest" to the target
static std::set<int> GetNearestDevicesInTree(std::string const& targetBusId,
std::vector<std::string> const& candidateBusIdList)
{
int maxDepth = -1;
int minDistance = std::numeric_limits<int>::max();
std::set<int> matches = {};
// Loop over the candidates to find the ones with the lowest common ancestor (LCA)
for (int i = 0; i < candidateBusIdList.size(); i++) {
std::string const& candidateBusId = candidateBusIdList[i];
if (candidateBusId == "") continue;
PCIeNode const* lca = GetLcaBetweenNodes(GetPCIeTreeRoot(), targetBusId, candidateBusId);
if (!lca) continue;
int depth = GetLcaDepth(lca->address, GetPCIeTreeRoot());
int currDistance = GetBusIdDistance(targetBusId, candidateBusId);
// When more than one LCA match is found, choose the one with smallest busId difference
// NOTE: currDistance could be -1, which signals problem with parsing, however still
// remains a valid "closest" candidate, so is included
if (depth > maxDepth || (depth == maxDepth && depth >= 0 && currDistance < minDistance)) {
maxDepth = depth;
matches.clear();
matches.insert(i);
minDistance = currDistance;
} else if (depth == maxDepth && depth >= 0 && currDistance == minDistance) {
matches.insert(i);
}
}
return matches;
}
#endif // NIC_EXEC_ENABLED
#ifdef NIC_EXEC_ENABLED
// IB Verbs-related functions
//========================================================================================
// Create a queue pair
static ErrResult CreateQueuePair(ConfigOptions const& cfg,
struct ibv_pd* pd,
struct ibv_cq* cq,
struct ibv_qp*& qp)
{
// Set queue pair attributes
struct ibv_qp_init_attr attr = {};
attr.qp_type = IBV_QPT_RC; // Set type to reliable connection
attr.send_cq = cq; // Send completion queue
attr.recv_cq = cq; // Recv completion queue
attr.cap.max_send_wr = cfg.nic.maxSendWorkReq; // Max send work requests
attr.cap.max_recv_wr = cfg.nic.maxRecvWorkReq; // Max recv work requests
attr.cap.max_send_sge = 1; // Max send scatter-gather entries
attr.cap.max_recv_sge = 1; // Max recv scatter-gather entries
qp = ibv_create_qp(pd, &attr);
if (qp == NULL)
return {ERR_FATAL, "Error while creating QP"};
return ERR_NONE;
}
// Initialize a queue pair
static ErrResult InitQueuePair(struct ibv_qp* qp,
uint8_t port,
unsigned flags)
{
struct ibv_qp_attr attr = {}; // Clear all attributes
attr.qp_state = IBV_QPS_INIT; // Set the QP state to INIT
attr.pkey_index = 0; // Set the partition key index to 0
attr.port_num = port; // Set the port number to the defined IB_PORT
attr.qp_access_flags = flags; // Set the QP access flags to the provided flags
int ret = ibv_modify_qp(qp, &attr,
IBV_QP_STATE | // Modify the QP state
IBV_QP_PKEY_INDEX | // Modify the partition key index
IBV_QP_PORT | // Modify the port number
IBV_QP_ACCESS_FLAGS); // Modify the access flags
if (ret != 0)
return {ERR_FATAL, "Error during QP Init. IB Verbs Error code: %d", ret};
return ERR_NONE;
}
// Transition QueuePair to Ready to Receive State
static ErrResult TransitionQpToRtr(ibv_qp* qp,
uint16_t const& dlid,
uint32_t const& dqpn,
ibv_gid const& gid,
uint8_t const& gidIndex,
uint8_t const& port,
bool const& isRoCE,
ibv_mtu const& mtu)
{
// Prepare QP attributes
struct ibv_qp_attr attr = {};
attr.qp_state = IBV_QPS_RTR;
attr.path_mtu = mtu;
attr.rq_psn = 0;
attr.max_dest_rd_atomic = 1;
attr.min_rnr_timer = 12;
if (isRoCE) {
attr.ah_attr.is_global = 1;
attr.ah_attr.grh.dgid.global.subnet_prefix = gid.global.subnet_prefix;
attr.ah_attr.grh.dgid.global.interface_id = gid.global.interface_id;
attr.ah_attr.grh.flow_label = 0;
attr.ah_attr.grh.sgid_index = gidIndex;
attr.ah_attr.grh.hop_limit = 255;
} else {
attr.ah_attr.is_global = 0;
attr.ah_attr.dlid = dlid;
}
attr.ah_attr.sl = 0;
attr.ah_attr.src_path_bits = 0;
attr.ah_attr.port_num = port;
attr.dest_qp_num = dqpn;
// Modify the QP
int ret = ibv_modify_qp(qp, &attr,
IBV_QP_STATE |
IBV_QP_AV |
IBV_QP_PATH_MTU |
IBV_QP_DEST_QPN |
IBV_QP_RQ_PSN |
IBV_QP_MAX_DEST_RD_ATOMIC |
IBV_QP_MIN_RNR_TIMER);
if (ret != 0)
return {ERR_FATAL, "Error during QP RTR. IB Verbs Error code: %d", ret};
return ERR_NONE;
}
// Transition QueuePair to Ready to Send state
static ErrResult TransitionQpToRts(struct ibv_qp *qp)
{
struct ibv_qp_attr attr = {};
attr.qp_state = IBV_QPS_RTS;
attr.sq_psn = 0;
attr.timeout = 14;
attr.retry_cnt = 7;
attr.rnr_retry = 7;
attr.max_rd_atomic = 1;
int ret = ibv_modify_qp(qp, &attr,
IBV_QP_STATE |
IBV_QP_TIMEOUT |
IBV_QP_RETRY_CNT |
IBV_QP_RNR_RETRY |
IBV_QP_SQ_PSN |
IBV_QP_MAX_QP_RD_ATOMIC);
if (ret != 0)
return {ERR_FATAL, "Error during QP RTS. IB Verbs Error code: %d", ret};
return ERR_NONE;
}
static bool IsConfiguredGid(union ibv_gid* gid)
{
const struct in6_addr *a = (struct in6_addr *)gid->raw;
int trailer = (a->s6_addr32[1] | a->s6_addr32[2] | a->s6_addr32[3]);
if (((a->s6_addr32[0] | trailer) == 0UL) ||
((a->s6_addr32[0] == htonl(0xfe800000)) && (trailer == 0UL))) {
return false;
}
return true;
}
static bool LinkLocalGid(union ibv_gid* gid)
{
const struct in6_addr *a = (struct in6_addr *)gid->raw;
if (a->s6_addr32[0] == htonl(0xfe800000) && a->s6_addr32[1] == 0UL) {
return true;
}
return false;
}
static bool IsValidGid(union ibv_gid* gid)
{
return (IsConfiguredGid(gid) && !LinkLocalGid(gid));
}
static sa_family_t GetGidAddressFamily(union ibv_gid* gid)
{
const struct in6_addr *a = (struct in6_addr *)gid->raw;
bool isIpV4Mapped = ((a->s6_addr32[0] | a->s6_addr32[1]) |
(a->s6_addr32[2] ^ htonl(0x0000ffff))) == 0UL;
bool isIpV4MappedMulticast = (a->s6_addr32[0] == htonl(0xff0e0000) &&
((a->s6_addr32[1] |
(a->s6_addr32[2] ^ htonl(0x0000ffff))) == 0UL));
return (isIpV4Mapped || isIpV4MappedMulticast) ? AF_INET : AF_INET6;
}
static bool MatchGidAddressFamily(sa_family_t const& af,
void* prefix,
int prefixLen,
union ibv_gid* gid)
{
struct in_addr *base = NULL;
struct in6_addr *base6 = NULL;
struct in6_addr *addr6 = NULL;;
if (af == AF_INET) {
base = (struct in_addr *)prefix;
} else {
base6 = (struct in6_addr *)prefix;
}
addr6 = (struct in6_addr *)gid->raw;
#define NETMASK(bits) (htonl(0xffffffff ^ ((1 << (32 - bits)) - 1)))
int i = 0;
while (prefixLen > 0 && i < 4) {
if (af == AF_INET) {
int mask = NETMASK(prefixLen);
if ((base->s_addr & mask) ^ (addr6->s6_addr32[3] & mask))
break;
prefixLen = 0;
break;
} else {
if (prefixLen >= 32) {
if (base6->s6_addr32[i] ^ addr6->s6_addr32[i])
break;
prefixLen -= 32;
++i;
} else {
int mask = NETMASK(prefixLen);
if ((base6->s6_addr32[i] & mask) ^ (addr6->s6_addr32[i] & mask))
break;
prefixLen = 0;
}
}
}
return (prefixLen == 0) ? true : false;
#undef NETMASK
}
static ErrResult GetRoceVersionNumber(struct ibv_context* const& context,
int const& portNum,
int const& gidIndex,
int* version)
{
char const* deviceName = ibv_get_device_name(context->device);
char gidRoceVerStr[16] = {};
char roceTypePath[PATH_MAX] = {};
sprintf(roceTypePath, "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d",
deviceName, portNum, gidIndex);
int fd = open(roceTypePath, O_RDONLY);
if (fd == -1)
return {ERR_FATAL, "Failed while opening RoCE file path (%s)", roceTypePath};
int ret = read(fd, gidRoceVerStr, 15);
close(fd);
if (ret == -1)
return {ERR_FATAL, "Failed while reading RoCE version"};
if (strlen(gidRoceVerStr)) {
if (strncmp(gidRoceVerStr, "IB/RoCE v1", strlen("IB/RoCE v1")) == 0
|| strncmp(gidRoceVerStr, "RoCE v1", strlen("RoCE v1")) == 0) {
*version = 1;
}
else if (strncmp(gidRoceVerStr, "RoCE v2", strlen("RoCE v2")) == 0) {
*version = 2;
}
}
return ERR_NONE;
}
static ErrResult GetGidIndex(ConfigOptions const& cfg,
struct ibv_context* context,
int const& gidTblLen,
int& gidIndex)
{
// Use GID index if user specified
if (gidIndex >= 0) return ERR_NONE;
// Try to find the best GID index
int port = cfg.nic.ibPort;
sa_family_t targetAddrFam = (cfg.nic.ipAddressFamily == 6)? AF_INET6 : AF_INET;
int targetRoCEVer = cfg.nic.roceVersion;
// Initially assume gidIndex = 0
int gidIndexCurr = 0;
union ibv_gid gidCurr;
IBV_CALL(ibv_query_gid, context, port, gidIndexCurr, &gidCurr);
sa_family_t gidCurrFam = GetGidAddressFamily(&gidCurr);
bool gidCurrIsValid = IsValidGid(&gidCurr);
int gidCurrRoceVersion;
ERR_CHECK(GetRoceVersionNumber(context, port, gidIndexCurr, &gidCurrRoceVersion));
// Loop over GID table to find the best match
for (int gidIndexTest = 1; gidIndexTest < gidTblLen; ++gidIndexTest) {
union ibv_gid gidTest;
IBV_CALL(ibv_query_gid, context, cfg.nic.ibPort, gidIndexTest, &gidTest);
if (!IsValidGid(&gidTest)) continue;
sa_family_t gidTestFam = GetGidAddressFamily(&gidTest);
bool gidTestMatchSubnet = MatchGidAddressFamily(targetAddrFam, NULL, 0, &gidTest);
int gidTestRoceVersion;
ERR_CHECK(GetRoceVersionNumber(context, port, gidIndexTest, &gidTestRoceVersion));
if (!gidCurrIsValid ||
(gidTestFam == targetAddrFam && gidTestMatchSubnet &&
(gidCurrFam != targetAddrFam || gidTestRoceVersion == targetRoCEVer))) {
// Switch to better match
gidIndexCurr = gidIndexTest;
gidCurrFam = gidTestFam;
gidCurrIsValid = true;
gidCurrRoceVersion = gidTestRoceVersion;
}
}
gidIndex = gidIndexCurr;
return ERR_NONE;
}
static ErrResult PrepareNicTransferResources(ConfigOptions const& cfg,
ExeDevice const& srcExeDevice,
Transfer const& t,
TransferResources& rss)
{
// Switch to the closest NUMA node to this NIC
int numaNode = GetIbvDeviceList()[srcExeDevice.exeIndex].numaNode;
if (numaNode != -1)
numa_run_on_node(numaNode);
int const port = cfg.nic.ibPort;
// Figure out destination NIC (Accounts for possible remap due to use of EXE_NIC_NEAREST)
ExeDevice dstExeDevice;
ERR_CHECK(GetActualExecutor(cfg, {t.exeDevice.exeType, t.exeSubIndex}, dstExeDevice));
rss.srcNicIndex = srcExeDevice.exeIndex;
rss.dstNicIndex = dstExeDevice.exeIndex;
rss.qpCount = t.numSubExecs;
// Check for valid NICs and active ports
int numNics = GetNumExecutors(EXE_NIC);
if (rss.srcNicIndex < 0 || rss.srcNicIndex >= numNics)
return {ERR_FATAL, "SRC NIC index is out of range (%d)", rss.srcNicIndex};
if (rss.dstNicIndex < 0 || rss.dstNicIndex >= numNics)
return {ERR_FATAL, "DST NIC index is out of range (%d)", rss.dstNicIndex};
if (!GetIbvDeviceList()[rss.srcNicIndex].hasActivePort)
return {ERR_FATAL, "SRC NIC %d is not active\n", rss.srcNicIndex};
if (!GetIbvDeviceList()[rss.dstNicIndex].hasActivePort)
return {ERR_FATAL, "DST NIC %d is not active\n", rss.dstNicIndex};
// Queue pair flags
unsigned int rdmaAccessFlags = (IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_READ |
IBV_ACCESS_REMOTE_WRITE |
IBV_ACCESS_REMOTE_ATOMIC);
unsigned int rdmaMemRegFlags = rdmaAccessFlags;
if (cfg.nic.useRelaxedOrder) rdmaMemRegFlags |= IBV_ACCESS_RELAXED_ORDERING;
// Open NIC contexts
IBV_PTR_CALL(rss.srcContext, ibv_open_device, GetIbvDeviceList()[rss.srcNicIndex].devicePtr);
IBV_PTR_CALL(rss.dstContext, ibv_open_device, GetIbvDeviceList()[rss.dstNicIndex].devicePtr);
// Open protection domains
IBV_PTR_CALL(rss.srcProtect, ibv_alloc_pd, rss.srcContext);
IBV_PTR_CALL(rss.dstProtect, ibv_alloc_pd, rss.dstContext);
// Register memory region
IBV_PTR_CALL(rss.srcMemRegion, ibv_reg_mr, rss.srcProtect, rss.srcMem[0], rss.numBytes, rdmaMemRegFlags);
IBV_PTR_CALL(rss.dstMemRegion, ibv_reg_mr, rss.dstProtect, rss.dstMem[0], rss.numBytes, rdmaMemRegFlags);
// Create completion queues
IBV_PTR_CALL(rss.srcCompQueue, ibv_create_cq, rss.srcContext, cfg.nic.queueSize, NULL, NULL, 0);
IBV_PTR_CALL(rss.dstCompQueue, ibv_create_cq, rss.dstContext, cfg.nic.queueSize, NULL, NULL, 0);
// Get port attributes
IBV_CALL(ibv_query_port, rss.srcContext, port, &rss.srcPortAttr);
IBV_CALL(ibv_query_port, rss.dstContext, port, &rss.dstPortAttr);
if (rss.srcPortAttr.link_layer != rss.dstPortAttr.link_layer)
return {ERR_FATAL, "SRC NIC (%d) and DST NIC (%d) do not have the same link layer", rss.srcNicIndex, rss.dstNicIndex};
// Prepare GID index
int srcGidIndex = cfg.nic.ibGidIndex;
int dstGidIndex = cfg.nic.ibGidIndex;
// Check for RDMA over Converged Ethernet (RoCE) and update GID index appropriately
bool isRoCE = (rss.srcPortAttr.link_layer == IBV_LINK_LAYER_ETHERNET);
if (isRoCE) {
// Try to auto-detect the GID index
ERR_CHECK(GetGidIndex(cfg, rss.srcContext, rss.srcPortAttr.gid_tbl_len, srcGidIndex));
ERR_CHECK(GetGidIndex(cfg, rss.dstContext, rss.dstPortAttr.gid_tbl_len, dstGidIndex));
IBV_CALL(ibv_query_gid, rss.srcContext, port, srcGidIndex, &rss.srcGid);
IBV_CALL(ibv_query_gid, rss.dstContext, port, dstGidIndex, &rss.dstGid);
}
// Prepare queue pairs and send elements
rss.srcQueuePairs.resize(rss.qpCount);
rss.dstQueuePairs.resize(rss.qpCount);
rss.sgePerQueuePair.resize(rss.qpCount);
rss.sendWorkRequests.resize(rss.qpCount);
for (int i = 0; i < rss.qpCount; ++i) {
// Create scatter-gather element for the portion of memory assigned to this queue pair
ibv_sge sg = {};
sg.addr = (uint64_t)rss.subExecParamCpu[i].src[0];
sg.length = rss.subExecParamCpu[i].N * sizeof(float);
sg.lkey = rss.srcMemRegion->lkey;
rss.sgePerQueuePair[i] = sg;
// Create send work request
ibv_send_wr wr = {};
wr.wr_id = i;
wr.sg_list = &rss.sgePerQueuePair[i];
wr.num_sge = 1;
wr.opcode = IBV_WR_RDMA_WRITE;
wr.send_flags = IBV_SEND_SIGNALED;
wr.wr.rdma.remote_addr = (uint64_t)rss.subExecParamCpu[i].dst[0];
wr.wr.rdma.rkey = rss.dstMemRegion->rkey;
rss.sendWorkRequests[i] = wr;
// Create SRC/DST queue pairs
ERR_CHECK(CreateQueuePair(cfg, rss.srcProtect, rss.srcCompQueue, rss.srcQueuePairs[i]));
ERR_CHECK(CreateQueuePair(cfg, rss.dstProtect, rss.dstCompQueue, rss.dstQueuePairs[i]));
// Initialize SRC/DST queue pairs
ERR_CHECK(InitQueuePair(rss.srcQueuePairs[i], port, rdmaAccessFlags));
ERR_CHECK(InitQueuePair(rss.dstQueuePairs[i], port, rdmaAccessFlags));
// Transition the SRC queue pair to ready to receive
ERR_CHECK(TransitionQpToRtr(rss.srcQueuePairs[i], rss.dstPortAttr.lid,
rss.dstQueuePairs[i]->qp_num, rss.dstGid,
dstGidIndex, port, isRoCE,
rss.srcPortAttr.active_mtu));
// Transition the SRC queue pair to ready to send
ERR_CHECK(TransitionQpToRts(rss.srcQueuePairs[i]));
// Transition the DST queue pair to ready to receive
ERR_CHECK(TransitionQpToRtr(rss.dstQueuePairs[i], rss.srcPortAttr.lid,
rss.srcQueuePairs[i]->qp_num, rss.srcGid,
srcGidIndex, port, isRoCE,
rss.dstPortAttr.active_mtu));
// Transition the DST queue pair to ready to send
ERR_CHECK(TransitionQpToRts(rss.dstQueuePairs[i]));
}
return ERR_NONE;
}
static ErrResult TeardownNicTransferResources(TransferResources& rss)
{
// Deregister memory regions
IBV_CALL(ibv_dereg_mr, rss.srcMemRegion);
IBV_CALL(ibv_dereg_mr, rss.dstMemRegion);
// Destroy queue pairs
for (auto srcQueuePair : rss.srcQueuePairs)
IBV_CALL(ibv_destroy_qp, srcQueuePair);
rss.srcQueuePairs.clear();
for (auto dstQueuePair : rss.dstQueuePairs)
IBV_CALL(ibv_destroy_qp, dstQueuePair);
rss.dstQueuePairs.clear();
// Destroy completion queues
IBV_CALL(ibv_destroy_cq, rss.srcCompQueue);
IBV_CALL(ibv_destroy_cq, rss.dstCompQueue);
// Deallocate protection domains
IBV_CALL(ibv_dealloc_pd, rss.srcProtect);
IBV_CALL(ibv_dealloc_pd, rss.dstProtect);
// Destroy context
IBV_CALL(ibv_close_device, rss.srcContext);
IBV_CALL(ibv_close_device, rss.dstContext);
return ERR_NONE;
}
#endif // NIC_EXEC_ENABLED
// Data validation-related functions // Data validation-related functions
//======================================================================================== //========================================================================================
...@@ -1248,17 +2168,17 @@ namespace { ...@@ -1248,17 +2168,17 @@ namespace {
float* output; float* output;
size_t initOffset = cfg.data.byteOffset / sizeof(float); size_t initOffset = cfg.data.byteOffset / sizeof(float);
for (auto resource : transferResources) { for (auto rss : transferResources) {
int transferIdx = resource->transferIdx; int transferIdx = rss->transferIdx;
Transfer const& t = transfers[transferIdx]; Transfer const& t = transfers[transferIdx];
size_t N = t.numBytes / sizeof(float); size_t N = t.numBytes / sizeof(float);
float const* expected = dstReference[t.srcs.size()].data(); float const* expected = dstReference[t.srcs.size()].data();
for (int dstIdx = 0; dstIdx < resource->dstMem.size(); dstIdx++) { for (int dstIdx = 0; dstIdx < rss->dstMem.size(); dstIdx++) {
if (IsCpuMemType(t.dsts[dstIdx].memType) || cfg.data.validateDirect) { if (IsCpuMemType(t.dsts[dstIdx].memType) || cfg.data.validateDirect) {
output = (resource->dstMem[dstIdx]) + initOffset; output = (rss->dstMem[dstIdx]) + initOffset;
} else { } else {
ERR_CHECK(hipMemcpy(outputBuffer.data(), (resource->dstMem[dstIdx]) + initOffset, t.numBytes, hipMemcpyDefault)); ERR_CHECK(hipMemcpy(outputBuffer.data(), (rss->dstMem[dstIdx]) + initOffset, t.numBytes, hipMemcpyDefault));
ERR_CHECK(hipDeviceSynchronize()); ERR_CHECK(hipDeviceSynchronize());
output = outputBuffer.data(); output = outputBuffer.data();
} }
...@@ -1286,7 +2206,7 @@ namespace { ...@@ -1286,7 +2206,7 @@ namespace {
// Initializes counters // Initializes counters
static ErrResult PrepareSubExecParams(ConfigOptions const& cfg, static ErrResult PrepareSubExecParams(ConfigOptions const& cfg,
Transfer const& transfer, Transfer const& transfer,
TransferResources& resources) TransferResources& rss)
{ {
// Each subExecutor needs to know src/dst pointers and how many elements to transfer // Each subExecutor needs to know src/dst pointers and how many elements to transfer
// Figure out the sub-array each subExecutor works on for this Transfer // Figure out the sub-array each subExecutor works on for this Transfer
...@@ -1300,15 +2220,15 @@ namespace { ...@@ -1300,15 +2220,15 @@ namespace {
int const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple, int const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple,
(size_t)transfer.numSubExecs); (size_t)transfer.numSubExecs);
vector<SubExecParam>& subExecParam = resources.subExecParamCpu; vector<SubExecParam>& subExecParam = rss.subExecParamCpu;
subExecParam.clear(); subExecParam.clear();
subExecParam.resize(transfer.numSubExecs); subExecParam.resize(transfer.numSubExecs);
size_t assigned = 0; size_t assigned = 0;
for (int i = 0; i < transfer.numSubExecs; ++i) { for (int i = 0; i < transfer.numSubExecs; ++i) {
SubExecParam& p = subExecParam[i]; SubExecParam& p = subExecParam[i];
p.numSrcs = resources.srcMem.size(); p.numSrcs = rss.srcMem.size();
p.numDsts = resources.dstMem.size(); p.numDsts = rss.dstMem.size();
p.startCycle = 0; p.startCycle = 0;
p.stopCycle = 0; p.stopCycle = 0;
p.hwId = 0; p.hwId = 0;
...@@ -1319,8 +2239,8 @@ namespace { ...@@ -1319,8 +2239,8 @@ namespace {
p.N = N; p.N = N;
p.teamSize = transfer.numSubExecs; p.teamSize = transfer.numSubExecs;
p.teamIdx = i; p.teamIdx = i;
for (int iSrc = 0; iSrc < p.numSrcs; ++iSrc) p.src[iSrc] = resources.srcMem[iSrc] + initOffset; for (int iSrc = 0; iSrc < p.numSrcs; ++iSrc) p.src[iSrc] = rss.srcMem[iSrc] + initOffset;
for (int iDst = 0; iDst < p.numDsts; ++iDst) p.dst[iDst] = resources.dstMem[iDst] + initOffset; for (int iDst = 0; iDst < p.numDsts; ++iDst) p.dst[iDst] = rss.dstMem[iDst] + initOffset;
} else { } else {
// Otherwise, each subexecutor works on separate subarrays // Otherwise, each subexecutor works on separate subarrays
int const subExecLeft = std::max(0, maxSubExecToUse - i); int const subExecLeft = std::max(0, maxSubExecToUse - i);
...@@ -1330,8 +2250,8 @@ namespace { ...@@ -1330,8 +2250,8 @@ namespace {
p.N = subExecLeft ? std::min(leftover, ((roundedN / subExecLeft) * targetMultiple)) : 0; p.N = subExecLeft ? std::min(leftover, ((roundedN / subExecLeft) * targetMultiple)) : 0;
p.teamSize = 1; p.teamSize = 1;
p.teamIdx = 0; p.teamIdx = 0;
for (int iSrc = 0; iSrc < p.numSrcs; ++iSrc) p.src[iSrc] = resources.srcMem[iSrc] + initOffset + assigned; for (int iSrc = 0; iSrc < p.numSrcs; ++iSrc) p.src[iSrc] = rss.srcMem[iSrc] + initOffset + assigned;
for (int iDst = 0; iDst < p.numDsts; ++iDst) p.dst[iDst] = resources.dstMem[iDst] + initOffset + assigned; for (int iDst = 0; iDst < p.numDsts; ++iDst) p.dst[iDst] = rss.dstMem[iDst] + initOffset + assigned;
assigned += p.N; assigned += p.N;
} }
...@@ -1352,7 +2272,7 @@ namespace { ...@@ -1352,7 +2272,7 @@ namespace {
} }
// Clear counters // Clear counters
resources.totalDurationMsec = 0.0; rss.totalDurationMsec = 0.0;
return ERR_NONE; return ERR_NONE;
} }
...@@ -1367,12 +2287,12 @@ namespace { ...@@ -1367,12 +2287,12 @@ namespace {
exeInfo.totalDurationMsec = 0.0; exeInfo.totalDurationMsec = 0.0;
// Loop over each transfer this executor is involved in // Loop over each transfer this executor is involved in
for (auto& resources : exeInfo.resources) { for (auto& rss : exeInfo.resources) {
Transfer const& t = transfers[resources.transferIdx]; Transfer const& t = transfers[rss.transferIdx];
resources.numBytes = t.numBytes; rss.numBytes = t.numBytes;
// Allocate source memory // Allocate source memory
resources.srcMem.resize(t.srcs.size()); rss.srcMem.resize(t.srcs.size());
for (int iSrc = 0; iSrc < t.srcs.size(); ++iSrc) { for (int iSrc = 0; iSrc < t.srcs.size(); ++iSrc) {
MemDevice const& srcMemDevice = t.srcs[iSrc]; MemDevice const& srcMemDevice = t.srcs[iSrc];
...@@ -1381,11 +2301,11 @@ namespace { ...@@ -1381,11 +2301,11 @@ namespace {
srcMemDevice.memIndex != exeDevice.exeIndex) { srcMemDevice.memIndex != exeDevice.exeIndex) {
ERR_CHECK(EnablePeerAccess(exeDevice.exeIndex, srcMemDevice.memIndex)); ERR_CHECK(EnablePeerAccess(exeDevice.exeIndex, srcMemDevice.memIndex));
} }
ERR_CHECK(AllocateMemory(srcMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&resources.srcMem[iSrc])); ERR_CHECK(AllocateMemory(srcMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&rss.srcMem[iSrc]));
} }
// Allocate destination memory // Allocate destination memory
resources.dstMem.resize(t.dsts.size()); rss.dstMem.resize(t.dsts.size());
for (int iDst = 0; iDst < t.dsts.size(); ++iDst) { for (int iDst = 0; iDst < t.dsts.size(); ++iDst) {
MemDevice const& dstMemDevice = t.dsts[iDst]; MemDevice const& dstMemDevice = t.dsts[iDst];
...@@ -1394,7 +2314,7 @@ namespace { ...@@ -1394,7 +2314,7 @@ namespace {
dstMemDevice.memIndex != exeDevice.exeIndex) { dstMemDevice.memIndex != exeDevice.exeIndex) {
ERR_CHECK(EnablePeerAccess(exeDevice.exeIndex, dstMemDevice.memIndex)); ERR_CHECK(EnablePeerAccess(exeDevice.exeIndex, dstMemDevice.memIndex));
} }
ERR_CHECK(AllocateMemory(dstMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&resources.dstMem[iDst])); ERR_CHECK(AllocateMemory(dstMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&rss.dstMem[iDst]));
} }
if (exeDevice.exeType == EXE_GPU_DMA && (t.exeSubIndex != -1 || cfg.dma.useHsaCopy)) { if (exeDevice.exeType == EXE_GPU_DMA && (t.exeSubIndex != -1 || cfg.dma.useHsaCopy)) {
...@@ -1402,22 +2322,22 @@ namespace { ...@@ -1402,22 +2322,22 @@ namespace {
// Collect HSA agent information // Collect HSA agent information
hsa_amd_pointer_info_t info; hsa_amd_pointer_info_t info;
info.size = sizeof(info); info.size = sizeof(info);
ERR_CHECK(hsa_amd_pointer_info(resources.dstMem[0], &info, NULL, NULL, NULL)); ERR_CHECK(hsa_amd_pointer_info(rss.dstMem[0], &info, NULL, NULL, NULL));
resources.dstAgent = info.agentOwner; rss.dstAgent = info.agentOwner;
ERR_CHECK(hsa_amd_pointer_info(resources.srcMem[0], &info, NULL, NULL, NULL)); ERR_CHECK(hsa_amd_pointer_info(rss.srcMem[0], &info, NULL, NULL, NULL));
resources.srcAgent = info.agentOwner; rss.srcAgent = info.agentOwner;
// Create HSA completion signal // Create HSA completion signal
ERR_CHECK(hsa_signal_create(1, 0, NULL, &resources.signal)); ERR_CHECK(hsa_signal_create(1, 0, NULL, &rss.signal));
if (t.exeSubIndex != -1) if (t.exeSubIndex != -1)
resources.sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << t.exeSubIndex); rss.sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << t.exeSubIndex);
#endif #endif
} }
// Prepare subexecutor parameters // Prepare subexecutor parameters
ERR_CHECK(PrepareSubExecParams(cfg, t, resources)); ERR_CHECK(PrepareSubExecParams(cfg, t, rss));
} }
// Prepare additional requirements for GPU-based executors // Prepare additional requirements for GPU-based executors
...@@ -1476,11 +2396,11 @@ namespace { ...@@ -1476,11 +2396,11 @@ namespace {
exeDevice.exeIndex)); exeDevice.exeIndex));
#endif #endif
int transferOffset = 0; int transferOffset = 0;
for (auto& resources : exeInfo.resources) { for (auto& rss : exeInfo.resources) {
Transfer const& t = transfers[resources.transferIdx]; Transfer const& t = transfers[rss.transferIdx];
resources.subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset; rss.subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset;
for (auto p : resources.subExecParamCpu) { for (auto p : rss.subExecParamCpu) {
resources.subExecIdx.push_back(exeInfo.subExecParamCpu.size()); rss.subExecIdx.push_back(exeInfo.subExecParamCpu.size());
exeInfo.subExecParamCpu.push_back(p); exeInfo.subExecParamCpu.push_back(p);
transferOffset++; transferOffset++;
} }
...@@ -1495,6 +2415,17 @@ namespace { ...@@ -1495,6 +2415,17 @@ namespace {
ERR_CHECK(hipDeviceSynchronize()); ERR_CHECK(hipDeviceSynchronize());
} }
// Prepare for NIC-based executors
if (IsNicExeType(exeDevice.exeType)) {
#ifdef NIC_EXEC_ENABLED
for (auto& rss : exeInfo.resources) {
Transfer const& t = transfers[rss.transferIdx];
ERR_CHECK(PrepareNicTransferResources(cfg, exeDevice, t, rss));
}
#else
return {ERR_FATAL, "RDMA executor is not supported"};
#endif
}
return ERR_NONE; return ERR_NONE;
} }
...@@ -1508,23 +2439,30 @@ namespace { ...@@ -1508,23 +2439,30 @@ namespace {
ExeInfo& exeInfo) ExeInfo& exeInfo)
{ {
// Loop over each transfer this executor is involved in // Loop over each transfer this executor is involved in
for (auto& resources : exeInfo.resources) { for (auto& rss : exeInfo.resources) {
Transfer const& t = transfers[resources.transferIdx]; Transfer const& t = transfers[rss.transferIdx];
// Deallocate source memory // Deallocate source memory
for (int iSrc = 0; iSrc < t.srcs.size(); ++iSrc) { for (int iSrc = 0; iSrc < t.srcs.size(); ++iSrc) {
ERR_CHECK(DeallocateMemory(t.srcs[iSrc].memType, resources.srcMem[iSrc], t.numBytes + cfg.data.byteOffset)); ERR_CHECK(DeallocateMemory(t.srcs[iSrc].memType, rss.srcMem[iSrc], t.numBytes + cfg.data.byteOffset));
} }
// Deallocate destination memory // Deallocate destination memory
for (int iDst = 0; iDst < t.dsts.size(); ++iDst) { for (int iDst = 0; iDst < t.dsts.size(); ++iDst) {
ERR_CHECK(DeallocateMemory(t.dsts[iDst].memType, resources.dstMem[iDst], t.numBytes + cfg.data.byteOffset)); ERR_CHECK(DeallocateMemory(t.dsts[iDst].memType, rss.dstMem[iDst], t.numBytes + cfg.data.byteOffset));
} }
// Destroy HSA signal for DMA executor // Destroy HSA signal for DMA executor
#if !defined(__NVCC__) #if !defined(__NVCC__)
if (exeDevice.exeType == EXE_GPU_DMA && (t.exeSubIndex != -1 || cfg.dma.useHsaCopy)) { if (exeDevice.exeType == EXE_GPU_DMA && (t.exeSubIndex != -1 || cfg.dma.useHsaCopy)) {
ERR_CHECK(hsa_signal_destroy(resources.signal)); ERR_CHECK(hsa_signal_destroy(rss.signal));
}
#endif
// Destroy NIC related resources
#ifdef NIC_EXEC_ENABLED
if (IsNicExeType(exeDevice.exeType)) {
ERR_CHECK(TeardownNicTransferResources(rss));
} }
#endif #endif
} }
...@@ -1557,68 +2495,69 @@ namespace { ...@@ -1557,68 +2495,69 @@ namespace {
//======================================================================================== //========================================================================================
// Kernel for CPU execution (run by a single subexecutor) // Kernel for CPU execution (run by a single subexecutor)
static void CpuReduceKernel(SubExecParam const& p) static void CpuReduceKernel(SubExecParam const& p, int numSubIterations)
{ {
if (p.N == 0) return; if (p.N == 0) return;
int const& numSrcs = p.numSrcs; int subIteration = 0;
int const& numDsts = p.numDsts; do {
int const& numSrcs = p.numSrcs;
if (numSrcs == 0) { int const& numDsts = p.numDsts;
for (int i = 0; i < numDsts; ++i) {
memset(p.dst[i], MEMSET_CHAR, p.N * sizeof(float));
//for (int j = 0; j < p.N; j++) p.dst[i][j] = MEMSET_VAL;
}
} else if (numSrcs == 1) {
float const* __restrict__ src = p.src[0];
if (numDsts == 0) {
float sum = 0.0;
for (int j = 0; j < p.N; j++)
sum += p.src[0][j];
// Add a dummy check to ensure the read is not optimized out if (numSrcs == 0) {
if (sum != sum) { for (int i = 0; i < numDsts; ++i) {
printf("[ERROR] Nan detected\n"); memset(p.dst[i], MEMSET_CHAR, p.N * sizeof(float));
//for (int j = 0; j < p.N; j++) p.dst[i][j] = MEMSET_VAL;
}
} else if (numSrcs == 1) {
float const* __restrict__ src = p.src[0];
if (numDsts == 0) {
float sum = 0.0;
for (int j = 0; j < p.N; j++)
sum += p.src[0][j];
// Add a dummy check to ensure the read is not optimized out
if (sum != sum) {
printf("[ERROR] Nan detected\n");
}
} else {
for (int i = 0; i < numDsts; ++i)
memcpy(p.dst[i], src, p.N * sizeof(float));
} }
} else { } else {
for (int i = 0; i < numDsts; ++i) float sum = 0.0f;
memcpy(p.dst[i], src, p.N * sizeof(float)); for (int j = 0; j < p.N; j++) {
} sum = p.src[0][j];
} else { for (int i = 1; i < numSrcs; i++) sum += p.src[i][j];
float sum = 0.0f; for (int i = 0; i < numDsts; i++) p.dst[i][j] = sum;
for (int j = 0; j < p.N; j++) { }
sum = p.src[0][j];
for (int i = 1; i < numSrcs; i++) sum += p.src[i][j];
for (int i = 0; i < numDsts; i++) p.dst[i][j] = sum;
} }
} } while (++subIteration != numSubIterations);
} }
// Execution of a single CPU Transfers // Execution of a single CPU Transfers
static void ExecuteCpuTransfer(int const iteration, static void ExecuteCpuTransfer(int const iteration,
ConfigOptions const& cfg, ConfigOptions const& cfg,
int const exeIndex, int const exeIndex,
TransferResources& resources) TransferResources& rss)
{ {
auto cpuStart = std::chrono::high_resolution_clock::now(); auto cpuStart = std::chrono::high_resolution_clock::now();
vector<std::thread> childThreads; vector<std::thread> childThreads;
int subIteration = 0;
do {
for (auto const& subExecParam : resources.subExecParamCpu)
childThreads.emplace_back(std::thread(CpuReduceKernel, std::cref(subExecParam)));
for (auto& subExecThread : childThreads) for (auto const& subExecParam : rss.subExecParamCpu)
subExecThread.join(); childThreads.emplace_back(std::thread(CpuReduceKernel, std::cref(subExecParam), cfg.general.numSubIterations));
childThreads.clear();
} while (++subIteration != cfg.general.numSubIterations); for (auto& subExecThread : childThreads)
subExecThread.join();
childThreads.clear();
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0; double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0) { if (iteration >= 0) {
resources.totalDurationMsec += deltaMsec; rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) if (cfg.general.recordPerIteration)
resources.perIterMsec.push_back(deltaMsec); rss.perIterMsec.push_back(deltaMsec);
} }
} }
...@@ -1632,12 +2571,12 @@ namespace { ...@@ -1632,12 +2571,12 @@ namespace {
auto cpuStart = std::chrono::high_resolution_clock::now(); auto cpuStart = std::chrono::high_resolution_clock::now();
vector<std::thread> asyncTransfers; vector<std::thread> asyncTransfers;
for (auto& resource : exeInfo.resources) { for (auto& rss : exeInfo.resources) {
asyncTransfers.emplace_back(std::thread(ExecuteCpuTransfer, asyncTransfers.emplace_back(std::thread(ExecuteCpuTransfer,
iteration, iteration,
std::cref(cfg), std::cref(cfg),
exeIndex, exeIndex,
std::ref(resource))); std::ref(rss)));
} }
for (auto& asyncTransfer : asyncTransfers) for (auto& asyncTransfer : asyncTransfers)
asyncTransfer.join(); asyncTransfer.join();
...@@ -1649,6 +2588,90 @@ namespace { ...@@ -1649,6 +2588,90 @@ namespace {
return ERR_NONE; return ERR_NONE;
} }
#ifdef NIC_EXEC_ENABLED
// Execution of a single NIC Transfer
static ErrResult ExecuteNicTransfer(int const iteration,
ConfigOptions const& cfg,
int const exeIndex,
TransferResources& rss)
{
auto cpuStart = std::chrono::high_resolution_clock::now();
// Switch to the closest NUMA node to this NIC
if (cfg.nic.useNuma) {
int numaNode = GetIbvDeviceList()[exeIndex].numaNode;
if (numaNode != -1)
numa_run_on_node(numaNode);
}
int subIteration = 0;
do {
// Loop over each of the queue pairs and post the send
ibv_send_wr* badWorkReq;
for (int qpIndex = 0; qpIndex < rss.qpCount; qpIndex++) {
int error = ibv_post_send(rss.srcQueuePairs[qpIndex], &rss.sendWorkRequests[qpIndex], &badWorkReq);
if (error)
return {ERR_FATAL, "Transfer %d: Error when calling ibv_post_send for QP %d Error code %d\n",
rss.transferIdx, qpIndex, error};
}
// Poll the completion queue until all queue pairs are complete
// The order of completion doesn't matter because this completion queue is dedicated to this Transfer
int numComplete = 0;
ibv_wc wc;
while (numComplete < rss.qpCount) {
int nc = ibv_poll_cq(rss.srcCompQueue, 1, &wc);
if (nc > 0) {
numComplete++;
if (wc.status != IBV_WC_SUCCESS) {
return {ERR_FATAL, "Transfer %d: Received unsuccessful work completion", rss.transferIdx};
}
} else if (nc < 0) {
return {ERR_FATAL, "Transfer %d: Received negative work completion", rss.transferIdx};
}
}
} while (++subIteration != cfg.general.numSubIterations);
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0) {
rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration)
rss.perIterMsec.push_back(deltaMsec);
}
return ERR_NONE;
}
// Execution of a single NIC executor
static ErrResult RunNicExecutor(int const iteration,
ConfigOptions const& cfg,
int const exeIndex,
ExeInfo& exeInfo)
{
vector<std::future<ErrResult>> asyncTransfers;
auto cpuStart = std::chrono::high_resolution_clock::now();
for (int i = 0; i < exeInfo.resources.size(); i++) {
asyncTransfers.emplace_back(std::async(std::launch::async,
ExecuteNicTransfer,
iteration,
std::cref(cfg),
exeIndex,
std::ref(exeInfo.resources[i])));
}
for (auto& asyncTransfer : asyncTransfers)
ERR_CHECK(asyncTransfer.get());
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0)
exeInfo.totalDurationMsec += deltaMsec;
return ERR_NONE;
}
#endif
// GFX Executor-related functions // GFX Executor-related functions
//======================================================================================== //========================================================================================
...@@ -1789,7 +2812,7 @@ namespace { ...@@ -1789,7 +2812,7 @@ namespace {
if (numSrcs == 0) val = MemsetVal<float>(); if (numSrcs == 0) val = MemsetVal<float>();
size_t const loop3Stride = nTeams * nWaves * warpSize; size_t const loop3Stride = nTeams * nWaves * warpSize;
for( size_t idx = numFloat4 * 4 + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride) { for ( size_t idx = numFloat4 * 4 + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride) {
if (numSrcs) { if (numSrcs) {
val = p.src[0][idx]; val = p.src[0][idx];
for (int s = 1; s < numSrcs; s++) for (int s = 1; s < numSrcs; s++)
...@@ -1848,11 +2871,11 @@ namespace { ...@@ -1848,11 +2871,11 @@ namespace {
hipEvent_t const stopEvent, hipEvent_t const stopEvent,
int const xccDim, int const xccDim,
ConfigOptions const& cfg, ConfigOptions const& cfg,
TransferResources& resources) TransferResources& rss)
{ {
auto cpuStart = std::chrono::high_resolution_clock::now(); auto cpuStart = std::chrono::high_resolution_clock::now();
int numSubExecs = resources.subExecParamCpu.size(); int numSubExecs = rss.subExecParamCpu.size();
dim3 const gridSize(xccDim, numSubExecs, 1); dim3 const gridSize(xccDim, numSubExecs, 1);
dim3 const blockSize(cfg.gfx.blockSize, 1); dim3 const blockSize(cfg.gfx.blockSize, 1);
...@@ -1862,13 +2885,13 @@ namespace { ...@@ -1862,13 +2885,13 @@ namespace {
GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1] GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1]
<<<gridSize, blockSize, 0, stream>>> <<<gridSize, blockSize, 0, stream>>>
(resources.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations); (rss.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations);
if (stopEvent != NULL) if (stopEvent != NULL)
ERR_CHECK(hipEventRecord(stopEvent, stream)); ERR_CHECK(hipEventRecord(stopEvent, stream));
#else #else
hipExtLaunchKernelGGL(GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1], hipExtLaunchKernelGGL(GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1],
gridSize, blockSize, 0, stream, startEvent, stopEvent, gridSize, blockSize, 0, stream, startEvent, stopEvent,
0, resources.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations); 0, rss.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations);
#endif #endif
ERR_CHECK(hipStreamSynchronize(stream)); ERR_CHECK(hipStreamSynchronize(stream));
...@@ -1883,15 +2906,15 @@ namespace { ...@@ -1883,15 +2906,15 @@ namespace {
ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent)); ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
deltaMsec = gpuDeltaMsec; deltaMsec = gpuDeltaMsec;
} }
resources.totalDurationMsec += deltaMsec; rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) { if (cfg.general.recordPerIteration) {
resources.perIterMsec.push_back(deltaMsec); rss.perIterMsec.push_back(deltaMsec);
std::set<std::pair<int,int>> CUs; std::set<std::pair<int,int>> CUs;
for (int i = 0; i < numSubExecs; i++) { for (int i = 0; i < numSubExecs; i++) {
CUs.insert(std::make_pair(resources.subExecParamGpuPtr[i].xccId, CUs.insert(std::make_pair(rss.subExecParamGpuPtr[i].xccId,
GetId(resources.subExecParamGpuPtr[i].hwId))); GetId(rss.subExecParamGpuPtr[i].hwId)));
} }
resources.perIterCUs.push_back(CUs); rss.perIterCUs.push_back(CUs);
} }
} }
return ERR_NONE; return ERR_NONE;
...@@ -1965,12 +2988,12 @@ namespace { ...@@ -1965,12 +2988,12 @@ namespace {
// Determine timing for each of the individual transfers that were part of this launch // Determine timing for each of the individual transfers that were part of this launch
if (!cfg.gfx.useMultiStream) { if (!cfg.gfx.useMultiStream) {
for (int i = 0; i < exeInfo.resources.size(); i++) { for (int i = 0; i < exeInfo.resources.size(); i++) {
TransferResources& resources = exeInfo.resources[i]; TransferResources& rss = exeInfo.resources[i];
long long minStartCycle = std::numeric_limits<long long>::max(); long long minStartCycle = std::numeric_limits<long long>::max();
long long maxStopCycle = std::numeric_limits<long long>::min(); long long maxStopCycle = std::numeric_limits<long long>::min();
std::set<std::pair<int, int>> CUs; std::set<std::pair<int, int>> CUs;
for (auto subExecIdx : resources.subExecIdx) { for (auto subExecIdx : rss.subExecIdx) {
minStartCycle = std::min(minStartCycle, exeInfo.subExecParamGpu[subExecIdx].startCycle); minStartCycle = std::min(minStartCycle, exeInfo.subExecParamGpu[subExecIdx].startCycle);
maxStopCycle = std::max(maxStopCycle, exeInfo.subExecParamGpu[subExecIdx].stopCycle); maxStopCycle = std::max(maxStopCycle, exeInfo.subExecParamGpu[subExecIdx].stopCycle);
if (cfg.general.recordPerIteration) { if (cfg.general.recordPerIteration) {
...@@ -1980,10 +3003,10 @@ namespace { ...@@ -1980,10 +3003,10 @@ namespace {
} }
double deltaMsec = (maxStopCycle - minStartCycle) / (double)(exeInfo.wallClockRate); double deltaMsec = (maxStopCycle - minStartCycle) / (double)(exeInfo.wallClockRate);
resources.totalDurationMsec += deltaMsec; rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) { if (cfg.general.recordPerIteration) {
resources.perIterMsec.push_back(deltaMsec); rss.perIterMsec.push_back(deltaMsec);
resources.perIterCUs.push_back(CUs); rss.perIterCUs.push_back(CUs);
} }
} }
} }
...@@ -1991,7 +3014,6 @@ namespace { ...@@ -1991,7 +3014,6 @@ namespace {
return ERR_NONE; return ERR_NONE;
} }
// DMA Executor-related functions // DMA Executor-related functions
//======================================================================================== //========================================================================================
...@@ -2106,6 +3128,9 @@ namespace { ...@@ -2106,6 +3128,9 @@ namespace {
case EXE_CPU: return RunCpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); case EXE_CPU: return RunCpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
case EXE_GPU_GFX: return RunGpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); case EXE_GPU_GFX: return RunGpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
case EXE_GPU_DMA: return RunDmaExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo); case EXE_GPU_DMA: return RunDmaExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
#ifdef NIC_EXEC_ENABLED
case EXE_NIC: return RunNicExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
#endif
default: return {ERR_FATAL, "Unsupported executor (%d)", exeDevice.exeType}; default: return {ERR_FATAL, "Unsupported executor (%d)", exeDevice.exeType};
} }
} }
...@@ -2182,16 +3207,17 @@ namespace { ...@@ -2182,16 +3207,17 @@ namespace {
std::map<ExeDevice, ExeInfo> executorMap; std::map<ExeDevice, ExeInfo> executorMap;
for (int i = 0; i < transfers.size(); i++) { for (int i = 0; i < transfers.size(); i++) {
Transfer const& t = transfers[i]; Transfer const& t = transfers[i];
ExeDevice exeDevice;
ExeInfo& exeInfo = executorMap[t.exeDevice]; ERR_APPEND(GetActualExecutor(cfg, t.exeDevice, exeDevice), errResults);
exeInfo.totalBytes += t.numBytes;
exeInfo.totalSubExecs += t.numSubExecs;
exeInfo.useSubIndices |= (t.exeSubIndex != -1);
TransferResources resource = {}; TransferResources resource = {};
resource.transferIdx = i; resource.transferIdx = i;
exeInfo.resources.push_back(resource);
ExeInfo& exeInfo = executorMap[exeDevice];
exeInfo.totalBytes += t.numBytes;
exeInfo.totalSubExecs += t.numSubExecs;
exeInfo.useSubIndices |= (t.exeSubIndex != -1 || (t.exeDevice.exeType == EXE_GPU_GFX && !cfg.gfx.prefXccTable.empty()));
exeInfo.resources.push_back(resource);
minNumSrcs = std::min(minNumSrcs, (int)t.srcs.size()); minNumSrcs = std::min(minNumSrcs, (int)t.srcs.size());
maxNumSrcs = std::max(maxNumSrcs, (int)t.srcs.size()); maxNumSrcs = std::max(maxNumSrcs, (int)t.srcs.size());
maxNumBytes = std::max(maxNumBytes, t.numBytes); maxNumBytes = std::max(maxNumBytes, t.numBytes);
...@@ -2322,8 +3348,8 @@ namespace { ...@@ -2322,8 +3348,8 @@ namespace {
results.tfrResults.resize(transfers.size()); results.tfrResults.resize(transfers.size());
results.numTimedIterations = numTimedIterations; results.numTimedIterations = numTimedIterations;
results.totalBytesTransferred = 0; results.totalBytesTransferred = 0;
results.avgTotalDurationMsec = (totalCpuTimeSec * 1000.0) / numTimedIterations; results.avgTotalDurationMsec = (totalCpuTimeSec * 1000.0) / (numTimedIterations * cfg.general.numSubIterations);
results.overheadMsec = 0.0; results.overheadMsec = results.avgTotalDurationMsec;
for (auto& exeInfoPair : executorMap) { for (auto& exeInfoPair : executorMap) {
ExeDevice const& exeDevice = exeInfoPair.first; ExeDevice const& exeDevice = exeInfoPair.first;
ExeInfo& exeInfo = exeInfoPair.second; ExeInfo& exeInfo = exeInfoPair.second;
...@@ -2331,25 +3357,32 @@ namespace { ...@@ -2331,25 +3357,32 @@ namespace {
// Copy over executor results // Copy over executor results
ExeResult& exeResult = results.exeResults[exeDevice]; ExeResult& exeResult = results.exeResults[exeDevice];
exeResult.numBytes = exeInfo.totalBytes; exeResult.numBytes = exeInfo.totalBytes;
exeResult.avgDurationMsec = exeInfo.totalDurationMsec / numTimedIterations; exeResult.avgDurationMsec = exeInfo.totalDurationMsec / (numTimedIterations * cfg.general.numSubIterations);
exeResult.avgBandwidthGbPerSec = (exeResult.numBytes / 1.0e6) / exeResult.avgDurationMsec; exeResult.avgBandwidthGbPerSec = (exeResult.numBytes / 1.0e6) / exeResult.avgDurationMsec;
exeResult.sumBandwidthGbPerSec = 0.0; exeResult.sumBandwidthGbPerSec = 0.0;
exeResult.transferIdx.clear(); exeResult.transferIdx.clear();
results.totalBytesTransferred += exeInfo.totalBytes; results.totalBytesTransferred += exeInfo.totalBytes;
results.overheadMsec = std::max(results.overheadMsec, (results.avgTotalDurationMsec - results.overheadMsec = std::min(results.overheadMsec, (results.avgTotalDurationMsec -
exeResult.avgDurationMsec)); exeResult.avgDurationMsec));
// Copy over transfer results // Copy over transfer results
for (auto const& resources : exeInfo.resources) { for (auto const& rss : exeInfo.resources) {
int const transferIdx = resources.transferIdx; int const transferIdx = rss.transferIdx;
TransferResult& tfrResult = results.tfrResults[transferIdx];
exeResult.transferIdx.push_back(transferIdx); exeResult.transferIdx.push_back(transferIdx);
tfrResult.numBytes = resources.numBytes;
tfrResult.avgDurationMsec = resources.totalDurationMsec / numTimedIterations; TransferResult& tfrResult = results.tfrResults[transferIdx];
tfrResult.avgBandwidthGbPerSec = (resources.numBytes / 1.0e6) / tfrResult.avgDurationMsec; tfrResult.exeDevice = exeDevice;
#ifdef NIC_EXEC_ENABLED
tfrResult.exeDstDevice = {exeDevice.exeType, rss.dstNicIndex};
#else
tfrResult.exeDstDevice = exeDevice;
#endif
tfrResult.numBytes = rss.numBytes;
tfrResult.avgDurationMsec = rss.totalDurationMsec / numTimedIterations;
tfrResult.avgBandwidthGbPerSec = (rss.numBytes / 1.0e6) / tfrResult.avgDurationMsec;
if (cfg.general.recordPerIteration) { if (cfg.general.recordPerIteration) {
tfrResult.perIterMsec = resources.perIterMsec; tfrResult.perIterMsec = rss.perIterMsec;
tfrResult.perIterCUs = resources.perIterCUs; tfrResult.perIterCUs = rss.perIterCUs;
} }
exeResult.sumBandwidthGbPerSec += tfrResult.avgBandwidthGbPerSec; exeResult.sumBandwidthGbPerSec += tfrResult.avgBandwidthGbPerSec;
} }
...@@ -2390,7 +3423,7 @@ namespace { ...@@ -2390,7 +3423,7 @@ namespace {
{ {
// Replace any round brackets or '->' with spaces, // Replace any round brackets or '->' with spaces,
for (int i = 1; line[i]; i++) for (int i = 1; line[i]; i++)
if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == '>' ) line[i] = ' '; if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == ':' || line[i] == '>' ) line[i] = ' ';
transfers.clear(); transfers.clear();
...@@ -2424,17 +3457,18 @@ namespace { ...@@ -2424,17 +3457,18 @@ namespace {
transfer.numSubExecs = numSubExecs; transfer.numSubExecs = numSubExecs;
if (iss.fail()) { if (iss.fail()) {
return {ERR_FATAL, return {ERR_FATAL,
"Parsing error: Unable to read valid Transfer %d (SRC EXE DST) triplet", i+1}; "Parsing error: Unable to read valid Transfer %d (SRC EXE DST) triplet", i+1};
} }
transfer.numBytes = 0;
} else { } else {
iss >> srcStr >> exeStr >> dstStr >> transfer.numSubExecs >> numBytesToken; iss >> srcStr >> exeStr >> dstStr >> transfer.numSubExecs >> numBytesToken;
if (iss.fail()) { if (iss.fail()) {
return {ERR_FATAL, return {ERR_FATAL,
"Parsing error: Unable to read valid Transfer %d (SRC EXE DST $CU #Bytes) tuple", i+1}; "Parsing error: Unable to read valid Transfer %d (SRC EXE DST $CU #Bytes) tuple", i+1};
} }
if (sscanf(numBytesToken.c_str(), "%lu", &transfer.numBytes) != 1) { if (sscanf(numBytesToken.c_str(), "%lu", &transfer.numBytes) != 1) {
return {ERR_FATAL, return {ERR_FATAL,
"Parsing error: Unable to read valid Transfer %d (SRC EXE DST #CU #Bytes) tuple", i+1}; "Parsing error: Unable to read valid Transfer %d (SRC EXE DST #CU #Bytes) tuple", i+1};
} }
char units = numBytesToken.back(); char units = numBytesToken.back();
...@@ -2448,7 +3482,6 @@ namespace { ...@@ -2448,7 +3482,6 @@ namespace {
ERR_CHECK(ParseMemType(srcStr, transfer.srcs)); ERR_CHECK(ParseMemType(srcStr, transfer.srcs));
ERR_CHECK(ParseMemType(dstStr, transfer.dsts)); ERR_CHECK(ParseMemType(dstStr, transfer.dsts));
ERR_CHECK(ParseExeType(exeStr, transfer.exeDevice, transfer.exeSubIndex)); ERR_CHECK(ParseExeType(exeStr, transfer.exeDevice, transfer.exeSubIndex));
transfers.push_back(transfer); transfers.push_back(transfer);
} }
return ERR_NONE; return ERR_NONE;
...@@ -2466,6 +3499,12 @@ namespace { ...@@ -2466,6 +3499,12 @@ namespace {
if (status != hipSuccess) numDetectedGpus = 0; if (status != hipSuccess) numDetectedGpus = 0;
return numDetectedGpus; return numDetectedGpus;
} }
#ifdef NIC_EXEC_ENABLED
case EXE_NIC: case EXE_NIC_NEAREST:
{
return GetIbvDeviceList().size();
}
#endif
default: default:
return 0; return 0;
} }
...@@ -2572,6 +3611,102 @@ namespace { ...@@ -2572,6 +3611,102 @@ namespace {
#endif #endif
} }
int GetClosestCpuNumaToNic(int nicIndex)
{
#ifdef NIC_EXEC_ENABLED
int numNics = GetNumExecutors(EXE_NIC);
if (nicIndex < 0 || nicIndex >= numNics) return -1;
return GetIbvDeviceList()[nicIndex].numaNode;
#else
return -1;
#endif
}
int GetClosestNicToGpu(int gpuIndex)
{
#ifdef NIC_EXEC_ENABLED
static bool isInitialized = false;
static std::vector<int> closestNicId;
int numGpus = GetNumExecutors(EXE_GPU_GFX);
if (gpuIndex < 0 || gpuIndex >= numGpus) return -1;
// Build closest NICs per GPU on first use
if (!isInitialized) {
closestNicId.resize(numGpus, -1);
// Build up list of NIC bus addresses
std::vector<std::string> ibvAddressList;
auto const& ibvDeviceList = GetIbvDeviceList();
for (auto const& ibvDevice : ibvDeviceList)
ibvAddressList.push_back(ibvDevice.hasActivePort ? ibvDevice.busId : "");
// Track how many times a device has been assigned as "closest"
// This allows distributed work across devices using multiple ports (sharing the same busID)
// NOTE: This isn't necessarily optimal, but likely to work in most cases involving multi-port
// Counter example:
//
// G0 prefers (N0,N1), picks N0
// G1 prefers (N1,N2), picks N1
// G2 prefers N0, picks N0
//
// instead of G0->N1, G1->N2, G2->N0
std::vector<int> assignedCount(ibvDeviceList.size(), 0);
// Loop over each GPU to find the closest NIC(s) based on PCIe address
for (int i = 0; i < numGpus; i++) {
// Collect PCIe address for the GPU
char hipPciBusId[64];
hipError_t err = hipDeviceGetPCIBusId(hipPciBusId, sizeof(hipPciBusId), i);
if (err != hipSuccess) {
#ifdef VERBS_DEBUG
printf("Failed to get PCI Bus ID for HIP device %d: %s\n", i, hipGetErrorString(err));
#endif
closestNicId[i] = -1;
continue;
}
// Find closest NICs
std::set<int> closestNicIdxs = GetNearestDevicesInTree(hipPciBusId, ibvAddressList);
// Pick the least-used NIC to assign as closest
int closestIdx = -1;
for (auto idx : closestNicIdxs) {
if (closestIdx == -1 || assignedCount[idx] < assignedCount[closestIdx])
closestIdx = idx;
}
// The following will only use distance between bus IDs
// to determine the closest NIC to GPU if the PCIe tree approach fails
if (closestIdx < 0) {
#ifdef VERBS_DEBUG
printf("[WARN] Falling back to PCIe bus ID distance to determine proximity\n");
#endif
int minDistance = std::numeric_limits<int>::max();
for (int j = 0; j < ibvDeviceList.size(); j++) {
if (ibvDeviceList[j].busId != "") {
int distance = GetBusIdDistance(hipPciBusId, ibvDeviceList[j].busId);
if (distance < minDistance && distance >= 0) {
minDistance = distance;
closestIdx = j;
}
}
}
}
closestNicId[i] = closestIdx;
if (closestIdx != -1) assignedCount[closestIdx]++;
}
isInitialized = true;
}
return closestNicId[gpuIndex];
#else
return -1;
#endif
}
// Undefine CUDA compatibility macros // Undefine CUDA compatibility macros
#if defined(__NVCC__) #if defined(__NVCC__)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment