Unverified Commit 5984f49e authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

TransferBench V1.59 (#162)



Adding NIC execution capabilities, various bug fixes introduced by header-only-library refactor
---------
Co-authored-by: default avatarMustafa Abduljabbar <mustafa.abduljabbar@amd.com>
parent fcac6d92
......@@ -3,6 +3,30 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.59.00
### Added
- Adding in support for NIC executor, which allows for RDMA copies on NICs that support IBVerbs
By default, NIC executor will be enabled if IBVerbs is found in the dynamic linker cache
- NIC executor can be indexed in two methods
- "I" Ix.y will use NIC x as the source and NIC y as the destination.
E.g. (G0 I0.5 G4)
- "N" Nx.y will use NIC closest to GPU x as source, and NIC closest to GPU y as destination
E.g. (G0 N0.4 N4)
- The closest NIC can be overridden by the environment variable CLOSEST_NIC, which should be a comma-separated
list of NIC indices to use for the corresponding GPU
- This feature can be explicitly disabled at compile time by specifying DISABLE_NIC_EXEC=1
### Modified
- Changing default data size to 256M from 64M
- Adding NUM_QUEUE_PAIRS which enables NIC traffic in A2A. Each GPU will talk to the next GPU via the closest NIC
- Sweep preset now saves last sweep run configuration to /tmp/lastSweep.cfg and can be changed via SWEEP_FILE
### Fixed
- Fixed bug with reporting when using subiterations
- Fixed bug with per-Transfer data size specification
- Fixed bug when using XCC preferred table
## v1.58.00
### Fixed
- Fixed broken specific DMA-engine copies
......
......@@ -7,7 +7,7 @@ else()
endif()
cmake_minimum_required(VERSION 3.5)
project(TransferBench VERSION 1.58.00 LANGUAGES CXX)
project(TransferBench VERSION 1.59.00 LANGUAGES CXX)
# Default GPU architectures to build
#==================================================================================================
......@@ -56,6 +56,18 @@ set( CMAKE_CXX_FLAGS "${flags_str} ${CMAKE_CXX_FLAGS}")
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib")
include_directories(${ROCM_PATH}/include)
# Optional NIC executor support (RDMA copies through IBVerbs).
# NIC support is compiled in when libibverbs can be located, unless the DISABLE_NIC_EXEC
# environment variable holds a true value at configure time (e.g. DISABLE_NIC_EXEC=1).
# CMake false constants (0, OFF, NO, FALSE, empty, ...) leave NIC support enabled, so
# DISABLE_NIC_EXEC=0 no longer disables the feature by merely being defined.
find_library(IBVERBS_LIBRARY ibverbs)
set(disable_nic_exec "$ENV{DISABLE_NIC_EXEC}")
if(NOT IBVERBS_LIBRARY)
  message(WARNING "IBVerbs library not found. Building without NIC executor support")
elseif(disable_nic_exec)
  message(STATUS "Disabling NIC Executor support")
else()
  message(STATUS "Found ibverbs: ${IBVERBS_LIBRARY}. Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable")
  # Sources gate their NIC-specific code on this preprocessor symbol
  add_definitions(-DNIC_EXEC_ENABLED)
  # Link the exact library that find_library located rather than relying on the default search path
  link_libraries("${IBVERBS_LIBRARY}")
endif()
link_libraries(numa hsa-runtime64 pthread)
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY .)
add_executable(TransferBench src/client/Client.cpp)
......
......@@ -11,9 +11,9 @@ NVCC=$(CUDA_PATH)/bin/nvcc
# Compile TransferBenchCuda if nvcc detected
ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
EXE=TransferBenchCuda
EXE=TransferBenchCuda
else
EXE=TransferBench
EXE=TransferBench
endif
CXXFLAGS = -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
......@@ -21,13 +21,30 @@ NVFLAGS = -x cu -lnuma -arch=native
COMMON_FLAGS = -O3 -I./src/header -I./src/client -I./src/client/Presets
LDFLAGS += -lpthread
# Compile RDMA executor if IBVerbs is found in the Dynamic Linker cache
# NIC_ENABLED records the outcome (0 = NIC support off, 1 = on) and is read by the NicStatus rule
NIC_ENABLED = 0
# DISABLE_NIC_EXEC=1 skips the detection entirely, leaving NIC support off
ifneq ($(DISABLE_NIC_EXEC),1)
# grep -c prints how many ldconfig cache entries mention ibverbs; any count other than "0" means found
ifneq ("$(shell ldconfig -p | grep -c ibverbs)", "0")
# LDFLAGS is appended to both the HIPCC and NVCC command lines; NVFLAGS only to the NVCC one
LDFLAGS += -libverbs -DNIC_EXEC_ENABLED
NVFLAGS += -libverbs -DNIC_EXEC_ENABLED
NIC_ENABLED = 1
endif
endif
all: $(EXE)
TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp") NicStatus
$(HIPCC) $(CXXFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)
TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp") NicStatus
$(NVCC) $(NVFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)
clean:
rm -f *.o ./TransferBench ./TransferBenchCuda
NicStatus:
ifeq ($(NIC_ENABLED), 1)
$(info Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable)
else
$(info Building without NIC executor support)
endif
......@@ -13,6 +13,7 @@
# 1) CPU CPU thread
# 2) GPU GPU threadblock/Compute Unit (CU)
# 3) DMA N/A. (May only be used for copies (single SRC/DST))
# 4) NIC Queue Pair
# Each single line in the configuration file defines a set of Transfers (a Test) to run in parallel
......@@ -34,9 +35,11 @@
# #SEs : Number of SubExecutors to use (CPU threads/ GPU threadblocks)
# srcMemL : Source memory locations (Where the data is to be read from)
# Executor : Executor is specified by a character indicating type, followed by device index (0-indexed)
# - C: CPU-executed (Indexed from 0 to # NUMA nodes - 1)
# - G: GPU-executed (Indexed from 0 to # GPUs - 1)
# - D: DMA-executor (Indexed from 0 to # GPUs - 1)
# - C: CPU-executed (Indexed from 0 to # NUMA nodes - 1)
# - G: GPU-executed (Indexed from 0 to # GPUs - 1)
# - D: DMA-executor (Indexed from 0 to # GPUs - 1)
# - I#.#: NIC executor (Indexed from 0 to # NICs - 1)
# - N#.#: Nearest NIC executor (Indexed from 0 to # GPUs - 1)
# dstMemL : Destination memory locations (Where the data is to be written to)
# bytesL : Number of bytes to copy (0 means use command-line specified size)
# Must be a multiple of 4 and may be suffixed with ('K','M', or 'G')
......@@ -56,7 +59,10 @@
# 1 4 (C1->G2->G0) Uses 4 CUs on GPU2 to copy from CPU1 to GPU0
# 2 4 G0->G0->G1 G1->G1->G0 Copies from GPU0 to GPU1, and GPU1 to GPU0, each with 4 SEs
# -2 (G0 G0 G1 4 1M) (G1 G1 G0 2 2M) Copies 1Mb from GPU0 to GPU1 with 4 SEs, and 2Mb from GPU1 to GPU0 with 2 SEs
# 1 2 (F0->I0.2->F1) Uses 2 QPs to transfer data from GPU0 via NIC0 to GPU1 via NIC2
# 1 1 (F0->N0.1->F1) Uses 1 QP to transfer data from GPU0 via GPU0's closest NIC to GPU1 via GPU1's closest NIC
# -2 (G0->N0.1->G1 2 128M) (G1->N1.0->G0 1 256M) Uses Nearest NIC executor to copy 128Mb from GPU0 to GPU1 with 2 QPs,
# and 256Mb from GPU1 to GPU0 with 1 QP
# Round brackets and arrows '->' may be included for human clarity, but will be ignored and are unnecessary
# Lines starting with # will be ignored. Lines starting with ## will be echoed to output
......
......@@ -121,13 +121,23 @@ int main(int argc, char **argv) {
}
}
// Track which transfers have already numBytes specified
std::vector<bool> bytesSpecified(transfers.size());
int hasUnspecified = false;
for (int i = 0; i < transfers.size(); i++) {
bytesSpecified[i] = (transfers[i].numBytes != 0);
if (transfers[i].numBytes == 0) hasUnspecified = true;
}
// Run the specified numbers of bytes otherwise generate a range of values
for (size_t bytes = (1<<10); bytes <= (1<<29); bytes *= 2) {
size_t deltaBytes = std::max(1UL, bytes / ev.samplingFactor);
size_t currBytes = (numBytesPerTransfer == 0) ? bytes : numBytesPerTransfer;
do {
for (auto& t : transfers)
t.numBytes = currBytes;
for (int i = 0; i < transfers.size(); i++) {
if (!bytesSpecified[i])
transfers[i].numBytes = currBytes;
}
if (maxVarCount == 0) {
if (TransferBench::RunTransfers(cfgOptions, transfers, results)) {
......@@ -162,17 +172,21 @@ int main(int argc, char **argv) {
PrintResults(ev, ++testNum, bestTransfers, bestResults);
PrintErrors(bestResults.errResults);
}
if (numBytesPerTransfer != 0) break;
if (numBytesPerTransfer != 0 || !hasUnspecified) break;
currBytes += deltaBytes;
} while (currBytes < bytes * 2);
if (numBytesPerTransfer != 0) break;
if (numBytesPerTransfer != 0 || !hasUnspecified) break;
}
}
}
void DisplayUsage(char const* cmdName)
{
printf("TransferBench v%s.%s\n", TransferBench::VERSION, CLIENT_VERSION);
std::string nicSupport = "";
#if NIC_EXEC_ENABLED
nicSupport = " (with NIC support)";
#endif
printf("TransferBench v%s.%s%s\n", TransferBench::VERSION, CLIENT_VERSION, nicSupport.c_str());
printf("========================================\n");
if (numa_available() == -1) {
......@@ -218,7 +232,7 @@ void PrintResults(EnvVars const& ev, int const testNum,
ExeType const exeType = exeDevice.exeType;
int32_t const exeIndex = exeDevice.exeIndex;
printf(" Executor: %3s %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %-7.3f GB/s (sum)\n",
printf(" Executor: %3s %02d %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c %-7.3f GB/s (sum)\n",
ExeTypeName[exeType], exeIndex, sep, exeResult.avgBandwidthGbPerSec, sep,
exeResult.avgDurationMsec, sep, exeResult.numBytes, sep, exeResult.sumBandwidthGbPerSec);
......@@ -230,14 +244,15 @@ void PrintResults(EnvVars const& ev, int const testNum,
char exeSubIndexStr[32] = "";
if (t.exeSubIndex != -1)
sprintf(exeSubIndexStr, ".%d", t.exeSubIndex);
printf(" Transfer %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %s -> %s%02d%s:%03d -> %s\n",
printf(" Transfer %02d %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c %s -> %c%03d%s:%03d -> %s\n",
idx, sep,
r.avgBandwidthGbPerSec, sep,
r.avgDurationMsec, sep,
r.numBytes, sep,
MemDevicesToStr(t.srcs).c_str(), ExeTypeName[exeType], exeIndex,
exeSubIndexStr, t.numSubExecs, MemDevicesToStr(t.dsts).c_str());
MemDevicesToStr(t.srcs).c_str(),
TransferBench::ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
exeSubIndexStr, t.numSubExecs,
MemDevicesToStr(t.dsts).c_str());
// Show per-iteration timing information
if (ev.showIterations) {
......@@ -269,7 +284,7 @@ void PrintResults(EnvVars const& ev, int const testNum,
for (auto& time : times) {
double iterDurationMsec = time.first;
double iterBandwidthGbs = (t.numBytes / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d %c %7.3f GB/s %c %8.3f ms %c", time.second, sep, iterBandwidthGbs, sep, iterDurationMsec, sep);
printf(" Iter %03d %c %8.3f GB/s %c %8.3f ms %c", time.second, sep, iterBandwidthGbs, sep, iterDurationMsec, sep);
std::set<int> usedXccs;
if (time.second - 1 < r.perIterCUs.size()) {
......@@ -285,11 +300,11 @@ void PrintResults(EnvVars const& ev, int const testNum,
printf(" %02d", x);
printf("\n");
}
printf(" StandardDev %c %7.3f GB/s %c %8.3f ms %c\n", sep, stdDevBw, sep, stdDevTime, sep);
printf(" StandardDev %c %8.3f GB/s %c %8.3f ms %c\n", sep, stdDevBw, sep, stdDevTime, sep);
}
}
}
printf(" Aggregate (CPU) %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c Overhead: %.3f ms\n",
printf(" Aggregate (CPU) %c %8.3f GB/s %c %8.3f ms %c %12lu bytes %c Overhead: %.3f ms\n",
sep, results.avgTotalBandwidthGbPerSec,
sep, results.avgTotalDurationMsec,
sep, results.totalBytesTransferred,
......
......@@ -28,9 +28,9 @@ THE SOFTWARE.
#include "TransferBench.hpp"
#include "EnvVars.hpp"
size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<26);
size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<28);
char const ExeTypeName[4][4] = {"CPU", "GPU", "DMA", "IBV"};
char const ExeTypeName[5][4] = {"CPU", "GPU", "DMA", "NIC", "NIC"};
// Display detected hardware
void DisplayTopology(bool outputToCsv);
......
......@@ -100,6 +100,14 @@ public:
int outputToCsv; // Output in CSV format
int samplingFactor; // Affects how many different values of N are generated (when N set to 0)
// NIC options
int ibGidIndex; // GID Index for RoCE NICs
int roceVersion; // RoCE version number
int ipAddressFamily; // IP Address Famliy
uint8_t ibPort; // NIC port number to be used
int nicRelaxedOrder; // Use relaxed ordering for RDMA
std::string closestNicStr; // Holds the user-specified list of closest NICs
// Developer features
int gpuMaxHwQueues; // Tracks GPU_MAX_HW_QUEUES environment variable
......@@ -147,8 +155,16 @@ public:
validateDirect = GetEnvVar("VALIDATE_DIRECT" , 0);
validateSource = GetEnvVar("VALIDATE_SOURCE" , 0);
ibGidIndex = GetEnvVar("IB_GID_INDEX" ,-1);
ibPort = GetEnvVar("IB_PORT_NUMBER" , 1);
roceVersion = GetEnvVar("ROCE_VERSION" , 2);
ipAddressFamily = GetEnvVar("IP_ADDRESS_FAMILY" , 4);
nicRelaxedOrder = GetEnvVar("NIC_RELAX_ORDER" , 1);
closestNicStr = GetEnvVar("CLOSEST_NIC" , "");
gpuMaxHwQueues = GetEnvVar("GPU_MAX_HW_QUEUES" , 4);
// Check for fill pattern
char* pattern = getenv("FILL_PATTERN");
if (pattern != NULL) {
......@@ -279,18 +295,32 @@ public:
printf(" BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n");
printf(" BLOCK_BYTES - Controls granularity of how work is divided across subExecutors\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4\n");
#if NIC_EXEC_ENABLED
printf(" CLOSEST_NIC - Comma-separated list of per-GPU closest NIC (default=auto)\n");
#endif
printf(" CU_MASK - CU mask for streams. Can specify ranges e.g '5,10-12,14'\n");
printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n");
printf(" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d\n", TransferBench::GetIntAttribute(ATR_GFX_MAX_UNROLL));
printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays\n");
printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n");
printf(" HIDE_ENV - Hide environment variable value listing\n");
#if NIC_EXEC_ENABLED
printf(" IB_GID_INDEX - Required for RoCE NICs (default=-1/auto)\n");
printf(" IB_PORT_NUMBER - RDMA port count for RDMA NIC (default=1)\n");
printf(" IP_ADDRESS_FAMILY - IP address family (4=v4, 6=v6, default=v4)\n");
#endif
printf(" MIN_VAR_SUBEXEC - Minumum # of subexecutors to use for variable subExec Transfers\n");
printf(" MAX_VAR_SUBEXEC - Maximum # of subexecutors to use for variable subExec Transfers (0 for device limits)\n");
#if NIC_EXEC_ENABLED
printf(" NIC_RELAX_ORDER - Set to non-zero to use relaxed ordering");
#endif
printf(" NUM_ITERATIONS - # of timed iterations per test. If negative, run for this many seconds instead\n");
printf(" NUM_SUBITERATIONS - # of sub-iterations to run per iteration. Must be non-negative\n");
printf(" NUM_WARMUPS - # of untimed warmup iterations per test\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
#if NIC_EXEC_ENABLED
printf(" ROCE_VERSION - RoCE version (default=2)\n");
#endif
printf(" SAMPLING_FACTOR - Add this many samples (when possible) between powers of 2 when auto-generating data sizes\n");
printf(" SHOW_ITERATIONS - Show per-iteration timing info\n");
printf(" USE_HIP_EVENTS - Use HIP events for GFX executor timing\n");
......@@ -301,6 +331,7 @@ public:
printf(" VALIDATE_SOURCE - Validate GPU src memory immediately after preparation\n");
}
void Print(std::string const& name, int32_t const value, const char* format, ...) const
{
printf("%-20s%s%12d%s", name.c_str(), outputToCsv ? "," : " = ", value, outputToCsv ? "," : " : ");
......@@ -325,9 +356,12 @@ public:
void DisplayEnvVars() const
{
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
std::string nicSupport = "";
#if NIC_EXEC_ENABLED
nicSupport = " (with NIC support)";
#endif
if (!outputToCsv) {
printf("TransferBench v%s.%s\n", TransferBench::VERSION, CLIENT_VERSION);
printf("TransferBench v%s.%s%s\n", TransferBench::VERSION, CLIENT_VERSION, nicSupport.c_str());
printf("===============================================================\n");
if (!hideEnv) printf("[Common] (Suppress by setting HIDE_ENV=1)\n");
}
......@@ -341,6 +375,10 @@ public:
"Each CU gets a mulitple of %d bytes to copy", blockBytes);
Print("BYTE_OFFSET", byteOffset,
"Using byte offset of %d", byteOffset);
#if NIC_EXEC_ENABLED
Print("CLOSEST_NIC", (closestNicStr == "" ? "auto" : "user-input"),
"Per-GPU closest NIC is set as %s", (closestNicStr == "" ? "auto" : closestNicStr.c_str()));
#endif
Print("CU_MASK", getenv("CU_MASK") ? 1 : 0,
"%s", (cuMask.size() ? GetCuMaskDesc().c_str() : "All"));
Print("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
......@@ -359,11 +397,24 @@ public:
gfxWaveOrder == 3 ? "Wavefront,CU,Unroll" :
gfxWaveOrder == 4 ? "CU,Unroll,Wavefront" :
"CU,Wavefront,Unroll"));
#if NIC_EXEC_ENABLED
Print("IP_ADDRESS_FAMILY", ipAddressFamily,
"IP address family is set to IPv%d", ipAddressFamily);
Print("IB_GID_INDEX", ibGidIndex,
"RoCE GID index is set to %s", (ibGidIndex < 0 ? "auto" : std::to_string(ibGidIndex).c_str()));
Print("IB_PORT_NUMBER", ibPort,
"IB port number is set to %d", ibPort);
#endif
Print("MIN_VAR_SUBEXEC", minNumVarSubExec,
"Using at least %d subexecutor(s) for variable subExec tranfers", minNumVarSubExec);
Print("MAX_VAR_SUBEXEC", maxNumVarSubExec,
"Using up to %s subexecutors for variable subExec transfers",
maxNumVarSubExec ? std::to_string(maxNumVarSubExec).c_str() : "all available");
#if NIC_EXEC_ENABLED
Print("NIC_RELAX_ORDER", nicRelaxedOrder,
"Using %s ordering for NIC RDMA", nicRelaxedOrder ? "relaxed" : "strict");
#endif
Print("NUM_ITERATIONS", numIterations,
(numIterations == 0) ? "Running infinitely" :
"Running %d %s", abs(numIterations), (numIterations > 0 ? " timed iteration(s)" : "seconds(s) per Test"));
......@@ -371,6 +422,10 @@ public:
"Running %s subiterations", (numSubIterations == 0 ? "infinite" : std::to_string(numSubIterations)).c_str());
Print("NUM_WARMUPS", numWarmups,
"Running %d warmup iteration(s) per Test", numWarmups);
#if NIC_EXEC_ENABLED
Print("ROCE_VERSION", roceVersion,
"RoCE version is set to %d", roceVersion);
#endif
Print("SHOW_ITERATIONS", showIterations,
"%s per-iteration timing", showIterations ? "Showing" : "Hiding");
Print("USE_HIP_EVENTS", useHipEvents,
......@@ -381,7 +436,6 @@ public:
"Running in %s mode", useInteractive ? "interactive" : "non-interactive");
Print("USE_SINGLE_STREAM", useSingleStream,
"Using single stream per GFX %s", useSingleStream ? "device" : "Transfer");
if (getenv("XCC_PREF_TABLE")) {
printf("%36s: Preferred XCC Table (XCC_PREF_TABLE)\n", "");
printf("%36s: ", "");
......@@ -479,6 +533,27 @@ public:
cfg.gfx.useSingleTeam = gfxSingleTeam;
cfg.gfx.waveOrder = gfxWaveOrder;
cfg.nic.ibGidIndex = ibGidIndex;
cfg.nic.ibPort = ibPort;
cfg.nic.ipAddressFamily = ipAddressFamily;
cfg.nic.useRelaxedOrder = nicRelaxedOrder;
cfg.nic.roceVersion = roceVersion;
std::vector<int> closestNics;
if(closestNicStr != "") {
std::stringstream ss(closestNicStr);
std::string item;
while (std::getline(ss, item, ',')) {
try {
int nic = std::stoi(item);
closestNics.push_back(nic);
} catch (const std::invalid_argument& e) {
printf("[ERROR] Invalid NIC index (%s) by user in %s\n", item.c_str(), closestNicStr.c_str());
exit(1);
}
}
cfg.nic.closestNics = closestNics;
}
return cfg;
}
};
......
......@@ -47,6 +47,7 @@ void AllToAllPreset(EnvVars& ev,
int a2aLocal = EnvVars::GetEnvVar("A2A_LOCAL" , 0);
int a2aMode = EnvVars::GetEnvVar("A2A_MODE" , 0);
int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 0);
int numSubExecs = EnvVars::GetEnvVar("NUM_SUB_EXEC" , 8);
int useDmaExec = EnvVars::GetEnvVar("USE_DMA_EXEC" , 0);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);
......@@ -60,6 +61,7 @@ void AllToAllPreset(EnvVars& ev,
ev.Print("A2A_LOCAL" , a2aLocal , "%s local transfers", a2aLocal ? "Include" : "Exclude");
ev.Print("A2A_MODE" , a2aMode , a2aModeStr[a2aMode]);
ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus);
ev.Print("NUM_QUEUE_PAIRS", numQueuePairs, "Using %d queue pairs for NIC transfers", numQueuePairs);
ev.Print("NUM_SUB_EXEC" , numSubExecs , "Using %d subexecutors/CUs per Transfer", numSubExecs);
ev.Print("USE_DMA_EXEC" , useDmaExec , "Using %s executor", useDmaExec ? "DMA" : "GFX");
ev.Print("USE_FINE_GRAIN" , useFineGrain , "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
......@@ -114,6 +116,23 @@ void AllToAllPreset(EnvVars& ev,
}
}
// Create a ring using NICs
std::vector<int> nicTransferIdx(numGpus);
if (numQueuePairs > 0) {
int numNics = TransferBench::GetNumExecutors(EXE_NIC);
for (int i = 0; i < numGpus; i++) {
TransferBench::Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
transfer.srcs.push_back({memType, i});
transfer.dsts.push_back({memType, (i+1) % numGpus});
transfer.exeDevice = {TransferBench::EXE_NIC_NEAREST, i};
transfer.exeSubIndex = (i+1) % numGpus;
transfer.numSubExecs = numQueuePairs;
nicTransferIdx[i] = transfers.size();
transfers.push_back(transfer);
}
}
printf("GPU-GFX All-To-All benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes between %s pairs of GPUs using %d CUs (%lu Transfers)\n",
......@@ -138,15 +157,18 @@ void AllToAllPreset(EnvVars& ev,
printf("SRC\\DST ");
for (int dst = 0; dst < numGpus; dst++)
printf("%cGPU %02d ", separator, dst);
if (numQueuePairs > 0)
printf("%cNIC(%02d QP)", separator, numQueuePairs);
printf(" %cSTotal %cActual\n", separator, separator);
double totalBandwidthGpu = 0.0;
double minExecutorBandwidth = std::numeric_limits<double>::max();
double maxExecutorBandwidth = 0.0;
std::vector<double> colTotalBandwidth(numGpus+1, 0.0);
double minActualBandwidth = std::numeric_limits<double>::max();
double maxActualBandwidth = 0.0;
std::vector<double> colTotalBandwidth(numGpus+2, 0.0);
for (int src = 0; src < numGpus; src++) {
double rowTotalBandwidth = 0;
double executorBandwidth = 0;
int transferCount = 0;
double minBandwidth = std::numeric_limits<double>::max();
printf("GPU %02d", src);
for (int dst = 0; dst < numGpus; dst++) {
if (reIndex.count(std::make_pair(src, dst))) {
......@@ -155,24 +177,38 @@ void AllToAllPreset(EnvVars& ev,
colTotalBandwidth[dst] += r.avgBandwidthGbPerSec;
rowTotalBandwidth += r.avgBandwidthGbPerSec;
totalBandwidthGpu += r.avgBandwidthGbPerSec;
executorBandwidth = std::max(executorBandwidth,
results.exeResults[transfers[transferIdx].exeDevice].avgBandwidthGbPerSec);
minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
transferCount++;
printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
} else {
printf("%c%8s ", separator, "N/A");
}
}
printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator, executorBandwidth);
minExecutorBandwidth = std::min(minExecutorBandwidth, executorBandwidth);
maxExecutorBandwidth = std::max(maxExecutorBandwidth, executorBandwidth);
colTotalBandwidth[numGpus] += rowTotalBandwidth;
if (numQueuePairs > 0) {
TransferBench::TransferResult const& r = results.tfrResults[nicTransferIdx[src]];
colTotalBandwidth[numGpus] += r.avgBandwidthGbPerSec;
rowTotalBandwidth += r.avgBandwidthGbPerSec;
totalBandwidthGpu += r.avgBandwidthGbPerSec;
minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
transferCount++;
printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
}
double actualBandwidth = minBandwidth * transferCount;
printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator, actualBandwidth);
minActualBandwidth = std::min(minActualBandwidth, actualBandwidth);
maxActualBandwidth = std::max(maxActualBandwidth, actualBandwidth);
colTotalBandwidth[numGpus+1] += rowTotalBandwidth;
}
printf("\nRTotal");
for (int dst = 0; dst < numGpus; dst++) {
printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
}
printf(" %c%8.3f %c%8.3f %c%8.3f\n", separator, colTotalBandwidth[numGpus],
separator, minExecutorBandwidth, separator, maxExecutorBandwidth);
if (numQueuePairs > 0) {
printf("%c%8.3f ", separator, colTotalBandwidth[numGpus]);
}
printf(" %c%8.3f %c%8.3f %c%8.3f\n", separator, colTotalBandwidth[numGpus+1],
separator, minActualBandwidth, separator, maxActualBandwidth);
printf("\n");
printf("Average bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu / transfers.size());
......
......@@ -22,19 +22,21 @@ THE SOFTWARE.
// Appends one Test to a sweep log in configuration-file syntax.
//   fp        - Destination stream. May be NULL (e.g. the log file could not be opened),
//               in which case nothing is written.
//   testNum   - Test number, emitted as a "# Test N" comment line
//   transfers - Transfers making up the Test. They are written on a single line that starts
//               with the negated Transfer count, followed by one
//               "(src->exe->dst numSubExecs numBytes)" tuple per Transfer.
void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers)
{
  // Logging is best-effort: without a valid stream there is nothing to do
  if (!fp) return;

  fprintf(fp, "# Test %d\n", testNum);
  fprintf(fp, "%d", -1 * (int)transfers.size());
  for (auto const& transfer : transfers) {
    fprintf(fp, " (%s->%c%d->%s %d %lu)",
            MemDevicesToStr(transfer.srcs).c_str(),
            ExeTypeStr[transfer.exeDevice.exeType], transfer.exeDevice.exeIndex,
            MemDevicesToStr(transfer.dsts).c_str(),
            transfer.numSubExecs,
            transfer.numBytes);
  }
  fprintf(fp, "\n");

  // Flush after every Test so the log is up to date on disk
  fflush(fp);
}
void SweepPreset(EnvVars& ev,
......@@ -54,6 +56,7 @@ void SweepPreset(EnvVars& ev,
int numGpuSubExecs = EnvVars::GetEnvVar("NUM_GPU_SE" , 4);
std::string sweepDst = EnvVars::GetEnvVar("SWEEP_DST" , "CG");
std::string sweepExe = EnvVars::GetEnvVar("SWEEP_EXE" , "CDG");
std::string sweepFile = EnvVars::GetEnvVar("SWEEP_FILE" , "/tmp/lastSweep.cfg");
int sweepMax = EnvVars::GetEnvVar("SWEEP_MAX" , 24);
int sweepMin = EnvVars::GetEnvVar("SWEEP_MIN" , 1);
int sweepRandBytes = EnvVars::GetEnvVar("SWEEP_RAND_BYTES" , 0);
......@@ -78,6 +81,7 @@ void SweepPreset(EnvVars& ev,
ev.Print("NUM_GPU_SE", numGpuSubExecs, "Using %d subExecutors/CUs per GPU executed Transfer", numGpuSubExecs);
ev.Print("SWEEP_DST", sweepDst.c_str(), "Destination Memory Types to sweep");
ev.Print("SWEEP_EXE", sweepExe.c_str(), "Executor Types to sweep");
ev.Print("SWEEP_FILE", sweepFile.c_str(),"File to store the executing sweep configuration");
ev.Print("SWEEP_MAX", sweepMax, "Max simultaneous transfers (0 = no limit)");
ev.Print("SWEEP_MIN", sweepMin, "Min simultaenous transfers");
ev.Print("SWEEP_RAND_BYTES", sweepRandBytes, "Using %s number of bytes per Transfer", (sweepRandBytes ? "random" : "constant"));
......@@ -283,10 +287,14 @@ void SweepPreset(EnvVars& ev,
std::uniform_int_distribution<int> distribution(sweepMin, maxParallelTransfers);
// Log sweep to configuration file
FILE *fp = fopen("lastSweep.cfg", "w");
char absPath[1024];
auto const res = realpath(sweepFile.c_str(), absPath);
FILE *fp = fopen(sweepFile.c_str(), "w");
if (!fp) {
printf("[ERROR] Unable to open lastSweep.cfg. Check permissions\n");
exit(1);
printf("[WARN] Unable to open %s. Skipping output of sweep configuration file\n", res ? absPath : sweepFile.c_str());
} else {
printf("Sweep configuration saved to: %s\n", res ? absPath : sweepFile.c_str());
}
// Create bitmask of numPossible triplets, of which M will be chosen
......@@ -333,7 +341,7 @@ void SweepPreset(EnvVars& ev,
// Check for test limit
if (numTestsRun == sweepTestLimit) {
printf("Test limit reached\n");
printf("Sweep Test limit reached\n");
break;
}
......@@ -341,7 +349,7 @@ void SweepPreset(EnvVars& ev,
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double totalCpuTime = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
if (sweepTimeLimit && totalCpuTime > sweepTimeLimit) {
printf("Time limit exceeded\n");
printf("Sweep Time limit exceeded\n");
break;
}
......@@ -357,5 +365,5 @@ void SweepPreset(EnvVars& ev,
bitmask[i] = (i < M) ? 1 : 0;
}
}
fclose(fp);
if (fp) fclose(fp);
}
......@@ -38,21 +38,53 @@ static int RemappedCpuIndex(int origIdx)
return remappingCpu[origIdx];
}
// Prints one table row per entry of the IBVerbs device list: index, device name, whether it
// has an active port, PCIe bus ID, NUMA node, and the comma-separated GPUs whose closest NIC
// is that device. The body is compiled out entirely unless NIC_EXEC_ENABLED is defined.
//   outputToCsv - When set, the dashed divider under the header row is omitted
static void PrintNicToGPUTopo(bool outputToCsv)
{
#ifdef NIC_EXEC_ENABLED
  // Header row, followed by a divider for human-readable output only
  printf(" NIC | Device Name | Active | PCIe Bus ID | NUMA | Closest GPU(s)\n");
  if (!outputToCsv) {
    printf("-----+-------------+--------+--------------+------+---------------\n");
  }

  int const gpuCount = TransferBench::GetNumExecutors(EXE_GPU_GFX);
  auto const& nics = GetIbvDeviceList();
  for (int nicIdx = 0; nicIdx < nics.size(); ++nicIdx) {
    auto const& nic = nics[nicIdx];

    // Collect every GPU that reports this NIC as its closest one
    std::string gpuList;
    for (int gpuIdx = 0; gpuIdx < gpuCount; ++gpuIdx) {
      if (TransferBench::GetClosestNicToGpu(gpuIdx) != nicIdx) continue;
      if (!gpuList.empty()) gpuList += ",";
      gpuList += std::to_string(gpuIdx);
    }

    printf(" %-3d | %-11s | %-6s | %-12s | %-4d | %-20s\n",
           nicIdx,
           nic.name.c_str(),
           nic.hasActivePort ? "Yes" : "No",
           nic.busId.c_str(),
           nic.numaNode,
           gpuList.c_str());
  }
  printf("\n");
#endif
}
void DisplayTopology(bool outputToCsv)
{
int numCpus = TransferBench::GetNumExecutors(EXE_CPU);
int numGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
int numNics = TransferBench::GetNumExecutors(EXE_NIC);
char sep = (outputToCsv ? ',' : '|');
if (outputToCsv) {
printf("NumCpus,%d\n", numCpus);
printf("NumGpus,%d\n", numGpus);
printf("NumNics,%d\n", numNics);
} else {
printf("\nDetected Topology:\n");
printf("==================\n");
printf(" %d configured CPU NUMA node(s) [%d total]\n", numCpus, numa_max_node() + 1);
printf(" %d GPU device(s)\n", numGpus);
printf(" %d Supported NIC device(s)\n", numNics);
}
// Print out detected CPU topology
......@@ -91,8 +123,10 @@ void DisplayTopology(bool outputToCsv)
}
printf("\n");
// Print out detected GPU topology
// Print out detected NIC topology
PrintNicToGPUTopo(outputToCsv);
// Print out detected GPU topology
#if defined(__NVCC__)
for (int i = 0; i < numGpus; i++) {
hipDeviceProp_t prop;
......@@ -118,12 +152,12 @@ void DisplayTopology(bool outputToCsv)
printf(" %c", sep);
for (int j = 0; j < numGpus; j++)
printf(" GPU %02d %c", j, sep);
printf(" PCIe Bus ID %c #CUs %c NUMA %c #DMA %c #XCC\n", sep, sep, sep, sep);
printf(" PCIe Bus ID %c #CUs %c NUMA %c #DMA %c #XCC %c NIC\n", sep, sep, sep, sep, sep);
if (!outputToCsv) {
for (int j = 0; j <= numGpus; j++)
printf("--------+");
printf("--------------+------+------+------+------\n");
printf("--------------+------+------+------+------+------\n");
}
// Loop over each GPU device
......@@ -149,12 +183,13 @@ void DisplayTopology(bool outputToCsv)
char pciBusId[20];
HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i));
printf(" %11s %c %4d %c %4d %c %4d %c %4d\n",
printf(" %-11s %c %-4d %c %-4d %c %-4d %c %-4d %c %-4d\n",
pciBusId, sep,
TransferBench::GetNumSubExecutors({EXE_GPU_GFX, i}), sep,
TransferBench::GetClosestCpuNumaToGpu(i), sep,
TransferBench::GetNumExecutorSubIndices({EXE_GPU_DMA, i}), sep,
TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, i}));
TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, i}), sep,
TransferBench::GetClosestNicToGpu(i));
}
#endif
}
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment