Unverified Commit 3d284dd7 authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

v1.27 CU Masking, cmdline preset, CUDA fixes (#53)

parent 7608befb
# Changelog for TransferBench
## v1.27
### Added
- Adding cmdline preset to allow specifying simple tests on the command line
- E.g. ./TransferBench cmdline 64M "1 4 G0->G0->G1"
- Adding environment variable HIDE_ENV, which skips printing of environment variable values
- Adding environment variable CU_MASK, which allows selection of which CUs to execute on
- CU_MASK is specified in CU indices (0-#CUs-1), and '-' can be used to denote ranges of values
- E.g.: CU_MASK=3-8,16 would request Transfer be executed only on CUs 3,4,5,6,7,8,16
- NOTE: This is somewhat experimental and may not work on all hardware
- SHOW_ITERATIONS now shows CU usage for that iteration (experimental)
### Modified
- Adding extra comments on commonly missing includes with details on how to install them
### Fixed
- CUDA compilation should work again (wall_clock64 CUDA alias was not defined)
## v1.26
### Added
- Setting SHOW_ITERATIONS=1 provides additional information about per-iteration timing for file and p2p configs
......
......@@ -22,7 +22,8 @@ THE SOFTWARE.
// This program measures simultaneous copy performance across multiple GPUs
// on the same node
#include <numa.h>
#include <numa.h> // If not found, try installing libnuma-dev (e.g apt-get install libnuma-dev)
#include <cmath> // If not found, try installing g++-12 (e.g apt-get install g++-12)
#include <numaif.h>
#include <random>
#include <stack>
......@@ -115,6 +116,48 @@ int main(int argc, char **argv)
RunAllToAllBenchmark(ev, numBytesPerTransfer, numSubExecs);
exit(0);
}
else if (!strcmp(argv[1], "cmdline"))
{
// Print environment variables and CSV header
ev.DisplayEnvVars();
if (ev.outputToCsv)
{
printf("Test#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),SrcAddr,DstAddr\n");
}
// Read Transfer from command line
std::string cmdlineTransfer;
for (int i = 3; i < argc; i++)
cmdlineTransfer += std::string(argv[i]) + " ";
char line[2048];
sprintf(line, "%s", cmdlineTransfer.c_str());
std::vector<Transfer> transfers;
ParseTransfers(line, ev.numCpuDevices, ev.numGpuDevices, transfers);
if (transfers.empty()) exit(0);
// If the number of bytes is specified, use it
if (numBytesPerTransfer != 0)
{
size_t N = numBytesPerTransfer / sizeof(float);
ExecuteTransfers(ev, 1, N, transfers);
}
else
{
// Otherwise generate a range of values
for (int N = 256; N <= (1<<27); N *= 2)
{
int delta = std::max(1, N / ev.samplingFactor);
int curr = N;
while (curr < N * 2)
{
ExecuteTransfers(ev, 1, curr, transfers);
curr += delta;
}
}
}
exit(0);
}
// Check that Transfer configuration file can be opened
ev.configMode = CFG_FILE;
......@@ -252,7 +295,16 @@ void ExecuteTransfers(EnvVars const& ev,
exeInfo.stopEvents.resize(numStreamsToUse);
for (int i = 0; i < numStreamsToUse; ++i)
{
HIP_CALL(hipStreamCreate(&exeInfo.streams[i]));
if (ev.cuMask.size())
{
#if !defined(__NVCC__)
HIP_CALL(hipExtStreamCreateWithCUMask(&exeInfo.streams[i], ev.cuMask.size(), ev.cuMask.data()));
#endif
}
else
{
HIP_CALL(hipStreamCreate(&exeInfo.streams[i]));
}
HIP_CALL(hipEventCreate(&exeInfo.startEvents[i]));
HIP_CALL(hipEventCreate(&exeInfo.stopEvents[i]));
}
......@@ -261,8 +313,13 @@ void ExecuteTransfers(EnvVars const& ev,
{
// Allocate one contiguous chunk of GPU memory for threadblock parameters
// This allows support for executing one transfer per stream, or all transfers in a single stream
#if !defined(__NVCC__)
AllocateMemory(MEM_GPU, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam),
(void**)&exeInfo.subExecParamGpu);
#else
AllocateMemory(MEM_CPU, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam),
(void**)&exeInfo.subExecParamGpu);
#endif
}
}
}
......@@ -538,7 +595,10 @@ void ExecuteTransfers(EnvVars const& ev,
{
double iterDurationMsec = t.first;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d | %7.3f GB/s | %8.3f ms |\n", t.second, iterBandwidthGbs, iterDurationMsec);
printf(" Iter %03d | %7.3f GB/s | %8.3f ms | CUs:", t.second, iterBandwidthGbs, iterDurationMsec);
for (auto x : transfer->perIterationCUs[t.second - 1])
printf(" %2d", x);
printf("\n");
}
printf(" StandardDev | %7.3f GB/s | %8.3f ms |\n", stdDevBw, stdDevTime);
}
......@@ -608,7 +668,11 @@ cleanup:
if (exeType == EXE_GPU_GFX)
{
#if !defined(__NVCC__)
DeallocateMemory(MEM_GPU, exeInfo.subExecParamGpu);
#else
DeallocateMemory(MEM_CPU, exeInfo.subExecParamGpu);
#endif
}
}
}
......@@ -641,6 +705,7 @@ void DisplayUsage(char const* cmdName)
printf(" - 4rd optional arg: GPU index to use as executor\n");
printf(" a2a - GPU All-To-All benchmark\n");
printf(" - 3rd optional arg: # of SubExecs to use\n");
printf(" cmdline - Read Transfers from command line arguments (after N)\n");
printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
DEFAULT_BYTES_PER_TRANSFER);
......@@ -1182,6 +1247,15 @@ void CheckPages(char* array, size_t numBytes, int targetId)
}
}
// Converts a raw HW_REG_HW_ID register value into a compact CU identifier
// Bit positions are based on instinct-mi200-cdna2-instruction-set-architecture.pdf
uint32_t GetId(uint32_t hwId)
{
  uint32_t const seId = (hwId >> 13) & 3;  // Shader engine ID
  uint32_t const shId = (hwId >> 12) & 1;  // Shader array ID within the engine
  uint32_t const cuId = (hwId >>  8) & 7;  // Compute unit ID within the array
  // Pack the three fields into one index: bit 5 = SH, bits 4:2 = CU, bits 1:0 = SE
  return (shId << 5) | (cuId << 2) | seId;
}
void RunTransfer(EnvVars const& ev, int const iteration,
ExecutorInfo& exeInfo, int const transferIdx)
{
......@@ -1239,7 +1313,13 @@ void RunTransfer(EnvVars const& ev, int const iteration,
double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate);
currTransfer->transferTime += iterationTimeMs;
if (ev.showIterations)
{
currTransfer->perIterationTime.push_back(iterationTimeMs);
std::set<int> CUs;
for (int i = 0; i < currTransfer->numSubExecs; i++)
CUs.insert(GetId(currTransfer->subExecParamGpuPtr[i].hwId));
currTransfer->perIterationCUs.push_back(CUs);
}
}
exeInfo.totalTime += gpuDeltaMsec;
}
......@@ -1247,7 +1327,13 @@ void RunTransfer(EnvVars const& ev, int const iteration,
{
transfer->transferTime += gpuDeltaMsec;
if (ev.showIterations)
{
transfer->perIterationTime.push_back(gpuDeltaMsec);
std::set<int> CUs;
for (int i = 0; i < transfer->numSubExecs; i++)
CUs.insert(GetId(transfer->subExecParamGpuPtr[i].hwId));
transfer->perIterationCUs.push_back(CUs);
}
}
}
}
......@@ -1341,6 +1427,9 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
// Perform unidirectional / bidirectional
for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++)
{
if (ev.p2pMode == 1 && isBidirectional == 1 ||
ev.p2pMode == 2 && isBidirectional == 0) continue;
printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n", isBidirectional ? "Bi" : "Uni",
ev.useRemoteRead ? "Remote" : "Local",
ev.useRemoteRead ? "Local" : "Remote",
......@@ -1372,7 +1461,6 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
printf("\n");
ExeType const gpuExeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
// Loop over all possible src/dst pairs
for (int src = 0; src < numDevices; src++)
{
......@@ -1506,7 +1594,6 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
// minBw
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "min");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++)
{
double const minBw = minBandwidth[dir][i];
......
......@@ -27,7 +27,7 @@ THE SOFTWARE.
#include <cuda_runtime.h>
// ROCm specific
#define __builtin_amdgcn_s_memrealtime clock64
#define wall_clock64 clock64
#define gcnArchName name
// Datatypes
......
......@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.26"
#define TB_VERSION "1.27"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
......@@ -68,6 +68,7 @@ public:
int blockBytes; // Each CU, except the last, gets a multiple of this many bytes to copy
int byteOffset; // Byte-offset for memory allocations
int continueOnError; // Continue tests even after mismatch detected
int hideEnv; // Skip printing of environment variable values
int numCpuDevices; // Number of CPU devices to use (defaults to # NUMA nodes detected)
int numGpuDevices; // Number of GPU devices to use (defaults to # HIP devices detected)
int numIterations; // Number of timed iterations to perform. If negative, run for -numIterations seconds instead
......@@ -83,12 +84,14 @@ public:
int validateDirect; // Validate GPU destination memory directly instead of staging GPU memory on host
std::vector<float> fillPattern; // Pattern of floats used to fill source data
std::vector<uint32_t> cuMask; // Bit-vector representing the CU mask
// Environment variables only for Benchmark-preset
int useRemoteRead; // Use destination memory type as executor instead of source memory type
int useDmaCopy; // Use DMA copy instead of GPU copy
int numGpuSubExecs; // Number of GPU subexecutors to use
// Environment variables only for P2P preset
int numCpuSubExecs; // Number of CPU subexecutors to use
int numGpuSubExecs; // Number of GPU subexecutors to use
int p2pMode; // Both = 0, Unidirectional = 1, Bidirectional = 2
int useDmaCopy; // Use DMA copy instead of GPU copy
int useRemoteRead; // Use destination memory type as executor instead of source memory type
// Environment variables only for Sweep-preset
int sweepMin; // Min number of simultaneous Transfers to be executed per test
......@@ -149,6 +152,7 @@ public:
blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
continueOnError = GetEnvVar("CONTINUE_ON_ERROR" , 0);
hideEnv = GetEnvVar("HIDE_ENV" , 0);
numCpuDevices = GetEnvVar("NUM_CPU_DEVICES" , numDetectedCpus);
numGpuDevices = GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus);
numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS);
......@@ -170,6 +174,7 @@ public:
useDmaCopy = GetEnvVar("USE_GPU_DMA" , 0);
numGpuSubExecs = GetEnvVar("NUM_GPU_SE" , useDmaCopy ? 1 : numDeviceCUs);
numCpuSubExecs = GetEnvVar("NUM_CPU_SE" , DEFAULT_P2P_NUM_CPU_SE);
p2pMode = GetEnvVar("P2P_MODE" , 0);
// Sweep related
sweepMin = GetEnvVar("SWEEP_MIN" , DEFAULT_SWEEP_MIN);
......@@ -252,6 +257,49 @@ public:
}
else fillPattern.clear();
// Check for CU mask
cuMask.clear();
char* cuMaskStr = getenv("CU_MASK");
if (cuMaskStr != NULL)
{
#if defined(__NVCC__)
printf("[WARN] CU_MASK is not supported in CUDA\n");
#else
std::vector<std::pair<int, int>> ranges;
int maxCU = 0;
char* token = strtok(cuMaskStr, ",");
while (token)
{
int start, end;
if (sscanf(token, "%d-%d", &start, &end) == 2)
{
ranges.push_back(std::make_pair(std::min(start, end), std::max(start, end)));
maxCU = std::max(maxCU, std::max(start, end));
}
else if (sscanf(token, "%d", &start) == 1)
{
ranges.push_back(std::make_pair(start, start));
maxCU = std::max(maxCU, start);
}
else
{
printf("[ERROR] Unrecognized token [%s]\n", token);
exit(1);
}
token = strtok(NULL, ",");
}
cuMask.resize(maxCU / 32 + 1, 0);
for (auto range : ranges)
{
for (int i = range.first; i <= range.second; i++)
{
cuMask[i / 32] |= (1 << (i % 32));
}
}
#endif
}
// Perform some basic validation
if (numCpuDevices > numDetectedCpus)
{
......@@ -376,7 +424,9 @@ public:
printf(" BLOCK_BYTES=B - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n");
printf(" CONTINUE_ON_ERROR - Continue tests even after mismatch detected\n");
printf(" CU_MASK - CU mask for streams specified in hex digits (0-0,a-f,A-F)\n");
printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
printf(" HIDE_ENV - Hide environment variable value listing\n");
printf(" NUM_CPU_DEVICES=X - Restrict number of CPUs to X. May not be greater than # detected NUMA nodes\n");
printf(" NUM_GPU_DEVICES=X - Restrict number of GPUs to X. May not be greater than # detected HIP devices\n");
printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n");
......@@ -406,10 +456,11 @@ public:
{
printf("TransferBench v%s\n", TB_VERSION);
printf("=====================================================\n");
printf("[Common]\n");
if (!hideEnv) printf("[Common]\n");
}
else
else if (!hideEnv)
printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION);
if (hideEnv) return;
PRINT_EV("BLOCK_BYTES", blockBytes,
std::string("Each CU gets a multiple of " + std::to_string(blockBytes) + " bytes to copy"));
......@@ -417,6 +468,8 @@ public:
std::string("Using byte offset of " + std::to_string(byteOffset)));
PRINT_EV("CONTINUE_ON_ERROR", continueOnError,
std::string(continueOnError ? "Continue on mismatch error" : "Stop after first error"));
PRINT_EV("CU_MASK", getenv("CU_MASK") ? 1 : 0,
(cuMask.size() ? GetCuMaskDesc() : "All"));
PRINT_EV("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
(fillPattern.size() ? std::string(getenv("FILL_PATTERN")) : PrepSrcValueString()));
PRINT_EV("GPU_KERNEL", gpuKernel,
......@@ -451,6 +504,7 @@ public:
void DisplayP2PBenchmarkEnvVars() const
{
DisplayEnvVars();
if (hideEnv) return;
if (!outputToCsv)
printf("[P2P Related]\n");
......@@ -459,6 +513,10 @@ public:
std::string("Using ") + std::to_string(numCpuSubExecs) + " CPU subexecutors");
PRINT_EV("NUM_GPU_SE", numGpuSubExecs,
std::string("Using ") + std::to_string(numGpuSubExecs) + " GPU subexecutors");
PRINT_EV("P2P_MODE", p2pMode,
std::string("Running ") + (p2pMode == 1 ? "Unidirectional" :
p2pMode == 2 ? "Bidirectional" :
"Unidirectional + Bidirectional"));
PRINT_EV("USE_GPU_DMA", useDmaCopy,
std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor");
PRINT_EV("USE_REMOTE_READ", useRemoteRead,
......@@ -470,6 +528,7 @@ public:
void DisplaySweepEnvVars() const
{
DisplayEnvVars();
if (hideEnv) return;
if (!outputToCsv)
printf("[Sweep Related]\n");
......@@ -512,6 +571,50 @@ public:
return getenv(varname.c_str());
return defaultValue;
}
// Returns a human-readable description of the CU mask, e.g. "CUs used: (7) 3-8,16"
// Scans the cuMask bit-vector and collapses consecutive set bits into closed ranges
std::string GetCuMaskDesc() const
{
  std::vector<std::pair<int, int>> runs;  // Closed [first,last] ranges of set bits
  bool inRun = false;                     // Currently inside a run of set bits
  std::pair<int, int> curr;
  int used = 0;                           // Total number of CUs selected

  for (int i = 0; i < cuMask.size(); i++)
  {
    for (int j = 0; j < 32; j++)
    {
      if (cuMask[i] & (1 << j))
      {
        used++;
        if (!inRun)
        {
          inRun = true;
          curr.first = i * 32 + j;
        }
      }
      else if (inRun)
      {
        // Run ended on the previous bit
        inRun = false;
        curr.second = i * 32 + j - 1;
        runs.push_back(curr);
      }
    }
  }
  // Bug fix: a run extending to the last bit of the mask was previously
  // closed but never pushed onto 'runs', so it was omitted from the output
  if (inRun)
  {
    curr.second = cuMask.size() * 32 - 1;
    runs.push_back(curr);
  }

  std::string result = "CUs used: (" + std::to_string(used) + ") ";
  for (int i = 0; i < runs.size(); i++)
  {
    if (i) result += ",";
    if (runs[i].first == runs[i].second) result += std::to_string(runs[i].first);
    else result += std::to_string(runs[i].first) + "-" + std::to_string(runs[i].second);
  }
  return result;
}
};
#endif
......@@ -41,8 +41,18 @@ struct SubExecParam
float* dst[MAX_DSTS]; // Destination array pointers
long long startCycle; // Start timestamp for in-kernel timing (GPU-GFX executor)
long long stopCycle; // Stop timestamp for in-kernel timing (GPU-GFX executor)
uint32_t hwId; // Hardware ID
};
// Macro for collecting HW_REG_HW_ID
// Records the hardware ID register of the executing wavefront into p.hwId so
// per-iteration CU usage can be reported (the value is later decoded by GetId)
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__NVCC__)
// NOTE(review): on these targets the register is not read and hwId is forced
// to 0 — presumably HW_REG_HW_ID is unavailable or laid out differently
// (gfx11xx) and CUDA has no equivalent instruction; confirm per-target ISA
#define __trace_hwreg() \
p.hwId = 0
#else
// Read HW_REG_HW_ID into a scalar register and store it in the param block
#define __trace_hwreg() \
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (p.hwId));
#endif
void CpuReduceKernel(SubExecParam const& p)
{
int const& numSrcs = p.numSrcs;
......@@ -211,6 +221,7 @@ GpuReduceKernel(SubExecParam* params)
{
p.startCycle = startCycle;
p.stopCycle = wall_clock64();
__trace_hwreg();
}
}
......
......@@ -81,7 +81,7 @@ char const ExeTypeName[3][4] = {"CPU", "GPU", "DMA"};
// Converts a single character (case-insensitive) into its MemType enum value
// Prints an error and exits if the character is not a recognized memory type
MemType inline CharToMemType(char const c)
{
  char const* val = strchr(MemTypeStr, toupper(c));
  // Check the pointer itself, not *val: strchr returns NULL when c is not
  // found, and dereferencing NULL is undefined behavior
  if (val) return (MemType)(val - MemTypeStr);
  printf("[ERROR] Unexpected memory type (%c)\n", c);
  exit(1);
}
......@@ -89,7 +89,7 @@ MemType inline CharToMemType(char const c)
// Converts a single character (case-insensitive) into its ExeType enum value
// Prints an error and exits if the character is not a recognized executor type
ExeType inline CharToExeType(char const c)
{
  char const* val = strchr(ExeTypeStr, toupper(c));
  // Check the pointer itself, not *val: strchr returns NULL when c is not
  // found, and dereferencing NULL is undefined behavior
  if (val) return (ExeType)(val - ExeTypeStr);
  printf("[ERROR] Unexpected executor type (%c)\n", c);
  exit(1);
}
......@@ -98,28 +98,29 @@ ExeType inline CharToExeType(char const c)
// then writes the summation to each of the specified destination memory location(s)
struct Transfer
{
int transferIndex; // Transfer identifier (within a Test)
ExeType exeType; // Transfer executor type
int exeIndex; // Executor index (NUMA node for CPU / device ID for GPU)
int numSubExecs; // Number of subExecutors to use for this Transfer
size_t numBytes; // # of bytes requested to Transfer (may be 0 to fallback to default)
size_t numBytesActual; // Actual number of bytes to copy
double transferTime; // Time taken in milliseconds
int numSrcs; // Number of sources
std::vector<MemType> srcType; // Source memory types
std::vector<int> srcIndex; // Source device indice
std::vector<float*> srcMem; // Source memory
int numDsts; // Number of destinations
std::vector<MemType> dstType; // Destination memory type
std::vector<int> dstIndex; // Destination device index
std::vector<float*> dstMem; // Destination memory
std::vector<SubExecParam> subExecParam; // Defines subarrays assigned to each threadblock
SubExecParam* subExecParamGpuPtr; // Pointer to GPU copy of subExecParam
std::vector<double> perIterationTime; // Per-iteration timing
int transferIndex; // Transfer identifier (within a Test)
ExeType exeType; // Transfer executor type
int exeIndex; // Executor index (NUMA node for CPU / device ID for GPU)
int numSubExecs; // Number of subExecutors to use for this Transfer
size_t numBytes; // # of bytes requested to Transfer (may be 0 to fallback to default)
size_t numBytesActual; // Actual number of bytes to copy
double transferTime; // Time taken in milliseconds
int numSrcs; // Number of sources
std::vector<MemType> srcType; // Source memory types
std::vector<int> srcIndex; // Source device indice
std::vector<float*> srcMem; // Source memory
int numDsts; // Number of destinations
std::vector<MemType> dstType; // Destination memory type
std::vector<int> dstIndex; // Destination device index
std::vector<float*> dstMem; // Destination memory
std::vector<SubExecParam> subExecParam; // Defines subarrays assigned to each threadblock
SubExecParam* subExecParamGpuPtr; // Pointer to GPU copy of subExecParam
std::vector<double> perIterationTime; // Per-iteration timing
std::vector<std::set<int>> perIterationCUs; // Per-iteration CU usage
// Prepares src/dst subarray pointers for each SubExecutor
void PrepareSubExecParams(EnvVars const& ev);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment