Unverified Commit 3d284dd7 authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

v1.27 CU Masking, cmdline preset, CUDA fixes (#53)

parent 7608befb
# Changelog for TransferBench # Changelog for TransferBench
## v1.27
### Added
- Adding cmdline preset to allow specifying simple tests on the command line
- E.g. ./TransferBench cmdline 64M "1 4 G0->G0->G1"
- Adding environment variable HIDE_ENV, which skips printing of environment variable values
- Adding environment variable CU_MASK, which allows selection of which CUs to execute on
- CU_MASK is specified in CU indices (0 to #CUs-1), and '-' can be used to denote ranges of values
- E.g.: CU_MASK=3-8,16 would request that Transfers be executed only on CUs 3,4,5,6,7,8,16
- NOTE: This is somewhat experimental and may not work on all hardware
- SHOW_ITERATIONS now shows CU usage for that iteration (experimental)
### Modified
- Adding extra comments on commonly missing includes with details on how to install them
### Fixed
- CUDA compilation should work again (wall_clock64 CUDA alias was not defined)
## v1.26 ## v1.26
### Added ### Added
- Setting SHOW_ITERATIONS=1 provides additional information about per-iteration timing for file and p2p configs - Setting SHOW_ITERATIONS=1 provides additional information about per-iteration timing for file and p2p configs
......
...@@ -22,7 +22,8 @@ THE SOFTWARE. ...@@ -22,7 +22,8 @@ THE SOFTWARE.
// This program measures simultaneous copy performance across multiple GPUs // This program measures simultaneous copy performance across multiple GPUs
// on the same node // on the same node
#include <numa.h> #include <numa.h> // If not found, try installing libnuma-dev (e.g apt-get install libnuma-dev)
#include <cmath> // If not found, try installing g++-12 (e.g apt-get install g++-12)
#include <numaif.h> #include <numaif.h>
#include <random> #include <random>
#include <stack> #include <stack>
...@@ -115,6 +116,48 @@ int main(int argc, char **argv) ...@@ -115,6 +116,48 @@ int main(int argc, char **argv)
RunAllToAllBenchmark(ev, numBytesPerTransfer, numSubExecs); RunAllToAllBenchmark(ev, numBytesPerTransfer, numSubExecs);
exit(0); exit(0);
} }
else if (!strcmp(argv[1], "cmdline"))
{
  // "cmdline" preset: the Transfer description is taken from the command line
  // instead of a configuration file.

  // Print environment variables and CSV header
  ev.DisplayEnvVars();
  if (ev.outputToCsv)
  {
    printf("Test#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),SrcAddr,DstAddr\n");
  }

  // Read Transfer from command line: every argument from argv[3] onwards is
  // re-joined with single spaces into one line of text
  std::string cmdlineTransfer;
  for (int i = 3; i < argc; i++)
    cmdlineTransfer += std::string(argv[i]) + " ";

  // ParseTransfers is handed a fixed-size character buffer, so reject input that
  // would not fit (including the NUL terminator) rather than overflowing the stack
  char line[2048];
  if (cmdlineTransfer.size() >= sizeof(line))
  {
    printf("[ERROR] Transfer specified on command line is too long (%zu characters, limit is %zu)\n",
           cmdlineTransfer.size(), sizeof(line) - 1);
    exit(1);
  }
  snprintf(line, sizeof(line), "%s", cmdlineTransfer.c_str());

  std::vector<Transfer> transfers;
  ParseTransfers(line, ev.numCpuDevices, ev.numGpuDevices, transfers);
  if (transfers.empty()) exit(0);

  // If the number of bytes is specified, use it
  if (numBytesPerTransfer != 0)
  {
    // ExecuteTransfers is given a count of floats, not bytes
    size_t N = numBytesPerTransfer / sizeof(float);
    ExecuteTransfers(ev, 1, N, transfers);
  }
  else
  {
    // Otherwise generate a range of values: for each power of two N in [256, 2^27],
    // sample [N, 2N) in steps of N / samplingFactor (at least 1)
    for (int N = 256; N <= (1<<27); N *= 2)
    {
      int delta = std::max(1, N / ev.samplingFactor);
      int curr = N;
      while (curr < N * 2)
      {
        ExecuteTransfers(ev, 1, curr, transfers);
        curr += delta;
      }
    }
  }
  exit(0);
}
// Check that Transfer configuration file can be opened // Check that Transfer configuration file can be opened
ev.configMode = CFG_FILE; ev.configMode = CFG_FILE;
...@@ -251,8 +294,17 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -251,8 +294,17 @@ void ExecuteTransfers(EnvVars const& ev,
exeInfo.startEvents.resize(numStreamsToUse); exeInfo.startEvents.resize(numStreamsToUse);
exeInfo.stopEvents.resize(numStreamsToUse); exeInfo.stopEvents.resize(numStreamsToUse);
for (int i = 0; i < numStreamsToUse; ++i) for (int i = 0; i < numStreamsToUse; ++i)
{
if (ev.cuMask.size())
{
#if !defined(__NVCC__)
HIP_CALL(hipExtStreamCreateWithCUMask(&exeInfo.streams[i], ev.cuMask.size(), ev.cuMask.data()));
#endif
}
else
{ {
HIP_CALL(hipStreamCreate(&exeInfo.streams[i])); HIP_CALL(hipStreamCreate(&exeInfo.streams[i]));
}
HIP_CALL(hipEventCreate(&exeInfo.startEvents[i])); HIP_CALL(hipEventCreate(&exeInfo.startEvents[i]));
HIP_CALL(hipEventCreate(&exeInfo.stopEvents[i])); HIP_CALL(hipEventCreate(&exeInfo.stopEvents[i]));
} }
...@@ -261,8 +313,13 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -261,8 +313,13 @@ void ExecuteTransfers(EnvVars const& ev,
{ {
// Allocate one contiguous chunk of GPU memory for threadblock parameters // Allocate one contiguous chunk of GPU memory for threadblock parameters
// This allows support for executing one transfer per stream, or all transfers in a single stream // This allows support for executing one transfer per stream, or all transfers in a single stream
#if !defined(__NVCC__)
AllocateMemory(MEM_GPU, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam), AllocateMemory(MEM_GPU, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam),
(void**)&exeInfo.subExecParamGpu); (void**)&exeInfo.subExecParamGpu);
#else
AllocateMemory(MEM_CPU, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam),
(void**)&exeInfo.subExecParamGpu);
#endif
} }
} }
} }
...@@ -538,7 +595,10 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -538,7 +595,10 @@ void ExecuteTransfers(EnvVars const& ev,
{ {
double iterDurationMsec = t.first; double iterDurationMsec = t.first;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f; double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d | %7.3f GB/s | %8.3f ms |\n", t.second, iterBandwidthGbs, iterDurationMsec); printf(" Iter %03d | %7.3f GB/s | %8.3f ms | CUs:", t.second, iterBandwidthGbs, iterDurationMsec);
for (auto x : transfer->perIterationCUs[t.second - 1])
printf(" %2d", x);
printf("\n");
} }
printf(" StandardDev | %7.3f GB/s | %8.3f ms |\n", stdDevBw, stdDevTime); printf(" StandardDev | %7.3f GB/s | %8.3f ms |\n", stdDevBw, stdDevTime);
} }
...@@ -608,7 +668,11 @@ cleanup: ...@@ -608,7 +668,11 @@ cleanup:
if (exeType == EXE_GPU_GFX) if (exeType == EXE_GPU_GFX)
{ {
#if !defined(__NVCC__)
DeallocateMemory(MEM_GPU, exeInfo.subExecParamGpu); DeallocateMemory(MEM_GPU, exeInfo.subExecParamGpu);
#else
DeallocateMemory(MEM_CPU, exeInfo.subExecParamGpu);
#endif
} }
} }
} }
...@@ -641,6 +705,7 @@ void DisplayUsage(char const* cmdName) ...@@ -641,6 +705,7 @@ void DisplayUsage(char const* cmdName)
printf(" - 4rd optional arg: GPU index to use as executor\n"); printf(" - 4rd optional arg: GPU index to use as executor\n");
printf(" a2a - GPU All-To-All benchmark\n"); printf(" a2a - GPU All-To-All benchmark\n");
printf(" - 3rd optional arg: # of SubExecs to use\n"); printf(" - 3rd optional arg: # of SubExecs to use\n");
printf(" cmdline - Read Transfers from command line arguments (after N)\n");
printf(" N : (Optional) Number of bytes to copy per Transfer.\n"); printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n", printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
DEFAULT_BYTES_PER_TRANSFER); DEFAULT_BYTES_PER_TRANSFER);
...@@ -1182,6 +1247,15 @@ void CheckPages(char* array, size_t numBytes, int targetId) ...@@ -1182,6 +1247,15 @@ void CheckPages(char* array, size_t numBytes, int targetId)
} }
} }
// Condenses a raw hardware ID value into a small index built from its
// SH (1 bit), CU (3 bits) and SE (2 bits) fields, laid out as [SH | CU | SE].
uint32_t GetId(uint32_t hwId)
{
  // Field positions are based on instinct-mi200-cdna2-instruction-set-architecture.pdf
  uint32_t const seField = (hwId >> 13) & 0x3;
  uint32_t const cuField = (hwId >> 8) & 0x7;
  uint32_t const shField = (hwId >> 12) & 0x1;

  // The three fields occupy disjoint bit positions of the result
  return (shField << 5) | (cuField << 2) | seField;
}
void RunTransfer(EnvVars const& ev, int const iteration, void RunTransfer(EnvVars const& ev, int const iteration,
ExecutorInfo& exeInfo, int const transferIdx) ExecutorInfo& exeInfo, int const transferIdx)
{ {
...@@ -1239,7 +1313,13 @@ void RunTransfer(EnvVars const& ev, int const iteration, ...@@ -1239,7 +1313,13 @@ void RunTransfer(EnvVars const& ev, int const iteration,
double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate); double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate);
currTransfer->transferTime += iterationTimeMs; currTransfer->transferTime += iterationTimeMs;
if (ev.showIterations) if (ev.showIterations)
{
currTransfer->perIterationTime.push_back(iterationTimeMs); currTransfer->perIterationTime.push_back(iterationTimeMs);
std::set<int> CUs;
for (int i = 0; i < currTransfer->numSubExecs; i++)
CUs.insert(GetId(currTransfer->subExecParamGpuPtr[i].hwId));
currTransfer->perIterationCUs.push_back(CUs);
}
} }
exeInfo.totalTime += gpuDeltaMsec; exeInfo.totalTime += gpuDeltaMsec;
} }
...@@ -1247,7 +1327,13 @@ void RunTransfer(EnvVars const& ev, int const iteration, ...@@ -1247,7 +1327,13 @@ void RunTransfer(EnvVars const& ev, int const iteration,
{ {
transfer->transferTime += gpuDeltaMsec; transfer->transferTime += gpuDeltaMsec;
if (ev.showIterations) if (ev.showIterations)
{
transfer->perIterationTime.push_back(gpuDeltaMsec); transfer->perIterationTime.push_back(gpuDeltaMsec);
std::set<int> CUs;
for (int i = 0; i < transfer->numSubExecs; i++)
CUs.insert(GetId(transfer->subExecParamGpuPtr[i].hwId));
transfer->perIterationCUs.push_back(CUs);
}
} }
} }
} }
...@@ -1341,6 +1427,9 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N) ...@@ -1341,6 +1427,9 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
// Perform unidirectional / bidirectional // Perform unidirectional / bidirectional
for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++) for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++)
{ {
if (ev.p2pMode == 1 && isBidirectional == 1 ||
ev.p2pMode == 2 && isBidirectional == 0) continue;
printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n", isBidirectional ? "Bi" : "Uni", printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n", isBidirectional ? "Bi" : "Uni",
ev.useRemoteRead ? "Remote" : "Local", ev.useRemoteRead ? "Remote" : "Local",
ev.useRemoteRead ? "Local" : "Remote", ev.useRemoteRead ? "Local" : "Remote",
...@@ -1372,7 +1461,6 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N) ...@@ -1372,7 +1461,6 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
printf("\n"); printf("\n");
ExeType const gpuExeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX; ExeType const gpuExeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
// Loop over all possible src/dst pairs // Loop over all possible src/dst pairs
for (int src = 0; src < numDevices; src++) for (int src = 0; src < numDevices; src++)
{ {
...@@ -1506,7 +1594,6 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N) ...@@ -1506,7 +1594,6 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
// minBw // minBw
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "min"); printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "min");
if (ev.outputToCsv) printf(","); if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++) for (int i = 0; i < numDevices; i++)
{ {
double const minBw = minBandwidth[dir][i]; double const minBw = minBandwidth[dir][i];
......
...@@ -27,7 +27,7 @@ THE SOFTWARE. ...@@ -27,7 +27,7 @@ THE SOFTWARE.
#include <cuda_runtime.h> #include <cuda_runtime.h>
// ROCm specific // ROCm specific
#define __builtin_amdgcn_s_memrealtime clock64 #define wall_clock64 clock64
#define gcnArchName name #define gcnArchName name
// Datatypes // Datatypes
......
...@@ -29,7 +29,7 @@ THE SOFTWARE. ...@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp" #include "Compatibility.hpp"
#include "Kernels.hpp" #include "Kernels.hpp"
#define TB_VERSION "1.26" #define TB_VERSION "1.27"
extern char const MemTypeStr[]; extern char const MemTypeStr[];
extern char const ExeTypeStr[]; extern char const ExeTypeStr[];
...@@ -68,6 +68,7 @@ public: ...@@ -68,6 +68,7 @@ public:
int blockBytes; // Each CU, except the last, gets a multiple of this many bytes to copy int blockBytes; // Each CU, except the last, gets a multiple of this many bytes to copy
int byteOffset; // Byte-offset for memory allocations int byteOffset; // Byte-offset for memory allocations
int continueOnError; // Continue tests even after mismatch detected int continueOnError; // Continue tests even after mismatch detected
int hideEnv; // Skip printing environment variable
int numCpuDevices; // Number of CPU devices to use (defaults to # NUMA nodes detected) int numCpuDevices; // Number of CPU devices to use (defaults to # NUMA nodes detected)
int numGpuDevices; // Number of GPU devices to use (defaults to # HIP devices detected) int numGpuDevices; // Number of GPU devices to use (defaults to # HIP devices detected)
int numIterations; // Number of timed iterations to perform. If negative, run for -numIterations seconds instead int numIterations; // Number of timed iterations to perform. If negative, run for -numIterations seconds instead
...@@ -83,12 +84,14 @@ public: ...@@ -83,12 +84,14 @@ public:
int validateDirect; // Validate GPU destination memory directly instead of staging GPU memory on host int validateDirect; // Validate GPU destination memory directly instead of staging GPU memory on host
std::vector<float> fillPattern; // Pattern of floats used to fill source data std::vector<float> fillPattern; // Pattern of floats used to fill source data
std::vector<uint32_t> cuMask; // Bit-vector representing the CU mask
// Environment variables only for Benchmark-preset // Environment variables only for P2P preset
int useRemoteRead; // Use destination memory type as executor instead of source memory type
int useDmaCopy; // Use DMA copy instead of GPU copy
int numGpuSubExecs; // Number of GPU subexecutors to use
int numCpuSubExecs; // Number of CPU subexecttors to use int numCpuSubExecs; // Number of CPU subexecttors to use
int numGpuSubExecs; // Number of GPU subexecutors to use
int p2pMode; // Both = 0, Unidirectional = 1, Bidirectional = 2
int useDmaCopy; // Use DMA copy instead of GPU copy
int useRemoteRead; // Use destination memory type as executor instead of source memory type
// Environment variables only for Sweep-preset // Environment variables only for Sweep-preset
int sweepMin; // Min number of simultaneous Transfers to be executed per test int sweepMin; // Min number of simultaneous Transfers to be executed per test
...@@ -149,6 +152,7 @@ public: ...@@ -149,6 +152,7 @@ public:
blockBytes = GetEnvVar("BLOCK_BYTES" , 256); blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0); byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
continueOnError = GetEnvVar("CONTINUE_ON_ERROR" , 0); continueOnError = GetEnvVar("CONTINUE_ON_ERROR" , 0);
hideEnv = GetEnvVar("HIDE_ENV" , 0);
numCpuDevices = GetEnvVar("NUM_CPU_DEVICES" , numDetectedCpus); numCpuDevices = GetEnvVar("NUM_CPU_DEVICES" , numDetectedCpus);
numGpuDevices = GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus); numGpuDevices = GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus);
numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS); numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS);
...@@ -170,6 +174,7 @@ public: ...@@ -170,6 +174,7 @@ public:
useDmaCopy = GetEnvVar("USE_GPU_DMA" , 0); useDmaCopy = GetEnvVar("USE_GPU_DMA" , 0);
numGpuSubExecs = GetEnvVar("NUM_GPU_SE" , useDmaCopy ? 1 : numDeviceCUs); numGpuSubExecs = GetEnvVar("NUM_GPU_SE" , useDmaCopy ? 1 : numDeviceCUs);
numCpuSubExecs = GetEnvVar("NUM_CPU_SE" , DEFAULT_P2P_NUM_CPU_SE); numCpuSubExecs = GetEnvVar("NUM_CPU_SE" , DEFAULT_P2P_NUM_CPU_SE);
p2pMode = GetEnvVar("P2P_MODE" , 0);
// Sweep related // Sweep related
sweepMin = GetEnvVar("SWEEP_MIN" , DEFAULT_SWEEP_MIN); sweepMin = GetEnvVar("SWEEP_MIN" , DEFAULT_SWEEP_MIN);
...@@ -252,6 +257,49 @@ public: ...@@ -252,6 +257,49 @@ public:
} }
else fillPattern.clear(); else fillPattern.clear();
// Check for CU mask
// CU_MASK is a comma-separated list of CU indices and/or inclusive ranges
// written as "first-last" (e.g. "3-8,16"). The result is stored in cuMask as a
// bit-vector of 32-bit words, where bit (i % 32) of word (i / 32) selects CU i.
cuMask.clear();
char* cuMaskStr = getenv("CU_MASK");
if (cuMaskStr != NULL)
{
#if defined(__NVCC__)
  printf("[WARN] CU_MASK is not supported in CUDA\n");
#else
  // strtok writes NUL terminators into the string it scans; tokenize a private
  // copy so that the process environment itself is never modified
  std::string cuMaskCopy(cuMaskStr);

  std::vector<std::pair<int, int>> ranges;
  int maxCU = 0;
  char* token = strtok(&cuMaskCopy[0], ",");
  while (token)
  {
    int start, end;
    if (sscanf(token, "%d-%d", &start, &end) == 2)
    {
      // Range: accepted in either order
    }
    else if (sscanf(token, "%d", &start) == 1)
    {
      // Single CU index
      end = start;
    }
    else
    {
      printf("[ERROR] Unrecognized token [%s]\n", token);
      exit(1);
    }

    // Negative indices would index and shift out of range when building the mask
    if (start < 0 || end < 0)
    {
      printf("[ERROR] CU_MASK indices must be non-negative [%s]\n", token);
      exit(1);
    }

    ranges.push_back(std::make_pair(std::min(start, end), std::max(start, end)));
    maxCU = std::max(maxCU, std::max(start, end));
    token = strtok(NULL, ",");
  }

  // One 32-bit word per 32 CUs, large enough to hold the highest requested index
  cuMask.resize(maxCU / 32 + 1, 0);
  for (auto const& range : ranges)
  {
    for (int i = range.first; i <= range.second; i++)
    {
      // Unsigned literal: shifting a signed 1 into bit 31 is not portable
      cuMask[i / 32] |= (1u << (i % 32));
    }
  }
#endif
}
// Perform some basic validation // Perform some basic validation
if (numCpuDevices > numDetectedCpus) if (numCpuDevices > numDetectedCpus)
{ {
...@@ -376,7 +424,9 @@ public: ...@@ -376,7 +424,9 @@ public:
printf(" BLOCK_BYTES=B - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n"); printf(" BLOCK_BYTES=B - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n"); printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n");
printf(" CONTINUE_ON_ERROR - Continue tests even after mismatch detected\n"); printf(" CONTINUE_ON_ERROR - Continue tests even after mismatch detected\n");
// CU_MASK takes CU indices and ranges (not hex digits), matching how the variable is parsed
printf(" CU_MASK - CU indices to execute on (0 to #CUs-1), comma-separated, '-' denotes a range (e.g. 3-8,16)\n");
printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n"); printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
printf(" HIDE_ENV - Hide environment variable value listing\n");
printf(" NUM_CPU_DEVICES=X - Restrict number of CPUs to X. May not be greater than # detected NUMA nodes\n"); printf(" NUM_CPU_DEVICES=X - Restrict number of CPUs to X. May not be greater than # detected NUMA nodes\n");
printf(" NUM_GPU_DEVICES=X - Restrict number of GPUs to X. May not be greater than # detected HIP devices\n"); printf(" NUM_GPU_DEVICES=X - Restrict number of GPUs to X. May not be greater than # detected HIP devices\n");
printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n"); printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n");
...@@ -406,10 +456,11 @@ public: ...@@ -406,10 +456,11 @@ public:
{ {
printf("TransferBench v%s\n", TB_VERSION); printf("TransferBench v%s\n", TB_VERSION);
printf("=====================================================\n"); printf("=====================================================\n");
printf("[Common]\n"); if (!hideEnv) printf("[Common]\n");
} }
else else if (!hideEnv)
printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION); printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION);
if (hideEnv) return;
PRINT_EV("BLOCK_BYTES", blockBytes, PRINT_EV("BLOCK_BYTES", blockBytes,
std::string("Each CU gets a multiple of " + std::to_string(blockBytes) + " bytes to copy")); std::string("Each CU gets a multiple of " + std::to_string(blockBytes) + " bytes to copy"));
...@@ -417,6 +468,8 @@ public: ...@@ -417,6 +468,8 @@ public:
std::string("Using byte offset of " + std::to_string(byteOffset))); std::string("Using byte offset of " + std::to_string(byteOffset)));
PRINT_EV("CONTINUE_ON_ERROR", continueOnError, PRINT_EV("CONTINUE_ON_ERROR", continueOnError,
std::string(continueOnError ? "Continue on mismatch error" : "Stop after first error")); std::string(continueOnError ? "Continue on mismatch error" : "Stop after first error"));
PRINT_EV("CU_MASK", getenv("CU_MASK") ? 1 : 0,
(cuMask.size() ? GetCuMaskDesc() : "All"));
PRINT_EV("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0, PRINT_EV("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
(fillPattern.size() ? std::string(getenv("FILL_PATTERN")) : PrepSrcValueString())); (fillPattern.size() ? std::string(getenv("FILL_PATTERN")) : PrepSrcValueString()));
PRINT_EV("GPU_KERNEL", gpuKernel, PRINT_EV("GPU_KERNEL", gpuKernel,
...@@ -451,6 +504,7 @@ public: ...@@ -451,6 +504,7 @@ public:
void DisplayP2PBenchmarkEnvVars() const void DisplayP2PBenchmarkEnvVars() const
{ {
DisplayEnvVars(); DisplayEnvVars();
if (hideEnv) return;
if (!outputToCsv) if (!outputToCsv)
printf("[P2P Related]\n"); printf("[P2P Related]\n");
...@@ -459,6 +513,10 @@ public: ...@@ -459,6 +513,10 @@ public:
std::string("Using ") + std::to_string(numCpuSubExecs) + " CPU subexecutors"); std::string("Using ") + std::to_string(numCpuSubExecs) + " CPU subexecutors");
PRINT_EV("NUM_GPU_SE", numGpuSubExecs, PRINT_EV("NUM_GPU_SE", numGpuSubExecs,
std::string("Using ") + std::to_string(numGpuSubExecs) + " GPU subexecutors"); std::string("Using ") + std::to_string(numGpuSubExecs) + " GPU subexecutors");
PRINT_EV("P2P_MODE", p2pMode,
std::string("Running ") + (p2pMode == 1 ? "Unidirectional" :
p2pMode == 2 ? "Bidirectional" :
"Unidirectional + Bidirectional"));
PRINT_EV("USE_GPU_DMA", useDmaCopy, PRINT_EV("USE_GPU_DMA", useDmaCopy,
std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor"); std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor");
PRINT_EV("USE_REMOTE_READ", useRemoteRead, PRINT_EV("USE_REMOTE_READ", useRemoteRead,
...@@ -470,6 +528,7 @@ public: ...@@ -470,6 +528,7 @@ public:
void DisplaySweepEnvVars() const void DisplaySweepEnvVars() const
{ {
DisplayEnvVars(); DisplayEnvVars();
if (hideEnv) return;
if (!outputToCsv) if (!outputToCsv)
printf("[Sweep Related]\n"); printf("[Sweep Related]\n");
...@@ -512,6 +571,50 @@ public: ...@@ -512,6 +571,50 @@ public:
return getenv(varname.c_str()); return getenv(varname.c_str());
return defaultValue; return defaultValue;
} }
// Builds a human-readable description of cuMask.
// Returns "CUs used: (<count>) " followed by a comma-separated list of the
// selected CU indices, with consecutive indices collapsed into "first-last"
// (e.g. "CUs used: (7) 3-8,16").
std::string GetCuMaskDesc() const
{
  std::vector<std::pair<int, int>> runs;
  int used = 0;
  int runStart = -1;  // First CU of the run currently being scanned, -1 when outside a run
  int const numBits = static_cast<int>(cuMask.size()) * 32;

  for (int cu = 0; cu < numBits; cu++)
  {
    bool const isSet = ((cuMask[cu / 32] >> (cu % 32)) & 1u) != 0;
    if (isSet)
    {
      used++;
      if (runStart < 0) runStart = cu;
    }
    else if (runStart >= 0)
    {
      // A cleared bit terminates the current run at the previous CU
      runs.push_back(std::make_pair(runStart, cu - 1));
      runStart = -1;
    }
  }

  // A run that extends to the very last bit has no cleared bit after it to
  // terminate it, so it must be recorded here or it would be missing from the output
  if (runStart >= 0)
    runs.push_back(std::make_pair(runStart, numBits - 1));

  std::string result = "CUs used: (" + std::to_string(used) + ") ";
  for (size_t i = 0; i < runs.size(); i++)
  {
    if (i) result += ",";
    if (runs[i].first == runs[i].second) result += std::to_string(runs[i].first);
    else result += std::to_string(runs[i].first) + "-" + std::to_string(runs[i].second);
  }
  return result;
}
}; };
#endif #endif
...@@ -41,8 +41,18 @@ struct SubExecParam ...@@ -41,8 +41,18 @@ struct SubExecParam
float* dst[MAX_DSTS]; // Destination array pointers float* dst[MAX_DSTS]; // Destination array pointers
long long startCycle; // Start timestamp for in-kernel timing (GPU-GFX executor) long long startCycle; // Start timestamp for in-kernel timing (GPU-GFX executor)
long long stopCycle; // Stop timestamp for in-kernel timing (GPU-GFX executor) long long stopCycle; // Stop timestamp for in-kernel timing (GPU-GFX executor)
uint32_t hwId; // Hardware ID
}; };
// Macro for collecting HW_REG_HW_ID
// Expands to a statement that writes to p.hwId, so it may only be used where an
// object named `p` with an `hwId` field is in scope.
// - When compiling for gfx1100 / gfx1101 / gfx1102, or with NVCC, no register is
//   read and hwId is simply set to 0.
//   NOTE(review): presumably HW_REG_HW_ID is unavailable on those targets - confirm.
// - Otherwise the s_getreg_b32 instruction reads HW_REG_HW_ID into p.hwId.
// NOTE(review): only the second expansion carries its own trailing ';', so
// `__trace_hwreg();` produces an extra empty statement in that configuration.
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__NVCC__)
#define __trace_hwreg() \
p.hwId = 0
#else
#define __trace_hwreg() \
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (p.hwId));
#endif
void CpuReduceKernel(SubExecParam const& p) void CpuReduceKernel(SubExecParam const& p)
{ {
int const& numSrcs = p.numSrcs; int const& numSrcs = p.numSrcs;
...@@ -211,6 +221,7 @@ GpuReduceKernel(SubExecParam* params) ...@@ -211,6 +221,7 @@ GpuReduceKernel(SubExecParam* params)
{ {
p.startCycle = startCycle; p.startCycle = startCycle;
p.stopCycle = wall_clock64(); p.stopCycle = wall_clock64();
__trace_hwreg();
} }
} }
......
...@@ -81,7 +81,7 @@ char const ExeTypeName[3][4] = {"CPU", "GPU", "DMA"}; ...@@ -81,7 +81,7 @@ char const ExeTypeName[3][4] = {"CPU", "GPU", "DMA"};
MemType inline CharToMemType(char const c) MemType inline CharToMemType(char const c)
{ {
char const* val = strchr(MemTypeStr, toupper(c)); char const* val = strchr(MemTypeStr, toupper(c));
if (*val) return (MemType)(val - MemTypeStr); if (val) return (MemType)(val - MemTypeStr);
printf("[ERROR] Unexpected memory type (%c)\n", c); printf("[ERROR] Unexpected memory type (%c)\n", c);
exit(1); exit(1);
} }
...@@ -89,7 +89,7 @@ MemType inline CharToMemType(char const c) ...@@ -89,7 +89,7 @@ MemType inline CharToMemType(char const c)
ExeType inline CharToExeType(char const c) ExeType inline CharToExeType(char const c)
{ {
char const* val = strchr(ExeTypeStr, toupper(c)); char const* val = strchr(ExeTypeStr, toupper(c));
if (*val) return (ExeType)(val - ExeTypeStr); if (val) return (ExeType)(val - ExeTypeStr);
printf("[ERROR] Unexpected executor type (%c)\n", c); printf("[ERROR] Unexpected executor type (%c)\n", c);
exit(1); exit(1);
} }
...@@ -120,6 +120,7 @@ struct Transfer ...@@ -120,6 +120,7 @@ struct Transfer
SubExecParam* subExecParamGpuPtr; // Pointer to GPU copy of subExecParam SubExecParam* subExecParamGpuPtr; // Pointer to GPU copy of subExecParam
std::vector<double> perIterationTime; // Per-iteration timing std::vector<double> perIterationTime; // Per-iteration timing
std::vector<std::set<int>> perIterationCUs; // Per-iteration CU usage
// Prepares src/dst subarray pointers for each SubExecutor // Prepares src/dst subarray pointers for each SubExecutor
void PrepareSubExecParams(EnvVars const& ev); void PrepareSubExecParams(EnvVars const& ev);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment