Unverified Commit 9ab74205 authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

Adding SHOW_ITERATIONS to provide additional per-iteration timing info (#50)

parent b0e6ccaf
# Changelog for TransferBench # Changelog for TransferBench
## v1.26
### Added
- Setting SHOW_ITERATIONS=1 provides additional information about per-iteration timing for file and p2p configs
- For file configs, iterations are sorted from min to max bandwidth and displayed with standard deviation
- For p2p configs, min/max/standard deviation is shown for each direction
### Changed
- P2P benchmark formatting changed. Now reports bidirectional bandwidth in each direction (as well as sum) for clarity
## v1.25 ## v1.25
### Fixed ### Fixed
- Fixed bug in P2P bidirectional benchmark using incorrect number of subExecutors for CPU<->GPU tests - Fixed bug in P2P bidirectional benchmark using incorrect number of subExecutors for CPU<->GPU tests
......
...@@ -445,6 +445,33 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -445,6 +445,33 @@ void ExecuteTransfers(EnvVars const& ev,
ExeTypeName[transfer->exeType], transfer->exeIndex, ExeTypeName[transfer->exeType], transfer->exeIndex,
transfer->numSubExecs, transfer->numSubExecs,
transfer->DstToStr().c_str()); transfer->DstToStr().c_str());
if (ev.showIterations)
{
std::set<std::pair<double, int>> times;
double stdDevTime = 0;
double stdDevBw = 0;
for (int i = 0; i < numTimedIterations; i++)
{
times.insert(std::make_pair(transfer->perIterationTime[i], i+1));
double const varTime = fabs(transferDurationMsec - transfer->perIterationTime[i]);
stdDevTime += varTime * varTime;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transfer->perIterationTime[i] * 1000.0f;
double const varBw = fabs(iterBandwidthGbs - transferBandwidthGbs);
stdDevBw += varBw * varBw;
}
stdDevTime = sqrt(stdDevTime / numTimedIterations);
stdDevBw = sqrt(stdDevBw / numTimedIterations);
for (auto t : times)
{
double iterDurationMsec = t.first;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d | %7.3f GB/s | %8.3f ms |\n", t.second, iterBandwidthGbs, iterDurationMsec);
}
printf(" StandardDev | %7.3f GB/s | %8.3f ms |\n", stdDevBw, stdDevTime);
}
} }
else else
{ {
...@@ -488,6 +515,33 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -488,6 +515,33 @@ void ExecuteTransfers(EnvVars const& ev,
ExeTypeName[transfer->exeType], transfer->exeIndex, ExeTypeName[transfer->exeType], transfer->exeIndex,
transfer->numSubExecs, transfer->numSubExecs,
transfer->DstToStr().c_str()); transfer->DstToStr().c_str());
if (ev.showIterations)
{
std::set<std::pair<double, int>> times;
double stdDevTime = 0;
double stdDevBw = 0;
for (int i = 0; i < numTimedIterations; i++)
{
times.insert(std::make_pair(transfer->perIterationTime[i], i+1));
double const varTime = fabs(transferDurationMsec - transfer->perIterationTime[i]);
stdDevTime += varTime * varTime;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transfer->perIterationTime[i] * 1000.0f;
double const varBw = fabs(iterBandwidthGbs - transferBandwidthGbs);
stdDevBw += varBw * varBw;
}
stdDevTime = sqrt(stdDevTime / numTimedIterations);
stdDevBw = sqrt(stdDevBw / numTimedIterations);
for (auto t : times)
{
double iterDurationMsec = t.first;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d | %7.3f GB/s | %8.3f ms |\n", t.second, iterBandwidthGbs, iterDurationMsec);
}
printf(" StandardDev | %7.3f GB/s | %8.3f ms |\n", stdDevBw, stdDevTime);
}
} }
else else
{ {
...@@ -1184,12 +1238,16 @@ void RunTransfer(EnvVars const& ev, int const iteration, ...@@ -1184,12 +1238,16 @@ void RunTransfer(EnvVars const& ev, int const iteration,
int const wallClockRate = GetWallClockRate(exeIndex); int const wallClockRate = GetWallClockRate(exeIndex);
double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate); double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate);
currTransfer->transferTime += iterationTimeMs; currTransfer->transferTime += iterationTimeMs;
if (ev.showIterations)
currTransfer->perIterationTime.push_back(iterationTimeMs);
} }
exeInfo.totalTime += gpuDeltaMsec; exeInfo.totalTime += gpuDeltaMsec;
} }
else else
{ {
transfer->transferTime += gpuDeltaMsec; transfer->transferTime += gpuDeltaMsec;
if (ev.showIterations)
transfer->perIterationTime.push_back(gpuDeltaMsec);
} }
} }
} }
...@@ -1224,6 +1282,8 @@ void RunTransfer(EnvVars const& ev, int const iteration, ...@@ -1224,6 +1282,8 @@ void RunTransfer(EnvVars const& ev, int const iteration,
float gpuDeltaMsec; float gpuDeltaMsec;
HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent)); HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
transfer->transferTime += gpuDeltaMsec; transfer->transferTime += gpuDeltaMsec;
if (ev.showIterations)
transfer->perIterationTime.push_back(gpuDeltaMsec);
} }
} }
else if (transfer->exeType == EXE_CPU) // CPU execution agent else if (transfer->exeType == EXE_CPU) // CPU execution agent
...@@ -1252,7 +1312,12 @@ void RunTransfer(EnvVars const& ev, int const iteration, ...@@ -1252,7 +1312,12 @@ void RunTransfer(EnvVars const& ev, int const iteration,
// Record time if not a warmup iteration // Record time if not a warmup iteration
if (iteration >= 0) if (iteration >= 0)
transfer->transferTime += (std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0); {
double const delta = (std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0);
transfer->transferTime += delta;
if (ev.showIterations)
transfer->perIterationTime.push_back(delta);
}
} }
} }
...@@ -1260,6 +1325,9 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N) ...@@ -1260,6 +1325,9 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
{ {
ev.DisplayP2PBenchmarkEnvVars(); ev.DisplayP2PBenchmarkEnvVars();
char const separator = ev.outputToCsv ? ',' : ' ';
printf("Bytes Per Direction%c%lu\n", separator, N * sizeof(float));
// Collect the number of available CPUs/GPUs on this machine // Collect the number of available CPUs/GPUs on this machine
int const numCpus = ev.numCpuDevices; int const numCpus = ev.numCpuDevices;
int const numGpus = ev.numGpuDevices; int const numGpus = ev.numGpuDevices;
...@@ -1273,29 +1341,37 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N) ...@@ -1273,29 +1341,37 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
// Perform unidirectional / bidirectional // Perform unidirectional / bidirectional
for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++) for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++)
{ {
printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n", isBidirectional ? "Bi" : "Uni",
ev.useRemoteRead ? "Remote" : "Local",
ev.useRemoteRead ? "Local" : "Remote",
ev.useDmaCopy ? "DMA" : "GFX");
// Print header // Print header
if (!ev.outputToCsv) if (isBidirectional)
{ {
printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n", isBidirectional ? "Bi" : "Uni", printf("%12s", "SRC\\DST");
ev.useRemoteRead ? "Remote" : "Local", }
ev.useRemoteRead ? "Local" : "Remote", else
ev.useDmaCopy ? "DMA" : "GFX"); {
if (ev.useRemoteRead)
if (isBidirectional) printf("%12s", "SRC\\EXE+DST");
{
printf("%12s", "SRC\\DST");
}
else else
{ printf("%12s", "SRC+EXE\\DST");
if (ev.useRemoteRead) }
printf("%12s", "SRC\\EXE+DST"); if (ev.outputToCsv) printf(",");
else for (int i = 0; i < numCpus; i++)
printf("%12s", "SRC+EXE\\DST"); {
} printf("%7s %02d", "CPU", i);
for (int i = 0; i < numCpus; i++) printf("%7s %02d", "CPU", i); if (ev.outputToCsv) printf(",");
for (int i = 0; i < numGpus; i++) printf("%7s %02d", "GPU", i); }
printf("\n"); for (int i = 0; i < numGpus; i++)
{
printf("%7s %02d", "GPU", i);
if (ev.outputToCsv) printf(",");
} }
printf("\n");
ExeType const gpuExeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
// Loop over all possible src/dst pairs // Loop over all possible src/dst pairs
for (int src = 0; src < numDevices; src++) for (int src = 0; src < numDevices; src++)
...@@ -1303,38 +1379,193 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N) ...@@ -1303,38 +1379,193 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
MemType const srcType = (src < numCpus ? MEM_CPU : MEM_GPU); MemType const srcType = (src < numCpus ? MEM_CPU : MEM_GPU);
int const srcIndex = (srcType == MEM_CPU ? src : src - numCpus); int const srcIndex = (srcType == MEM_CPU ? src : src - numCpus);
if (!ev.outputToCsv) std::vector<std::vector<double>> avgBandwidth(isBidirectional + 1);
printf("%9s %02d", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex); std::vector<std::vector<double>> minBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> maxBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> stdDev(isBidirectional + 1);
for (int dst = 0; dst < numDevices; dst++) for (int dst = 0; dst < numDevices; dst++)
{ {
MemType const dstType = (dst < numCpus ? MEM_CPU : MEM_GPU); MemType const dstType = (dst < numCpus ? MEM_CPU : MEM_GPU);
int const dstIndex = (dstType == MEM_CPU ? dst : dst - numCpus); int const dstIndex = (dstType == MEM_CPU ? dst : dst - numCpus);
double bandwidth = GetPeakBandwidth(ev, N, isBidirectional, srcType, srcIndex, dstType, dstIndex); // Prepare Transfers
if (!ev.outputToCsv) std::vector<Transfer> transfers(isBidirectional + 1);
// SRC -> DST
transfers[0].numBytes = N * sizeof(float);
transfers[0].srcType.push_back(srcType);
transfers[0].dstType.push_back(dstType);
transfers[0].srcIndex.push_back(srcIndex);
transfers[0].dstIndex.push_back(dstIndex);
transfers[0].numSrcs = transfers[0].numDsts = 1;
transfers[0].exeType = IsGpuType(ev.useRemoteRead ? dstType : srcType) ? gpuExeType : EXE_CPU;
transfers[0].exeIndex = (ev.useRemoteRead ? dstIndex : srcIndex);
transfers[0].numSubExecs = IsGpuType(transfers[0].exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs;
// DST -> SRC
if (isBidirectional)
{
transfers[1].numBytes = N * sizeof(float);
transfers[1].numSrcs = transfers[1].numDsts = 1;
transfers[1].srcType.push_back(dstType);
transfers[1].dstType.push_back(srcType);
transfers[1].srcIndex.push_back(dstIndex);
transfers[1].dstIndex.push_back(srcIndex);
transfers[1].exeType = IsGpuType(ev.useRemoteRead ? srcType : dstType) ? gpuExeType : EXE_CPU;
transfers[1].exeIndex = (ev.useRemoteRead ? srcIndex : dstIndex);
transfers[1].numSubExecs = IsGpuType(transfers[1].exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs;
}
bool skipTest = false;
// Abort if executing on NUMA node with no CPUs
for (int i = 0; i <= isBidirectional; i++)
{ {
if (bandwidth == 0) if (transfers[i].exeType == EXE_CPU && ev.numCpusPerNuma[transfers[i].exeIndex] == 0)
{
skipTest = true;
break;
}
#if defined(__NVCC__)
// NVIDIA platform cannot access GPU memory directly from CPU executors
if (transfers[i].exeType == EXE_CPU && (IsGpuType(srcType) || IsGpuType(dstType)))
{
skipTest = true;
break;
}
#endif
}
if (isBidirectional && srcType == dstType && srcIndex == dstIndex) skipTest = true;
if (!skipTest)
{
ExecuteTransfers(ev, 0, N, transfers, false);
for (int dir = 0; dir <= isBidirectional; dir++)
{
double const avgTime = transfers[dir].transferTime / ev.numIterations;
double const avgBw = (transfers[dir].numBytesActual / 1.0E9) / avgTime * 1000.0f;
avgBandwidth[dir].push_back(avgBw);
if (ev.showIterations)
{
double minTime = transfers[dir].perIterationTime[0];
double maxTime = transfers[dir].perIterationTime[0];
double varSum = 0;
for (int i = 0; i < transfers[dir].perIterationTime.size(); i++)
{
minTime = std::min(minTime, transfers[dir].perIterationTime[i]);
maxTime = std::max(maxTime, transfers[dir].perIterationTime[i]);
double const bw = (transfers[dir].numBytesActual / 1.0E9) / transfers[dir].perIterationTime[i] * 1000.0f;
double const delta = (avgBw - bw);
varSum += delta * delta;
}
double const minBw = (transfers[dir].numBytesActual / 1.0E9) / maxTime * 1000.0f;
double const maxBw = (transfers[dir].numBytesActual / 1.0E9) / minTime * 1000.0f;
double const stdev = sqrt(varSum / transfers[dir].perIterationTime.size());
minBandwidth[dir].push_back(minBw);
maxBandwidth[dir].push_back(maxBw);
stdDev[dir].push_back(stdev);
}
}
}
else
{
for (int dir = 0; dir <= isBidirectional; dir++)
{
avgBandwidth[dir].push_back(0);
minBandwidth[dir].push_back(0);
maxBandwidth[dir].push_back(0);
stdDev[dir].push_back(-1.0);
}
}
}
for (int dir = 0; dir <= isBidirectional; dir++)
{
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, dir ? "<- " : " ->");
if (ev.outputToCsv) printf(",");
for (int dst = 0; dst < numDevices; dst++)
{
double const avgBw = avgBandwidth[dir][dst];
if (avgBw == 0.0)
printf("%10s", "N/A"); printf("%10s", "N/A");
else else
printf("%10.2f", bandwidth); printf("%10.2f", avgBw);
if (ev.outputToCsv) printf(",");
} }
else printf("\n");
if (ev.showIterations)
{ {
printf("%s %02d,%s %02d,%s,%s,%s,%.2f,%lu\n", // minBw
srcType == MEM_CPU ? "CPU" : "GPU", srcIndex, printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "min");
dstType == MEM_CPU ? "CPU" : "GPU", dstIndex, if (ev.outputToCsv) printf(",");
isBidirectional ? "bidirectional" : "unidirectional",
ev.useRemoteRead ? "Remote" : "Local", for (int i = 0; i < numDevices; i++)
ev.useDmaCopy ? "DMA" : "GFX", {
bandwidth, double const minBw = minBandwidth[dir][i];
N * sizeof(float)); if (minBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", minBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
// maxBw
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "max");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++)
{
double const maxBw = maxBandwidth[dir][i];
if (maxBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", maxBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
// stddev
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, " sd");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++)
{
double const sd = stdDev[dir][i];
if (sd == -1.0)
printf("%10s", "N/A");
else
printf("%10.2f", sd);
if (ev.outputToCsv) printf(",");
}
printf("\n");
} }
fflush(stdout); fflush(stdout);
} }
if (!ev.outputToCsv) printf("\n");
if (isBidirectional)
{
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "<->");
if (ev.outputToCsv) printf(",");
for (int dst = 0; dst < numDevices; dst++)
{
double const sumBw = avgBandwidth[0][dst] + avgBandwidth[1][dst];
if (sumBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", sumBw);
if (ev.outputToCsv) printf(",");
}
if (src < numDevices - 1) printf("\n\n");
}
} }
if (!ev.outputToCsv) printf("\n"); printf("\n");
} }
} }
...@@ -1475,70 +1706,6 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i ...@@ -1475,70 +1706,6 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
printf("Aggregate bandwidth (CPU Timed): %7.2f\n", totalBandwidthCpu); printf("Aggregate bandwidth (CPU Timed): %7.2f\n", totalBandwidthCpu);
} }
// Measures peak aggregate bandwidth (GB/s) between a (src, dst) device pair.
// Returns 0 when the pairing cannot be tested: bidirectional copy onto the
// same device, a CPU executor pinned to a NUMA node with no CPUs, or (on
// NVIDIA) a CPU executor touching GPU memory.
double GetPeakBandwidth(EnvVars const& ev, size_t const N,
                        int const isBidirectional,
                        MemType const srcType, int const srcIndex,
                        MemType const dstType, int const dstIndex)
{
  // A device cannot run a bidirectional test against itself
  if (isBidirectional && srcType == dstType && srcIndex == dstIndex) return 0.0f;

  int const numDirections = isBidirectional + 1;
  ExeType const gpuExeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;

  // Build one Transfer per direction: dir 0 is SRC->DST, dir 1 (bidirectional
  // only) is DST->SRC.  USE_REMOTE_READ selects which endpoint executes:
  // either (local read + remote write) or (remote read + local write).
  std::vector<Transfer> transfers(numDirections);
  for (int dir = 0; dir < numDirections; dir++)
  {
    Transfer& t = transfers[dir];
    MemType const readType  = (dir == 0 ? srcType  : dstType);
    int     const readIdx   = (dir == 0 ? srcIndex : dstIndex);
    MemType const writeType = (dir == 0 ? dstType  : srcType);
    int     const writeIdx  = (dir == 0 ? dstIndex : srcIndex);

    t.numBytes = N * sizeof(float);
    t.numSrcs  = t.numDsts = 1;
    t.srcType.push_back(readType);
    t.srcIndex.push_back(readIdx);
    t.dstType.push_back(writeType);
    t.dstIndex.push_back(writeIdx);
    t.exeType     = IsGpuType(ev.useRemoteRead ? writeType : readType) ? gpuExeType : EXE_CPU;
    t.exeIndex    = (ev.useRemoteRead ? writeIdx : readIdx);
    t.numSubExecs = IsGpuType(t.exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs;

    // Abort if executing on a NUMA node with no CPUs
    if (t.exeType == EXE_CPU && ev.numCpusPerNuma[t.exeIndex] == 0)
      return 0;
#if defined(__NVCC__)
    // NVIDIA platform cannot access GPU memory directly from CPU executors
    if (t.exeType == EXE_CPU && (IsGpuType(srcType) || IsGpuType(dstType)))
      return 0;
#endif
  }

  ExecuteTransfers(ev, 0, N, transfers, false);

  // Aggregate bandwidth is the sum of each direction's average bandwidth
  double totalBandwidth = 0;
  for (Transfer const& t : transfers)
  {
    double const avgDurationMsec = t.transferTime / (1.0 * ev.numIterations);
    totalBandwidth += (t.numBytesActual / 1.0E9) / avgDurationMsec * 1000.0f;
  }
  return totalBandwidth;
}
void Transfer::PrepareSubExecParams(EnvVars const& ev) void Transfer::PrepareSubExecParams(EnvVars const& ev)
{ {
// Each subExecutor needs to know src/dst pointers and how many elements to transfer // Each subExecutor needs to know src/dst pointers and how many elements to transfer
...@@ -1582,6 +1749,7 @@ void Transfer::PrepareSubExecParams(EnvVars const& ev) ...@@ -1582,6 +1749,7 @@ void Transfer::PrepareSubExecParams(EnvVars const& ev)
} }
this->transferTime = 0.0; this->transferTime = 0.0;
this->perIterationTime.clear();
} }
void Transfer::PrepareReference(EnvVars const& ev, std::vector<float>& buffer, int bufferIdx) void Transfer::PrepareReference(EnvVars const& ev, std::vector<float>& buffer, int bufferIdx)
......
...@@ -29,7 +29,7 @@ THE SOFTWARE. ...@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp" #include "Compatibility.hpp"
#include "Kernels.hpp" #include "Kernels.hpp"
#define TB_VERSION "1.25" #define TB_VERSION "1.26"
extern char const MemTypeStr[]; extern char const MemTypeStr[];
extern char const ExeTypeStr[]; extern char const ExeTypeStr[];
...@@ -75,6 +75,7 @@ public: ...@@ -75,6 +75,7 @@ public:
int outputToCsv; // Output in CSV format int outputToCsv; // Output in CSV format
int samplingFactor; // Affects how many different values of N are generated (when N set to 0) int samplingFactor; // Affects how many different values of N are generated (when N set to 0)
int sharedMemBytes; // Amount of shared memory to use per threadblock int sharedMemBytes; // Amount of shared memory to use per threadblock
int showIterations; // Show per-iteration timing info
int useInteractive; // Pause for user-input before starting transfer loop int useInteractive; // Pause for user-input before starting transfer loop
int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device
int usePrepSrcKernel; // Use GPU kernel to prepare source data instead of copy (can't be used with fillPattern) int usePrepSrcKernel; // Use GPU kernel to prepare source data instead of copy (can't be used with fillPattern)
...@@ -155,6 +156,7 @@ public: ...@@ -155,6 +156,7 @@ public:
outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0); outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0);
samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR); samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR);
sharedMemBytes = GetEnvVar("SHARED_MEM_BYTES" , defaultSharedMemBytes); sharedMemBytes = GetEnvVar("SHARED_MEM_BYTES" , defaultSharedMemBytes);
showIterations = GetEnvVar("SHOW_ITERATIONS" , 0);
useInteractive = GetEnvVar("USE_INTERACTIVE" , 0); useInteractive = GetEnvVar("USE_INTERACTIVE" , 0);
usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0); usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0);
usePrepSrcKernel = GetEnvVar("USE_PREP_KERNEL" , 0); usePrepSrcKernel = GetEnvVar("USE_PREP_KERNEL" , 0);
...@@ -164,10 +166,10 @@ public: ...@@ -164,10 +166,10 @@ public:
gpuKernel = GetEnvVar("GPU_KERNEL" , defaultGpuKernel); gpuKernel = GetEnvVar("GPU_KERNEL" , defaultGpuKernel);
// P2P Benchmark related // P2P Benchmark related
useRemoteRead = GetEnvVar("USE_REMOTE_READ" , 0); useRemoteRead = GetEnvVar("USE_REMOTE_READ" , 0);
useDmaCopy = GetEnvVar("USE_GPU_DMA" , 0); useDmaCopy = GetEnvVar("USE_GPU_DMA" , 0);
numGpuSubExecs = GetEnvVar("NUM_GPU_SE" , useDmaCopy ? 1 : numDeviceCUs); numGpuSubExecs = GetEnvVar("NUM_GPU_SE" , useDmaCopy ? 1 : numDeviceCUs);
numCpuSubExecs = GetEnvVar("NUM_CPU_SE" , DEFAULT_P2P_NUM_CPU_SE); numCpuSubExecs = GetEnvVar("NUM_CPU_SE" , DEFAULT_P2P_NUM_CPU_SE);
// Sweep related // Sweep related
sweepMin = GetEnvVar("SWEEP_MIN" , DEFAULT_SWEEP_MIN); sweepMin = GetEnvVar("SWEEP_MIN" , DEFAULT_SWEEP_MIN);
...@@ -382,6 +384,7 @@ public: ...@@ -382,6 +384,7 @@ public:
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n"); printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n"); printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n"); printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
printf(" SHOW_ITERATIONS - Show per-iteration timing info\n");
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n"); printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n"); printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
printf(" USE_PREP_KERNEL - Use GPU kernel to initialize source data array pattern\n"); printf(" USE_PREP_KERNEL - Use GPU kernel to initialize source data array pattern\n");
...@@ -429,6 +432,8 @@ public: ...@@ -429,6 +432,8 @@ public:
std::string("Running " + std::to_string(numWarmups) + " warmup iteration(s) per Test")); std::string("Running " + std::to_string(numWarmups) + " warmup iteration(s) per Test"));
PRINT_EV("SHARED_MEM_BYTES", sharedMemBytes, PRINT_EV("SHARED_MEM_BYTES", sharedMemBytes,
std::string("Using " + std::to_string(sharedMemBytes) + " shared mem per threadblock")); std::string("Using " + std::to_string(sharedMemBytes) + " shared mem per threadblock"));
PRINT_EV("SHOW_ITERATIONS", showIterations,
std::string(showIterations ? "Showing" : "Hiding") + " per-iteration timing");
PRINT_EV("USE_INTERACTIVE", useInteractive, PRINT_EV("USE_INTERACTIVE", useInteractive,
std::string("Running in ") + (useInteractive ? "interactive" : "non-interactive") + " mode"); std::string("Running in ") + (useInteractive ? "interactive" : "non-interactive") + " mode");
PRINT_EV("USE_PCIE_INDEX", usePcieIndexing, PRINT_EV("USE_PCIE_INDEX", usePcieIndexing,
......
...@@ -119,6 +119,8 @@ struct Transfer ...@@ -119,6 +119,8 @@ struct Transfer
std::vector<SubExecParam> subExecParam; // Defines subarrays assigned to each threadblock std::vector<SubExecParam> subExecParam; // Defines subarrays assigned to each threadblock
SubExecParam* subExecParamGpuPtr; // Pointer to GPU copy of subExecParam SubExecParam* subExecParamGpuPtr; // Pointer to GPU copy of subExecParam
std::vector<double> perIterationTime; // Per-iteration timing
// Prepares src/dst subarray pointers for each SubExecutor // Prepares src/dst subarray pointers for each SubExecutor
void PrepareSubExecParams(EnvVars const& ev); void PrepareSubExecParams(EnvVars const& ev);
...@@ -187,12 +189,6 @@ void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int co ...@@ -187,12 +189,6 @@ void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int co
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExec, int const numCpuSubExec, bool const isRandom); void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExec, int const numCpuSubExec, bool const isRandom);
void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const numSubExecs); void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const numSubExecs);
// Return the maximum bandwidth measured for given (src/dst) pair
double GetPeakBandwidth(EnvVars const& ev, size_t const N,
int const isBidirectional,
MemType const srcType, int const srcIndex,
MemType const dstType, int const dstIndex);
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount); std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
int RemappedIndex(int const origIdx, bool const isCpuType); int RemappedIndex(int const origIdx, bool const isCpuType);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment