Unverified Commit 30f1c584 authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

v1.43 Minor changes to a2a output (#76)

parent 8b2cd85d
...@@ -3,6 +3,11 @@ ...@@ -3,6 +3,11 @@
Documentation for TransferBench is available at Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench). [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.43
### Changes
* Modifying a2a to show executor timing, as well as executor min/max bandwidth
## v1.42 ## v1.42
### Fixes ### Fixes
......
...@@ -581,7 +581,8 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -581,7 +581,8 @@ void ExecuteTransfers(EnvVars const& ev,
for (auto const& transfer : exeInfo.transfers) for (auto const& transfer : exeInfo.transfers)
{ {
transfer->transferTime /= (1.0 * numTimedIterations); transfer->transferTime /= (1.0 * numTimedIterations);
double transferBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transfer->transferTime * 1000.0f; transfer->transferBandwidth = (transfer->numBytesActual / 1.0E9) / transfer->transferTime * 1000.0f;
transfer->executorBandwidth = exeBandwidthGbs;
totalCUs += transfer->numSubExecs; totalCUs += transfer->numSubExecs;
if (!verbose) continue; if (!verbose) continue;
...@@ -589,7 +590,7 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -589,7 +590,7 @@ void ExecuteTransfers(EnvVars const& ev,
{ {
printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n", printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n",
transfer->transferIndex, transfer->transferIndex,
transferBandwidthGbs, transfer->transferBandwidth,
transfer->transferTime, transfer->transferTime,
transfer->numBytesActual, transfer->numBytesActual,
transfer->SrcToStr().c_str(), transfer->SrcToStr().c_str(),
...@@ -609,7 +610,7 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -609,7 +610,7 @@ void ExecuteTransfers(EnvVars const& ev,
stdDevTime += varTime * varTime; stdDevTime += varTime * varTime;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transfer->perIterationTime[i] * 1000.0f; double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transfer->perIterationTime[i] * 1000.0f;
double const varBw = fabs(iterBandwidthGbs - transferBandwidthGbs); double const varBw = fabs(iterBandwidthGbs - transfer->transferBandwidth);
stdDevBw += varBw * varBw; stdDevBw += varBw * varBw;
} }
stdDevTime = sqrt(stdDevTime / numTimedIterations); stdDevTime = sqrt(stdDevTime / numTimedIterations);
...@@ -647,7 +648,7 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -647,7 +648,7 @@ void ExecuteTransfers(EnvVars const& ev,
MemTypeStr[transfer->exeType], transfer->exeIndex, MemTypeStr[transfer->exeType], transfer->exeIndex,
transfer->DstToStr().c_str(), transfer->DstToStr().c_str(),
transfer->numSubExecs, transfer->numSubExecs,
transferBandwidthGbs, transfer->transferTime, transfer->transferBandwidth, transfer->transferTime,
PtrVectorToStr(transfer->srcMem, initOffset).c_str(), PtrVectorToStr(transfer->srcMem, initOffset).c_str(),
PtrVectorToStr(transfer->dstMem, initOffset).c_str()); PtrVectorToStr(transfer->dstMem, initOffset).c_str());
} }
...@@ -668,14 +669,15 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -668,14 +669,15 @@ void ExecuteTransfers(EnvVars const& ev,
{ {
Transfer* transfer = transferPair.second; Transfer* transfer = transferPair.second;
transfer->transferTime /= (1.0 * numTimedIterations); transfer->transferTime /= (1.0 * numTimedIterations);
double transferBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transfer->transferTime * 1000.0f; transfer->transferBandwidth = (transfer->numBytesActual / 1.0E9) / transfer->transferTime * 1000.0f;
transfer->executorBandwidth = transfer->transferBandwidth;
maxGpuTime = std::max(maxGpuTime, transfer->transferTime); maxGpuTime = std::max(maxGpuTime, transfer->transferTime);
if (!verbose) continue; if (!verbose) continue;
if (!ev.outputToCsv) if (!ev.outputToCsv)
{ {
printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n", printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n",
transfer->transferIndex, transfer->transferIndex,
transferBandwidthGbs, transfer->transferTime, transfer->transferBandwidth, transfer->transferTime,
transfer->numBytesActual, transfer->numBytesActual,
transfer->SrcToStr().c_str(), transfer->SrcToStr().c_str(),
ExeTypeName[transfer->exeType], transfer->exeIndex, ExeTypeName[transfer->exeType], transfer->exeIndex,
...@@ -694,7 +696,7 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -694,7 +696,7 @@ void ExecuteTransfers(EnvVars const& ev,
stdDevTime += varTime * varTime; stdDevTime += varTime * varTime;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transfer->perIterationTime[i] * 1000.0f; double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transfer->perIterationTime[i] * 1000.0f;
double const varBw = fabs(iterBandwidthGbs - transferBandwidthGbs); double const varBw = fabs(iterBandwidthGbs - transfer->transferBandwidth);
stdDevBw += varBw * varBw; stdDevBw += varBw * varBw;
} }
stdDevTime = sqrt(stdDevTime / numTimedIterations); stdDevTime = sqrt(stdDevTime / numTimedIterations);
...@@ -731,7 +733,7 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -731,7 +733,7 @@ void ExecuteTransfers(EnvVars const& ev,
ExeTypeName[transfer->exeType], transfer->exeIndex, ExeTypeName[transfer->exeType], transfer->exeIndex,
transfer->DstToStr().c_str(), transfer->DstToStr().c_str(),
transfer->numSubExecs, transfer->numSubExecs,
transferBandwidthGbs, transfer->transferTime, transfer->transferBandwidth, transfer->transferTime,
PtrVectorToStr(transfer->srcMem, initOffset).c_str(), PtrVectorToStr(transfer->srcMem, initOffset).c_str(),
PtrVectorToStr(transfer->dstMem, initOffset).c_str()); PtrVectorToStr(transfer->dstMem, initOffset).c_str());
} }
...@@ -1849,20 +1851,21 @@ void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int co ...@@ -1849,20 +1851,21 @@ void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int co
char separator = (ev.outputToCsv ? ',' : ' '); char separator = (ev.outputToCsv ? ',' : ' ');
std::vector<Transfer> transfers(1); std::vector<Transfer> transfers(1);
transfers[0].numBytes = N * sizeof(float); Transfer& t = transfers[0];
transfers[0].numSrcs = 1; t.numBytes = N * sizeof(float);
transfers[0].numDsts = 1; t.numSrcs = 1;
transfers[0].exeType = EXE_GPU_GFX; t.numDsts = 1;
transfers[0].exeIndex = exeIndex; t.exeType = EXE_GPU_GFX;
transfers[0].exeSubIndex = -1; t.exeIndex = exeIndex;
transfers[0].srcType.resize(1, MEM_GPU); t.exeSubIndex = -1;
transfers[0].dstType.resize(1, MEM_GPU); t.srcType.resize(1, MEM_GPU);
transfers[0].srcIndex.resize(1); t.dstType.resize(1, MEM_GPU);
transfers[0].dstIndex.resize(1); t.srcIndex.resize(1);
t.dstIndex.resize(1);
printf("GPU-GFX Scaling benchmark:\n"); printf("GPU-GFX Scaling benchmark:\n");
printf("==========================\n"); printf("==========================\n");
printf("- Copying %lu bytes from GPU %d to other devices\n", transfers[0].numBytes, exeIndex); printf("- Copying %lu bytes from GPU %d to other devices\n", t.numBytes, exeIndex);
printf("- All numbers reported as GB/sec\n\n"); printf("- All numbers reported as GB/sec\n\n");
printf("NumCUs"); printf("NumCUs");
...@@ -1873,21 +1876,20 @@ void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int co ...@@ -1873,21 +1876,20 @@ void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int co
std::vector<std::pair<double, int>> bestResult(numDevices); std::vector<std::pair<double, int>> bestResult(numDevices);
for (int numSubExec = 1; numSubExec <= maxSubExecs; numSubExec++) for (int numSubExec = 1; numSubExec <= maxSubExecs; numSubExec++)
{ {
transfers[0].numSubExecs = numSubExec; t.numSubExecs = numSubExec;
printf("%4d ", numSubExec); printf("%4d ", numSubExec);
for (int i = 0; i < numDevices; i++) for (int i = 0; i < numDevices; i++)
{ {
transfers[0].dstType[0] = i < numCpus ? MEM_CPU : MEM_GPU; t.dstType[0] = i < numCpus ? MEM_CPU : MEM_GPU;
transfers[0].dstIndex[0] = i < numCpus ? i : i - numCpus; t.dstIndex[0] = i < numCpus ? i : i - numCpus;
ExecuteTransfers(ev, 0, N, transfers, false); ExecuteTransfers(ev, 0, N, transfers, false);
double transferBandwidthGbs = (transfers[0].numBytesActual / 1.0E9) / transfers[0].transferTime * 1000.0f; printf("%c%7.2f ", separator, t.transferBandwidth);
printf("%c%7.2f ", separator, transferBandwidthGbs);
if (transferBandwidthGbs > bestResult[i].first) if (t.transferBandwidth > bestResult[i].first)
{ {
bestResult[i].first = transferBandwidthGbs; bestResult[i].first = t.transferBandwidth;
bestResult[i].second = numSubExec; bestResult[i].second = numSubExec;
} }
} }
...@@ -1960,14 +1962,14 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i ...@@ -1960,14 +1962,14 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
if (transfers.size() == 0) return; if (transfers.size() == 0) return;
double totalBandwidthCpu = 0; double totalBandwidthCpu = 0;
ExecuteTransfers(ev, 0, numBytesPerTransfer / sizeof(float), transfers, true, &totalBandwidthCpu); ExecuteTransfers(ev, 0, numBytesPerTransfer / sizeof(float), transfers, !ev.hideEnv, &totalBandwidthCpu);
printf("\nSummary:\n"); printf("\nSummary:\n");
printf("==========================================================\n"); printf("==========================================================\n");
printf("SRC\\DST"); printf("SRC\\DST ");
for (int dst = 0; dst < numGpus; dst++) for (int dst = 0; dst < numGpus; dst++)
printf("%cGPU %02d ", separator, dst); printf("%cGPU %02d ", separator, dst);
printf(" %cSTotal\n", separator); printf(" %cSTotal %cActual\n", separator, separator);
std::map<std::pair<int, int>, int> reIndex; std::map<std::pair<int, int>, int> reIndex;
for (int i = 0; i < transfers.size(); i++) for (int i = 0; i < transfers.size(); i++)
...@@ -1977,41 +1979,47 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i ...@@ -1977,41 +1979,47 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
} }
double totalBandwidthGpu = 0.0; double totalBandwidthGpu = 0.0;
double minExecutorBandwidth = std::numeric_limits<double>::max();
double maxExecutorBandwidth = 0.0;
std::vector<double> colTotalBandwidth(numGpus+1, 0.0); std::vector<double> colTotalBandwidth(numGpus+1, 0.0);
for (int src = 0; src < numGpus; src++) for (int src = 0; src < numGpus; src++)
{ {
double rowTotalBandwidth = 0; double rowTotalBandwidth = 0;
double executorBandwidth = 0;
printf("GPU %02d", src); printf("GPU %02d", src);
for (int dst = 0; dst < numGpus; dst++) for (int dst = 0; dst < numGpus; dst++)
{ {
if (reIndex.count(std::make_pair(src, dst))) if (reIndex.count(std::make_pair(src, dst)))
{ {
Transfer const& transfer = transfers[reIndex[std::make_pair(src,dst)]]; Transfer const& transfer = transfers[reIndex[std::make_pair(src,dst)]];
double transferBandwidthGbs = (transfer.numBytesActual / 1.0E9) / transfer.transferTime * 1000.0f; colTotalBandwidth[dst] += transfer.transferBandwidth;
colTotalBandwidth[dst] += transferBandwidthGbs; rowTotalBandwidth += transfer.transferBandwidth;
rowTotalBandwidth += transferBandwidthGbs; totalBandwidthGpu += transfer.transferBandwidth;
totalBandwidthGpu += transferBandwidthGbs; executorBandwidth = std::max(executorBandwidth, transfer.executorBandwidth);
printf("%c%7.2f ", separator, transferBandwidthGbs); printf("%c%8.3f ", separator, transfer.transferBandwidth);
} }
else else
{ {
printf("%c%7s ", separator, "N/A"); printf("%c%8s ", separator, "N/A");
} }
} }
printf(" %c%7.2f\n", separator, rowTotalBandwidth); printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator, executorBandwidth);
minExecutorBandwidth = std::min(minExecutorBandwidth, executorBandwidth);
maxExecutorBandwidth = std::max(maxExecutorBandwidth, executorBandwidth);
colTotalBandwidth[numGpus] += rowTotalBandwidth; colTotalBandwidth[numGpus] += rowTotalBandwidth;
} }
printf("\nRTotal"); printf("\nRTotal");
for (int dst = 0; dst < numGpus; dst++) for (int dst = 0; dst < numGpus; dst++)
{ {
printf("%c%7.2f ", separator, colTotalBandwidth[dst]); printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
} }
printf(" %c%7.2f\n", separator, colTotalBandwidth[numGpus]); printf(" %c%8.3f %c%8.3f %c%8.3f\n", separator, colTotalBandwidth[numGpus],
separator, minExecutorBandwidth, separator, maxExecutorBandwidth);
printf("\n"); printf("\n");
printf("Average bandwidth (GPU Timed): %7.2f GB/s\n", totalBandwidthGpu / transfers.size()); printf("Average bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu / transfers.size());
printf("Aggregate bandwidth (GPU Timed): %7.2f GB/s\n", totalBandwidthGpu); printf("Aggregate bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu);
printf("Aggregate bandwidth (CPU Timed): %7.2f GB/s\n", totalBandwidthCpu); printf("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", totalBandwidthCpu);
} }
void Transfer::PrepareSubExecParams(EnvVars const& ev) void Transfer::PrepareSubExecParams(EnvVars const& ev)
......
...@@ -29,7 +29,7 @@ THE SOFTWARE. ...@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp" #include "Compatibility.hpp"
#include "Kernels.hpp" #include "Kernels.hpp"
#define TB_VERSION "1.42" #define TB_VERSION "1.43"
extern char const MemTypeStr[]; extern char const MemTypeStr[];
extern char const ExeTypeStr[]; extern char const ExeTypeStr[];
......
...@@ -115,7 +115,9 @@ struct Transfer ...@@ -115,7 +115,9 @@ struct Transfer
// Outputs // Outputs
size_t numBytesActual; // Actual number of bytes to copy size_t numBytesActual; // Actual number of bytes to copy
double transferTime; // Time taken in milliseconds double transferTime; // Time taken in milliseconds for this transfer
double transferBandwidth; // Transfer bandwidth (GB/s)
double executorBandwidth; // Executor bandwidth (GB/s)
std::vector<double> perIterationTime; // Per-iteration timing std::vector<double> perIterationTime; // Per-iteration timing
std::vector<std::set<std::pair<int,int>>> perIterationCUs; // Per-iteration CU usage std::vector<std::set<std::pair<int,int>>> perIterationCUs; // Per-iteration CU usage
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment