Unverified Commit 9ab74205 authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

Adding SHOW_ITERATIONS to provide additional per-iteration timing info (#50)

parent b0e6ccaf
# Changelog for TransferBench # Changelog for TransferBench
## v1.26
### Added
- Setting SHOW_ITERATIONS=1 provides additional information about per-iteration timing for file and p2p configs
- For file configs, iterations are sorted from min to max bandwidth and displayed with standard deviation
- For p2p configs, min/max/standard deviation is shown for each direction
### Changed
- P2P benchmark formatting changed. Now reports bidirectional bandwidth in each direction (as well as sum) for clarity
## v1.25 ## v1.25
### Fixed ### Fixed
- Fixed bug in P2P bidirectional benchmark using incorrect number of subExecutors for CPU<->GPU tests - Fixed bug in P2P bidirectional benchmark using incorrect number of subExecutors for CPU<->GPU tests
......
...@@ -445,6 +445,33 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -445,6 +445,33 @@ void ExecuteTransfers(EnvVars const& ev,
ExeTypeName[transfer->exeType], transfer->exeIndex, ExeTypeName[transfer->exeType], transfer->exeIndex,
transfer->numSubExecs, transfer->numSubExecs,
transfer->DstToStr().c_str()); transfer->DstToStr().c_str());
if (ev.showIterations)
{
std::set<std::pair<double, int>> times;
double stdDevTime = 0;
double stdDevBw = 0;
for (int i = 0; i < numTimedIterations; i++)
{
times.insert(std::make_pair(transfer->perIterationTime[i], i+1));
double const varTime = fabs(transferDurationMsec - transfer->perIterationTime[i]);
stdDevTime += varTime * varTime;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transfer->perIterationTime[i] * 1000.0f;
double const varBw = fabs(iterBandwidthGbs - transferBandwidthGbs);
stdDevBw += varBw * varBw;
}
stdDevTime = sqrt(stdDevTime / numTimedIterations);
stdDevBw = sqrt(stdDevBw / numTimedIterations);
for (auto t : times)
{
double iterDurationMsec = t.first;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d | %7.3f GB/s | %8.3f ms |\n", t.second, iterBandwidthGbs, iterDurationMsec);
}
printf(" StandardDev | %7.3f GB/s | %8.3f ms |\n", stdDevBw, stdDevTime);
}
} }
else else
{ {
...@@ -488,6 +515,33 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -488,6 +515,33 @@ void ExecuteTransfers(EnvVars const& ev,
ExeTypeName[transfer->exeType], transfer->exeIndex, ExeTypeName[transfer->exeType], transfer->exeIndex,
transfer->numSubExecs, transfer->numSubExecs,
transfer->DstToStr().c_str()); transfer->DstToStr().c_str());
if (ev.showIterations)
{
std::set<std::pair<double, int>> times;
double stdDevTime = 0;
double stdDevBw = 0;
for (int i = 0; i < numTimedIterations; i++)
{
times.insert(std::make_pair(transfer->perIterationTime[i], i+1));
double const varTime = fabs(transferDurationMsec - transfer->perIterationTime[i]);
stdDevTime += varTime * varTime;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transfer->perIterationTime[i] * 1000.0f;
double const varBw = fabs(iterBandwidthGbs - transferBandwidthGbs);
stdDevBw += varBw * varBw;
}
stdDevTime = sqrt(stdDevTime / numTimedIterations);
stdDevBw = sqrt(stdDevBw / numTimedIterations);
for (auto t : times)
{
double iterDurationMsec = t.first;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d | %7.3f GB/s | %8.3f ms |\n", t.second, iterBandwidthGbs, iterDurationMsec);
}
printf(" StandardDev | %7.3f GB/s | %8.3f ms |\n", stdDevBw, stdDevTime);
}
} }
else else
{ {
...@@ -1184,12 +1238,16 @@ void RunTransfer(EnvVars const& ev, int const iteration, ...@@ -1184,12 +1238,16 @@ void RunTransfer(EnvVars const& ev, int const iteration,
int const wallClockRate = GetWallClockRate(exeIndex); int const wallClockRate = GetWallClockRate(exeIndex);
double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate); double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate);
currTransfer->transferTime += iterationTimeMs; currTransfer->transferTime += iterationTimeMs;
if (ev.showIterations)
currTransfer->perIterationTime.push_back(iterationTimeMs);
} }
exeInfo.totalTime += gpuDeltaMsec; exeInfo.totalTime += gpuDeltaMsec;
} }
else else
{ {
transfer->transferTime += gpuDeltaMsec; transfer->transferTime += gpuDeltaMsec;
if (ev.showIterations)
transfer->perIterationTime.push_back(gpuDeltaMsec);
} }
} }
} }
...@@ -1224,6 +1282,8 @@ void RunTransfer(EnvVars const& ev, int const iteration, ...@@ -1224,6 +1282,8 @@ void RunTransfer(EnvVars const& ev, int const iteration,
float gpuDeltaMsec; float gpuDeltaMsec;
HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent)); HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
transfer->transferTime += gpuDeltaMsec; transfer->transferTime += gpuDeltaMsec;
if (ev.showIterations)
transfer->perIterationTime.push_back(gpuDeltaMsec);
} }
} }
else if (transfer->exeType == EXE_CPU) // CPU execution agent else if (transfer->exeType == EXE_CPU) // CPU execution agent
...@@ -1252,7 +1312,12 @@ void RunTransfer(EnvVars const& ev, int const iteration, ...@@ -1252,7 +1312,12 @@ void RunTransfer(EnvVars const& ev, int const iteration,
// Record time if not a warmup iteration // Record time if not a warmup iteration
if (iteration >= 0) if (iteration >= 0)
transfer->transferTime += (std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0); {
double const delta = (std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0);
transfer->transferTime += delta;
if (ev.showIterations)
transfer->perIterationTime.push_back(delta);
}
} }
} }
...@@ -1260,6 +1325,9 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N) ...@@ -1260,6 +1325,9 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
{ {
ev.DisplayP2PBenchmarkEnvVars(); ev.DisplayP2PBenchmarkEnvVars();
char const separator = ev.outputToCsv ? ',' : ' ';
printf("Bytes Per Direction%c%lu\n", separator, N * sizeof(float));
// Collect the number of available CPUs/GPUs on this machine // Collect the number of available CPUs/GPUs on this machine
int const numCpus = ev.numCpuDevices; int const numCpus = ev.numCpuDevices;
int const numGpus = ev.numGpuDevices; int const numGpus = ev.numGpuDevices;
...@@ -1273,29 +1341,37 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N) ...@@ -1273,29 +1341,37 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
// Perform unidirectional / bidirectional // Perform unidirectional / bidirectional
for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++) for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++)
{ {
printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n", isBidirectional ? "Bi" : "Uni",
ev.useRemoteRead ? "Remote" : "Local",
ev.useRemoteRead ? "Local" : "Remote",
ev.useDmaCopy ? "DMA" : "GFX");
// Print header // Print header
if (!ev.outputToCsv) if (isBidirectional)
{ {
printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n", isBidirectional ? "Bi" : "Uni", printf("%12s", "SRC\\DST");
ev.useRemoteRead ? "Remote" : "Local", }
ev.useRemoteRead ? "Local" : "Remote", else
ev.useDmaCopy ? "DMA" : "GFX"); {
if (ev.useRemoteRead)
if (isBidirectional) printf("%12s", "SRC\\EXE+DST");
{
printf("%12s", "SRC\\DST");
}
else else
{ printf("%12s", "SRC+EXE\\DST");
if (ev.useRemoteRead) }
printf("%12s", "SRC\\EXE+DST"); if (ev.outputToCsv) printf(",");
else for (int i = 0; i < numCpus; i++)
printf("%12s", "SRC+EXE\\DST"); {
} printf("%7s %02d", "CPU", i);
for (int i = 0; i < numCpus; i++) printf("%7s %02d", "CPU", i); if (ev.outputToCsv) printf(",");
for (int i = 0; i < numGpus; i++) printf("%7s %02d", "GPU", i); }
printf("\n"); for (int i = 0; i < numGpus; i++)
{
printf("%7s %02d", "GPU", i);
if (ev.outputToCsv) printf(",");
} }
printf("\n");
ExeType const gpuExeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
// Loop over all possible src/dst pairs // Loop over all possible src/dst pairs
for (int src = 0; src < numDevices; src++) for (int src = 0; src < numDevices; src++)
...@@ -1303,38 +1379,193 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N) ...@@ -1303,38 +1379,193 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
MemType const srcType = (src < numCpus ? MEM_CPU : MEM_GPU); MemType const srcType = (src < numCpus ? MEM_CPU : MEM_GPU);
int const srcIndex = (srcType == MEM_CPU ? src : src - numCpus); int const srcIndex = (srcType == MEM_CPU ? src : src - numCpus);
if (!ev.outputToCsv) std::vector<std::vector<double>> avgBandwidth(isBidirectional + 1);
printf("%9s %02d", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex); std::vector<std::vector<double>> minBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> maxBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> stdDev(isBidirectional + 1);
for (int dst = 0; dst < numDevices; dst++) for (int dst = 0; dst < numDevices; dst++)
{ {
MemType const dstType = (dst < numCpus ? MEM_CPU : MEM_GPU); MemType const dstType = (dst < numCpus ? MEM_CPU : MEM_GPU);
int const dstIndex = (dstType == MEM_CPU ? dst : dst - numCpus); int const dstIndex = (dstType == MEM_CPU ? dst : dst - numCpus);
double bandwidth = GetPeakBandwidth(ev, N, isBidirectional, srcType, srcIndex, dstType, dstIndex); // Prepare Transfers
if (!ev.outputToCsv) std::vector<Transfer> transfers(isBidirectional + 1);
// SRC -> DST
transfers[0].numBytes = N * sizeof(float);
transfers[0].srcType.push_back(srcType);
transfers[0].dstType.push_back(dstType);
transfers[0].srcIndex.push_back(srcIndex);
transfers[0].dstIndex.push_back(dstIndex);
transfers[0].numSrcs = transfers[0].numDsts = 1;
transfers[0].exeType = IsGpuType(ev.useRemoteRead ? dstType : srcType) ? gpuExeType : EXE_CPU;
transfers[0].exeIndex = (ev.useRemoteRead ? dstIndex : srcIndex);
transfers[0].numSubExecs = IsGpuType(transfers[0].exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs;
// DST -> SRC
if (isBidirectional)
{
transfers[1].numBytes = N * sizeof(float);
transfers[1].numSrcs = transfers[1].numDsts = 1;
transfers[1].srcType.push_back(dstType);
transfers[1].dstType.push_back(srcType);
transfers[1].srcIndex.push_back(dstIndex);
transfers[1].dstIndex.push_back(srcIndex);
transfers[1].exeType = IsGpuType(ev.useRemoteRead ? srcType : dstType) ? gpuExeType : EXE_CPU;
transfers[1].exeIndex = (ev.useRemoteRead ? srcIndex : dstIndex);
transfers[1].numSubExecs = IsGpuType(transfers[1].exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs;
}
bool skipTest = false;
// Abort if executing on NUMA node with no CPUs
for (int i = 0; i <= isBidirectional; i++)
{ {
if (bandwidth == 0) if (transfers[i].exeType == EXE_CPU && ev.numCpusPerNuma[transfers[i].exeIndex] == 0)
{
skipTest = true;
break;
}
#if defined(__NVCC__)
// NVIDIA platform cannot access GPU memory directly from CPU executors
if (transfers[i].exeType == EXE_CPU && (IsGpuType(srcType) || IsGpuType(dstType)))
{
skipTest = true;
break;
}
#endif
}
if (isBidirectional && srcType == dstType && srcIndex == dstIndex) skipTest = true;
if (!skipTest)
{
ExecuteTransfers(ev, 0, N, transfers, false);
for (int dir = 0; dir <= isBidirectional; dir++)
{
double const avgTime = transfers[dir].transferTime / ev.numIterations;
double const avgBw = (transfers[dir].numBytesActual / 1.0E9) / avgTime * 1000.0f;
avgBandwidth[dir].push_back(avgBw);
if (ev.showIterations)
{
double minTime = transfers[dir].perIterationTime[0];
double maxTime = transfers[dir].perIterationTime[0];
double varSum = 0;
for (int i = 0; i < transfers[dir].perIterationTime.size(); i++)
{
minTime = std::min(minTime, transfers[dir].perIterationTime[i]);
maxTime = std::max(maxTime, transfers[dir].perIterationTime[i]);
double const bw = (transfers[dir].numBytesActual / 1.0E9) / transfers[dir].perIterationTime[i] * 1000.0f;
double const delta = (avgBw - bw);
varSum += delta * delta;
}
double const minBw = (transfers[dir].numBytesActual / 1.0E9) / maxTime * 1000.0f;
double const maxBw = (transfers[dir].numBytesActual / 1.0E9) / minTime * 1000.0f;
double const stdev = sqrt(varSum / transfers[dir].perIterationTime.size());
minBandwidth[dir].push_back(minBw);
maxBandwidth[dir].push_back(maxBw);
stdDev[dir].push_back(stdev);
}
}
}
else
{
for (int dir = 0; dir <= isBidirectional; dir++)
{
avgBandwidth[dir].push_back(0);
minBandwidth[dir].push_back(0);
maxBandwidth[dir].push_back(0);
stdDev[dir].push_back(-1.0);
}
}
}
for (int dir = 0; dir <= isBidirectional; dir++)
{
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, dir ? "<- " : " ->");
if (ev.outputToCsv) printf(",");
for (int dst = 0; dst < numDevices; dst++)
{
double const avgBw = avgBandwidth[dir][dst];
if (avgBw == 0.0)
printf("%10s", "N/A"); printf("%10s", "N/A");
else else
printf("%10.2f", bandwidth); printf("%10.2f", avgBw);
if (ev.outputToCsv) printf(",");
} }
else printf("\n");
if (ev.showIterations)
{ {
printf("%s %02d,%s %02d,%s,%s,%s,%.2f,%lu\n", // minBw
srcType == MEM_CPU ? "CPU" : "GPU", srcIndex, printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "min");
dstType == MEM_CPU ? "CPU" : "GPU", dstIndex, if (ev.outputToCsv) printf(",");
isBidirectional ? "bidirectional" : "unidirectional",
ev.useRemoteRead ? "Remote" : "Local", for (int i = 0; i < numDevices; i++)
ev.useDmaCopy ? "DMA" : "GFX", {
bandwidth, double const minBw = minBandwidth[dir][i];
N * sizeof(float)); if (minBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", minBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
// maxBw
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "max");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++)
{
double const maxBw = maxBandwidth[dir][i];
if (maxBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", maxBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
// stddev
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, " sd");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++)
{
double const sd = stdDev[dir][i];
if (sd == -1.0)
printf("%10s", "N/A");
else
printf("%10.2f", sd);
if (ev.outputToCsv) printf(",");
}
printf("\n");
} }
fflush(stdout); fflush(stdout);
} }
if (!ev.outputToCsv) printf("\n");
if (isBidirectional)
{
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "<->");
if (ev.outputToCsv) printf(",");
for (int dst = 0; dst < numDevices; dst++)
{
double const sumBw = avgBandwidth[0][dst] + avgBandwidth[1][dst];
if (sumBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", sumBw);
if (ev.outputToCsv) printf(",");
}
if (src < numDevices - 1) printf("\n\n");
}
} }
if (!ev.outputToCsv) printf("\n"); printf("\n");
} }
} }
...@@ -1475,70 +1706,6 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i ...@@ -1475,70 +1706,6 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
printf("Aggregate bandwidth (CPU Timed): %7.2f\n", totalBandwidthCpu); printf("Aggregate bandwidth (CPU Timed): %7.2f\n", totalBandwidthCpu);
} }
// Measures peak aggregate bandwidth (GB/s) between a (src, dst) device pair.
// Returns 0 when the pairing cannot be tested: bidirectional copy onto the
// same device, a CPU executor pinned to a NUMA node with no CPUs, or (on
// NVIDIA) a CPU executor touching GPU memory.
double GetPeakBandwidth(EnvVars const& ev, size_t const N,
                        int const isBidirectional,
                        MemType const srcType, int const srcIndex,
                        MemType const dstType, int const dstIndex)
{
  // A device cannot run a bidirectional test against itself
  if (isBidirectional && srcType == dstType && srcIndex == dstIndex) return 0.0f;

  int const numDirections = isBidirectional + 1;
  ExeType const gpuExeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;

  // Build one Transfer per direction: dir 0 is SRC->DST, dir 1 (bidirectional
  // only) is DST->SRC.  USE_REMOTE_READ selects which endpoint executes:
  // either (local read + remote write) or (remote read + local write).
  std::vector<Transfer> transfers(numDirections);
  for (int dir = 0; dir < numDirections; dir++)
  {
    Transfer& t = transfers[dir];
    MemType const readType  = (dir == 0 ? srcType  : dstType);
    int     const readIdx   = (dir == 0 ? srcIndex : dstIndex);
    MemType const writeType = (dir == 0 ? dstType  : srcType);
    int     const writeIdx  = (dir == 0 ? dstIndex : srcIndex);

    t.numBytes = N * sizeof(float);
    t.numSrcs  = t.numDsts = 1;
    t.srcType.push_back(readType);
    t.srcIndex.push_back(readIdx);
    t.dstType.push_back(writeType);
    t.dstIndex.push_back(writeIdx);
    t.exeType     = IsGpuType(ev.useRemoteRead ? writeType : readType) ? gpuExeType : EXE_CPU;
    t.exeIndex    = (ev.useRemoteRead ? writeIdx : readIdx);
    t.numSubExecs = IsGpuType(t.exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs;

    // Abort if executing on a NUMA node with no CPUs
    if (t.exeType == EXE_CPU && ev.numCpusPerNuma[t.exeIndex] == 0)
      return 0;
#if defined(__NVCC__)
    // NVIDIA platform cannot access GPU memory directly from CPU executors
    if (t.exeType == EXE_CPU && (IsGpuType(srcType) || IsGpuType(dstType)))
      return 0;
#endif
  }

  ExecuteTransfers(ev, 0, N, transfers, false);

  // Aggregate bandwidth is the sum of each direction's average bandwidth
  double totalBandwidth = 0;
  for (Transfer const& t : transfers)
  {
    double const avgDurationMsec = t.transferTime / (1.0 * ev.numIterations);
    totalBandwidth += (t.numBytesActual / 1.0E9) / avgDurationMsec * 1000.0f;
  }
  return totalBandwidth;
}
void Transfer::PrepareSubExecParams(EnvVars const& ev) void Transfer::PrepareSubExecParams(EnvVars const& ev)
{ {
// Each subExecutor needs to know src/dst pointers and how many elements to transfer // Each subExecutor needs to know src/dst pointers and how many elements to transfer
...@@ -1582,6 +1749,7 @@ void Transfer::PrepareSubExecParams(EnvVars const& ev) ...@@ -1582,6 +1749,7 @@ void Transfer::PrepareSubExecParams(EnvVars const& ev)
} }
this->transferTime = 0.0; this->transferTime = 0.0;
this->perIterationTime.clear();
} }
void Transfer::PrepareReference(EnvVars const& ev, std::vector<float>& buffer, int bufferIdx) void Transfer::PrepareReference(EnvVars const& ev, std::vector<float>& buffer, int bufferIdx)
......
...@@ -29,7 +29,7 @@ THE SOFTWARE. ...@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp" #include "Compatibility.hpp"
#include "Kernels.hpp" #include "Kernels.hpp"
#define TB_VERSION "1.25" #define TB_VERSION "1.26"
extern char const MemTypeStr[]; extern char const MemTypeStr[];
extern char const ExeTypeStr[]; extern char const ExeTypeStr[];
...@@ -75,6 +75,7 @@ public: ...@@ -75,6 +75,7 @@ public:
int outputToCsv; // Output in CSV format int outputToCsv; // Output in CSV format
int samplingFactor; // Affects how many different values of N are generated (when N set to 0) int samplingFactor; // Affects how many different values of N are generated (when N set to 0)
int sharedMemBytes; // Amount of shared memory to use per threadblock int sharedMemBytes; // Amount of shared memory to use per threadblock
int showIterations; // Show per-iteration timing info
int useInteractive; // Pause for user-input before starting transfer loop int useInteractive; // Pause for user-input before starting transfer loop
int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device
int usePrepSrcKernel; // Use GPU kernel to prepare source data instead of copy (can't be used with fillPattern) int usePrepSrcKernel; // Use GPU kernel to prepare source data instead of copy (can't be used with fillPattern)
...@@ -155,6 +156,7 @@ public: ...@@ -155,6 +156,7 @@ public:
outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0); outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0);
samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR); samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR);
sharedMemBytes = GetEnvVar("SHARED_MEM_BYTES" , defaultSharedMemBytes); sharedMemBytes = GetEnvVar("SHARED_MEM_BYTES" , defaultSharedMemBytes);
showIterations = GetEnvVar("SHOW_ITERATIONS" , 0);
useInteractive = GetEnvVar("USE_INTERACTIVE" , 0); useInteractive = GetEnvVar("USE_INTERACTIVE" , 0);
usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0); usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0);
usePrepSrcKernel = GetEnvVar("USE_PREP_KERNEL" , 0); usePrepSrcKernel = GetEnvVar("USE_PREP_KERNEL" , 0);
...@@ -164,10 +166,10 @@ public: ...@@ -164,10 +166,10 @@ public:
gpuKernel = GetEnvVar("GPU_KERNEL" , defaultGpuKernel); gpuKernel = GetEnvVar("GPU_KERNEL" , defaultGpuKernel);
// P2P Benchmark related // P2P Benchmark related
useRemoteRead = GetEnvVar("USE_REMOTE_READ" , 0); useRemoteRead = GetEnvVar("USE_REMOTE_READ" , 0);
useDmaCopy = GetEnvVar("USE_GPU_DMA" , 0); useDmaCopy = GetEnvVar("USE_GPU_DMA" , 0);
numGpuSubExecs = GetEnvVar("NUM_GPU_SE" , useDmaCopy ? 1 : numDeviceCUs); numGpuSubExecs = GetEnvVar("NUM_GPU_SE" , useDmaCopy ? 1 : numDeviceCUs);
numCpuSubExecs = GetEnvVar("NUM_CPU_SE" , DEFAULT_P2P_NUM_CPU_SE); numCpuSubExecs = GetEnvVar("NUM_CPU_SE" , DEFAULT_P2P_NUM_CPU_SE);
// Sweep related // Sweep related
sweepMin = GetEnvVar("SWEEP_MIN" , DEFAULT_SWEEP_MIN); sweepMin = GetEnvVar("SWEEP_MIN" , DEFAULT_SWEEP_MIN);
...@@ -382,6 +384,7 @@ public: ...@@ -382,6 +384,7 @@ public:
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n"); printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n"); printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n"); printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
printf(" SHOW_ITERATIONS - Show per-iteration timing info\n");
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n"); printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n"); printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
printf(" USE_PREP_KERNEL - Use GPU kernel to initialize source data array pattern\n"); printf(" USE_PREP_KERNEL - Use GPU kernel to initialize source data array pattern\n");
...@@ -429,6 +432,8 @@ public: ...@@ -429,6 +432,8 @@ public:
std::string("Running " + std::to_string(numWarmups) + " warmup iteration(s) per Test")); std::string("Running " + std::to_string(numWarmups) + " warmup iteration(s) per Test"));
PRINT_EV("SHARED_MEM_BYTES", sharedMemBytes, PRINT_EV("SHARED_MEM_BYTES", sharedMemBytes,
std::string("Using " + std::to_string(sharedMemBytes) + " shared mem per threadblock")); std::string("Using " + std::to_string(sharedMemBytes) + " shared mem per threadblock"));
PRINT_EV("SHOW_ITERATIONS", showIterations,
std::string(showIterations ? "Showing" : "Hiding") + " per-iteration timing");
PRINT_EV("USE_INTERACTIVE", useInteractive, PRINT_EV("USE_INTERACTIVE", useInteractive,
std::string("Running in ") + (useInteractive ? "interactive" : "non-interactive") + " mode"); std::string("Running in ") + (useInteractive ? "interactive" : "non-interactive") + " mode");
PRINT_EV("USE_PCIE_INDEX", usePcieIndexing, PRINT_EV("USE_PCIE_INDEX", usePcieIndexing,
......
...@@ -119,6 +119,8 @@ struct Transfer ...@@ -119,6 +119,8 @@ struct Transfer
std::vector<SubExecParam> subExecParam; // Defines subarrays assigned to each threadblock std::vector<SubExecParam> subExecParam; // Defines subarrays assigned to each threadblock
SubExecParam* subExecParamGpuPtr; // Pointer to GPU copy of subExecParam SubExecParam* subExecParamGpuPtr; // Pointer to GPU copy of subExecParam
std::vector<double> perIterationTime; // Per-iteration timing
// Prepares src/dst subarray pointers for each SubExecutor // Prepares src/dst subarray pointers for each SubExecutor
void PrepareSubExecParams(EnvVars const& ev); void PrepareSubExecParams(EnvVars const& ev);
...@@ -187,12 +189,6 @@ void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int co ...@@ -187,12 +189,6 @@ void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int co
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExec, int const numCpuSubExec, bool const isRandom); void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExec, int const numCpuSubExec, bool const isRandom);
void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const numSubExecs); void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const numSubExecs);
// Return the maximum bandwidth measured for given (src/dst) pair
double GetPeakBandwidth(EnvVars const& ev, size_t const N,
int const isBidirectional,
MemType const srcType, int const srcIndex,
MemType const dstType, int const dstIndex);
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount); std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
int RemappedIndex(int const origIdx, bool const isCpuType); int RemappedIndex(int const origIdx, bool const isCpuType);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment