Unverified Commit b30aefb6 authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

Fixing topology detection memory access and CU masking for multi XCD GPUs (#116)

* Fixing potential out-of-bounds write during topology detection
* Fixing CU_MASK for multi-XCD GPUs
* Adding sub-iterations via NUM_SUBITERATIONS
* Adding support for variable subexecutor Transfers
* Adding healthcheck preset
parent ae843a6f
......@@ -3,6 +3,29 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.51
### Modified
- CSV output has been modified slightly to match normal terminal output
- Output for non single stream mode has been changed to match single stream mode (results per Executor)
### Added
- Support for sub-iterations via NUM_SUBITERATIONS. This allows for additional looping during an iteration.
If set to 0, this should infinitely loop (which may be useful for some debug purposes)
- Support for variable number of subexecutors (currently for GPU-GFX executor only). Setting subExecutors to
0 will run over a range of CUs to use, and report only the results of the best one found. This can be tuned
for performance by setting the MIN_VAR_SUBEXEC and MAX_VAR_SUBEXEC environment variables to narrow the
search space. The number of CUs used will be identical for all variable subExecutor transfers
- Experimental new "healthcheck" preset config which currently only supports MI300 series. This preset runs
through CPU to GPU bandwidth tests and all-to-all XGMI bandwidth tests and compares against expected values.
Pass criteria limits can be modified (due to platform differences) via the environment variables
LIMIT_UDIR (unidirectional), LIMIT_BDIR (bidirectional), and LIMIT_A2A (per GPU-GPU link bandwidth)
### Fixed
- Fixed out-of-bounds memory access during topology detection that can happen if the number of
CPUs is less than the number of NUMA domains
- Fixed CU masking functionality on multi-XCD architectures (e.g. MI300)
## v1.50
### Added
......
......@@ -67,8 +67,9 @@ make
* Running TransferBench with no arguments displays usage instructions and detected topology
information
* You can use several preset configurations instead of a configuration file:
* `a2a` : All-to-all benchmark test
* `cmdline`: Take in Transfers to run from command-line instead of via file
* `a2a` : All-to-all benchmark test
* `cmdline` : Take in Transfers to run from command-line instead of via file
* `healthcheck` : Simple health check (supported on MI300 series only)
* `p2p` : Peer-to-peer benchmark test
* `pcopy` : Benchmark parallel copies from a single GPU to other GPUs
* `rsweep` : Random sweep across possible sets of transfers
......
......@@ -116,6 +116,11 @@ int main(int argc, char **argv)
RunAllToAllBenchmark(ev, numBytesPerTransfer, numSubExecs);
exit(0);
}
// Health check
else if (!strcmp(argv[1], "healthcheck")) {
RunHealthCheck(ev);
exit(0);
}
// - Test schmoo benchmark
else if (!strcmp(argv[1], "schmoo"))
{
......@@ -211,12 +216,8 @@ int main(int argc, char **argv)
}
else if (!strcmp(argv[1], "cmdline"))
{
// Print environment variables and CSV header
// Print environment variables
ev.DisplayEnvVars();
if (ev.outputToCsv)
{
printf("Test#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),SrcAddr,DstAddr\n");
}
// Read Transfer from command line
std::string cmdlineTransfer;
......@@ -263,10 +264,6 @@ int main(int argc, char **argv)
// Print environment variables and CSV header
ev.DisplayEnvVars();
if (ev.outputToCsv)
{
printf("Test#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),SrcAddr,DstAddr\n");
}
int testNum = 0;
char line[MAX_LINE_LEN];
......@@ -313,8 +310,78 @@ void ExecuteTransfers(EnvVars const& ev,
bool verbose,
double* totalBandwidthCpu)
{
int const initOffset = ev.byteOffset / sizeof(float);
// Check for any Transfers using variable number of sub-executors
std::vector<int> varTransfers;
std::vector<int> numUsedSubExec(ev.numGpuDevices, 0);
std::vector<int> numVarSubExec(ev.numGpuDevices, 0);
for (int i = 0; i < transfers.size(); i++) {
Transfer& t = transfers[i];
t.transferIndex = i;
t.numBytesActual = (t.numBytes ? t.numBytes : N * sizeof(float));
if (t.exeType == EXE_GPU_GFX) {
if (t.numSubExecs == 0) {
varTransfers.push_back(i);
numVarSubExec[t.exeIndex]++;
} else {
numUsedSubExec[t.exeIndex] += t.numSubExecs;
}
} else if (t.numSubExecs == 0) {
printf("[ERROR] Variable subexecutor count is only supported for GFX executor\n");
exit(1);
}
}
if (verbose && !ev.outputToCsv) printf("Test %d:\n", testNum);
TestResults testResults;
if (varTransfers.size() == 0) {
testResults = ExecuteTransfersImpl(ev, transfers);
} else {
// Determine maximum number of subexecutors
int maxNumSubExec = 0;
if (ev.maxNumVarSubExec) {
maxNumSubExec = ev.maxNumVarSubExec;
} else {
HIP_CALL(hipDeviceGetAttribute(&maxNumSubExec, hipDeviceAttributeMultiprocessorCount, 0));
for (int device = 0; device < ev.numGpuDevices; device++) {
int numSubExec = 0;
HIP_CALL(hipDeviceGetAttribute(&numSubExec, hipDeviceAttributeMultiprocessorCount, device));
int leftOverSubExec = numSubExec - numUsedSubExec[device];
if (leftOverSubExec < numVarSubExec[device])
maxNumSubExec = 1;
else if (numVarSubExec[device] != 0) {
maxNumSubExec = std::min(maxNumSubExec, leftOverSubExec / numVarSubExec[device]);
}
}
}
// Loop over subexecs
std::vector<Transfer> bestTransfers;
for (int numSubExec = ev.minNumVarSubExec; numSubExec <= maxNumSubExec; numSubExec++) {
std::vector<Transfer> currTransfers = transfers;
for (auto idx : varTransfers) {
currTransfers[idx].numSubExecs = numSubExec;
}
TestResults tempResults = ExecuteTransfersImpl(ev, currTransfers);
if (tempResults.totalBandwidthCpu > testResults.totalBandwidthCpu) {
bestTransfers = currTransfers;
testResults = tempResults;
}
}
transfers = bestTransfers;
}
if (totalBandwidthCpu) *totalBandwidthCpu = testResults.totalBandwidthCpu;
if (verbose) {
ReportResults(ev, transfers, testResults);
}
}
TestResults ExecuteTransfersImpl(EnvVars const& ev,
std::vector<Transfer>& transfers)
{
// Map transfers by executor
TransferMap transferMap;
for (int i = 0; i < transfers.size(); i++)
......@@ -341,15 +408,12 @@ void ExecuteTransfers(EnvVars const& ev,
// Loop over each transfer this executor is involved in
for (Transfer* transfer : exeInfo.transfers)
{
// Determine how many bytes to copy for this Transfer (use custom if pre-specified)
transfer->numBytesActual = (transfer->numBytes ? transfer->numBytes : N * sizeof(float));
// Allocate source memory
transfer->srcMem.resize(transfer->numSrcs);
for (int iSrc = 0; iSrc < transfer->numSrcs; ++iSrc)
{
MemType const& srcType = transfer->srcType[iSrc];
int const srcIndex = RemappedIndex(transfer->srcIndex[iSrc], IsCpuType(srcType));
int const srcIndex = RemappedIndex(transfer->srcIndex[iSrc], IsCpuType(srcType));
// Ensure executing GPU can access source memory
if (IsGpuType(exeType) && IsGpuType(srcType) && srcIndex != exeIndex)
......@@ -363,7 +427,7 @@ void ExecuteTransfers(EnvVars const& ev,
for (int iDst = 0; iDst < transfer->numDsts; ++iDst)
{
MemType const& dstType = transfer->dstType[iDst];
int const dstIndex = RemappedIndex(transfer->dstIndex[iDst], IsCpuType(dstType));
int const dstIndex = RemappedIndex(transfer->dstIndex[iDst], IsCpuType(dstType));
// Ensure executing GPU can access destination memory
if (IsGpuType(exeType) && IsGpuType(dstType) && dstIndex != exeIndex)
......@@ -509,7 +573,6 @@ void ExecuteTransfers(EnvVars const& ev,
}
}
if (verbose && !ev.outputToCsv) printf("Test %d:\n", testNum);
// Prepare input memory and block parameters for current N
bool isSrcCorrect = true;
......@@ -610,7 +673,7 @@ void ExecuteTransfers(EnvVars const& ev,
if (ev.numIterations < 0 && totalCpuTime > -ev.numIterations) break;
// Pause before starting first timed iteration in interactive mode
if (verbose && ev.useInteractive && iteration == 0)
if (ev.useInteractive && iteration == 0)
{
printf("Memory prepared:\n");
......@@ -674,7 +737,7 @@ void ExecuteTransfers(EnvVars const& ev,
}
// Pause for interactive mode
if (verbose && isSrcCorrect && ev.useInteractive)
if (isSrcCorrect && ev.useInteractive)
{
printf("Transfers complete. Hit <Enter> to continue: ");
if (scanf("%*c") != 0)
......@@ -695,243 +758,47 @@ void ExecuteTransfers(EnvVars const& ev,
totalBytesTransferred += transfer->numBytesActual;
}
// Report timings
totalCpuTime = totalCpuTime / (1.0 * numTimedIterations) * 1000;
double totalBandwidthGbs = (totalBytesTransferred / 1.0E6) / totalCpuTime;
if (totalBandwidthCpu) *totalBandwidthCpu = totalBandwidthGbs;
double maxGpuTime = 0;
// Record results
TestResults testResults;
testResults.numTimedIterations = numTimedIterations;
testResults.totalBytesTransferred = totalBytesTransferred;
testResults.totalDurationMsec = totalCpuTime / (1.0 * numTimedIterations * ev.numSubIterations) * 1000;
testResults.totalBandwidthCpu = (totalBytesTransferred / 1.0E6) / testResults.totalDurationMsec;
double maxExeDurationMsec = 0.0;
if (!isSrcCorrect) goto cleanup;
if (ev.useSingleStream)
{
for (auto& exeInfoPair : transferMap)
{
ExecutorInfo exeInfo = exeInfoPair.second;
ExeType const exeType = exeInfoPair.first.first;
int const exeIndex = exeInfoPair.first.second;
// Compute total time for non GPU executors
if (exeType != EXE_GPU_GFX)
{
exeInfo.totalTime = 0;
for (auto const& transfer : exeInfo.transfers)
exeInfo.totalTime = std::max(exeInfo.totalTime, transfer->transferTime);
}
double exeDurationMsec = exeInfo.totalTime / (1.0 * numTimedIterations);
double exeBandwidthGbs = (exeInfo.totalBytes / 1.0E9) / exeDurationMsec * 1000.0f;
maxGpuTime = std::max(maxGpuTime, exeDurationMsec);
double sumBandwidthGbs = 0.0;
for (auto& transfer: exeInfo.transfers)
{
transfer->transferTime /= (1.0 * numTimedIterations);
transfer->transferBandwidth = (transfer->numBytesActual / 1.0E9) / transfer->transferTime * 1000.0f;
transfer->executorBandwidth = exeBandwidthGbs;
sumBandwidthGbs += transfer->transferBandwidth;
}
if (verbose && !ev.outputToCsv)
{
printf(" Executor: %3s %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %-7.3f GB/s (sum)\n",
ExeTypeName[exeType], exeIndex, exeBandwidthGbs, exeDurationMsec, exeInfo.totalBytes, sumBandwidthGbs);
}
for (auto& exeInfoPair : transferMap)
{
ExecutorInfo exeInfo = exeInfoPair.second;
ExeType const exeType = exeInfoPair.first.first;
int const exeIndex = exeInfoPair.first.second;
ExeResult& exeResult = testResults.exeResults[std::make_pair(exeType, exeIndex)];
int totalCUs = 0;
// Compute total time for non GPU executors
if (exeType != EXE_GPU_GFX || ev.useSingleStream == 0)
{
exeInfo.totalTime = 0;
for (auto const& transfer : exeInfo.transfers)
{
totalCUs += transfer->numSubExecs;
char exeSubIndexStr[32] = "";
if (ev.useXccFilter || transfer->exeType == EXE_GPU_DMA)
{
if (transfer->exeSubIndex == -1)
sprintf(exeSubIndexStr, ".*");
else
sprintf(exeSubIndexStr, ".%d", transfer->exeSubIndex);
}
if (!verbose) continue;
if (!ev.outputToCsv)
{
printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d%s:%03d -> %s\n",
transfer->transferIndex,
transfer->transferBandwidth,
transfer->transferTime,
transfer->numBytesActual,
transfer->SrcToStr().c_str(),
ExeTypeName[transfer->exeType], transfer->exeIndex,
exeSubIndexStr,
transfer->numSubExecs,
transfer->DstToStr().c_str());
if (ev.showIterations)
{
std::set<std::pair<double, int>> times;
double stdDevTime = 0;
double stdDevBw = 0;
for (int i = 0; i < numTimedIterations; i++)
{
times.insert(std::make_pair(transfer->perIterationTime[i], i+1));
double const varTime = fabs(transfer->transferTime - transfer->perIterationTime[i]);
stdDevTime += varTime * varTime;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transfer->perIterationTime[i] * 1000.0f;
double const varBw = fabs(iterBandwidthGbs - transfer->transferBandwidth);
stdDevBw += varBw * varBw;
}
stdDevTime = sqrt(stdDevTime / numTimedIterations);
stdDevBw = sqrt(stdDevBw / numTimedIterations);
for (auto t : times)
{
double iterDurationMsec = t.first;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d | %7.3f GB/s | %8.3f ms |", t.second, iterBandwidthGbs, iterDurationMsec);
std::set<int> usedXccs;
if (t.second - 1 < transfer->perIterationCUs.size())
{
printf(" CUs:");
for (auto x : transfer->perIterationCUs[t.second - 1])
{
printf(" %02d:%02d", x.first, x.second);
usedXccs.insert(x.first);
}
}
printf(" XCCs:");
for (auto x : usedXccs)
printf(" %02d", x);
printf("\n");
}
printf(" StandardDev | %7.3f GB/s | %8.3f ms |\n", stdDevBw, stdDevTime);
}
}
else
{
printf("%d,%d,%lu,%s,%c%02d%s,%s,%d,%.3f,%.3f,%s,%s\n",
testNum, transfer->transferIndex, transfer->numBytesActual,
transfer->SrcToStr().c_str(),
MemTypeStr[transfer->exeType], transfer->exeIndex, exeSubIndexStr,
transfer->DstToStr().c_str(),
transfer->numSubExecs,
transfer->transferBandwidth, transfer->transferTime,
PtrVectorToStr(transfer->srcMem, initOffset).c_str(),
PtrVectorToStr(transfer->dstMem, initOffset).c_str());
}
}
if (verbose && ev.outputToCsv)
{
printf("%d,ALL,%lu,ALL,%c%02d,ALL,%d,%.3f,%.3f,ALL,ALL\n",
testNum, totalBytesTransferred,
MemTypeStr[exeType], exeIndex, totalCUs,
exeBandwidthGbs, exeDurationMsec);
}
exeInfo.totalTime = std::max(exeInfo.totalTime, transfer->transferTime);
}
}
else
{
for (auto const& transferPair : transferList)
{
Transfer* transfer = transferPair.second;
transfer->transferTime /= (1.0 * numTimedIterations);
transfer->transferBandwidth = (transfer->numBytesActual / 1.0E9) / transfer->transferTime * 1000.0f;
transfer->executorBandwidth = transfer->transferBandwidth;
maxGpuTime = std::max(maxGpuTime, transfer->transferTime);
if (!verbose) continue;
char exeSubIndexStr[32] = "";
if (ev.useXccFilter)
{
if (transfer->exeSubIndex == -1 || transfer->exeType == EXE_GPU_DMA)
sprintf(exeSubIndexStr, ".*");
else
sprintf(exeSubIndexStr, ".%d", transfer->exeSubIndex);
}
if (!ev.outputToCsv)
{
printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d%s:%03d -> %s\n",
transfer->transferIndex,
transfer->transferBandwidth, transfer->transferTime,
transfer->numBytesActual,
transfer->SrcToStr().c_str(),
ExeTypeName[transfer->exeType], transfer->exeIndex, exeSubIndexStr,
transfer->numSubExecs,
transfer->DstToStr().c_str());
if (ev.showIterations)
{
std::set<std::pair<double, int>> times;
double stdDevTime = 0;
double stdDevBw = 0;
for (int i = 0; i < numTimedIterations; i++)
{
times.insert(std::make_pair(transfer->perIterationTime[i], i+1));
double const varTime = fabs(transfer->transferTime - transfer->perIterationTime[i]);
stdDevTime += varTime * varTime;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transfer->perIterationTime[i] * 1000.0f;
double const varBw = fabs(iterBandwidthGbs - transfer->transferBandwidth);
stdDevBw += varBw * varBw;
}
stdDevTime = sqrt(stdDevTime / numTimedIterations);
stdDevBw = sqrt(stdDevBw / numTimedIterations);
for (auto t : times)
{
double iterDurationMsec = t.first;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d | %7.3f GB/s | %8.3f ms |", t.second, iterBandwidthGbs, iterDurationMsec);
std::set<int> usedXccs;
if (t.second - 1 < transfer->perIterationCUs.size())
{
printf(" CUs:");
for (auto x : transfer->perIterationCUs[t.second - 1])
{
printf(" %02d:%02d", x.first, x.second);
usedXccs.insert(x.first);
}
}
printf(" XCCs:");
for (auto x : usedXccs)
printf(" %d", x);
printf("\n");
}
printf(" StandardDev | %7.3f GB/s | %8.3f ms |\n", stdDevBw, stdDevTime);
}
}
else
{
printf("%d,%d,%lu,%s,%s%02d%s,%s,%d,%.3f,%.3f,%s,%s\n",
testNum, transfer->transferIndex, transfer->numBytesActual,
transfer->SrcToStr().c_str(),
ExeTypeName[transfer->exeType], transfer->exeIndex, exeSubIndexStr,
transfer->DstToStr().c_str(),
transfer->numSubExecs,
transfer->transferBandwidth, transfer->transferTime,
PtrVectorToStr(transfer->srcMem, initOffset).c_str(),
PtrVectorToStr(transfer->dstMem, initOffset).c_str());
}
}
}
exeResult.totalBytes = exeInfo.totalBytes;
exeResult.durationMsec = exeInfo.totalTime / (1.0 * numTimedIterations * ev.numSubIterations);
exeResult.bandwidthGbs = (exeInfo.totalBytes / 1.0E9) / exeResult.durationMsec * 1000.0f;
exeResult.sumBandwidthGbs = 0;
maxExeDurationMsec = std::max(maxExeDurationMsec, exeResult.durationMsec);
// Display aggregate statistics
if (verbose)
{
if (!ev.outputToCsv)
{
printf(" Aggregate (CPU) | %7.3f GB/s | %8.3f ms | %12lu bytes | Overhead: %.3f ms\n",
totalBandwidthGbs, totalCpuTime, totalBytesTransferred, totalCpuTime - maxGpuTime);
}
else
for (auto& transfer: exeInfo.transfers)
{
printf("%d,ALL,%lu,ALL,ALL,ALL,ALL,%.3f,%.3f,ALL,ALL\n",
testNum, totalBytesTransferred, totalBandwidthGbs, totalCpuTime);
exeResult.transferIdx.push_back(transfer->transferIndex);
transfer->transferTime /= (1.0 * numTimedIterations * ev.numSubIterations);
transfer->transferBandwidth = (transfer->numBytesActual / 1.0E9) / transfer->transferTime * 1000.0f;
transfer->executorBandwidth = exeResult.bandwidthGbs;
exeResult.sumBandwidthGbs += transfer->transferBandwidth;
}
}
testResults.overheadMsec = testResults.totalDurationMsec - maxExeDurationMsec;
// Release GPU memory
cleanup:
......@@ -983,6 +850,8 @@ cleanup:
}
}
}
return testResults;
}
void DisplayUsage(char const* cmdName)
......@@ -1006,6 +875,7 @@ void DisplayUsage(char const* cmdName)
printf(" a2a - GPU All-To-All benchmark\n");
printf(" - 3rd optional arg: # of SubExecs to use\n");
printf(" cmdline - Read Transfers from command line arguments (after N)\n");
printf(" healthcheck - Simple bandwidth health check (MI300 series only)\n");
printf(" p2p - Peer-to-peer benchmark tests\n");
printf(" rwrite/pcopy - Parallel writes/copies from single GPU to other GPUs\n");
printf(" - 3rd optional arg: # GPU SubExecs per Transfer\n");
......@@ -1396,9 +1266,9 @@ void ParseTransfers(EnvVars const& ev, char* line, std::vector<Transfer>& transf
if (!advancedMode)
{
iss >> numSubExecs;
if (numSubExecs <= 0 || iss.fail())
if (numSubExecs < 0 || iss.fail())
{
printf("Parsing error: Number of blocks to use (%d) must be greater than 0\n", numSubExecs);
printf("Parsing error: Number of blocks to use (%d) must be non-negative\n", numSubExecs);
exit(1);
}
}
......@@ -1683,14 +1553,14 @@ void RunTransfer(EnvVars const& ev, int const iteration,
#if defined(__NVCC__)
HIP_CALL(hipEventRecord(startEvent, stream));
GpuKernelTable[ev.gfxBlockSize/64 - 1][ev.gfxUnroll - 1]
<<<gridSize, blockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr, ev.gfxWaveOrder);
<<<gridSize, blockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr, ev.gfxWaveOrder, ev.numSubIterations);
HIP_CALL(hipEventRecord(stopEvent, stream));
#else
hipExtLaunchKernelGGL(GpuKernelTable[ev.gfxBlockSize/64 - 1][ev.gfxUnroll - 1],
gridSize, blockSize,
ev.sharedMemBytes, stream,
startEvent, stopEvent,
0, transfer->subExecParamGpuPtr, ev.gfxWaveOrder);
0, transfer->subExecParamGpuPtr, ev.gfxWaveOrder, ev.numSubIterations);
#endif
// Synchronize per iteration, unless in single sync mode, in which case
// synchronize during last warmup / last actual iteration
......@@ -1757,13 +1627,13 @@ void RunTransfer(EnvVars const& ev, int const iteration,
hipEvent_t& startEvent = exeInfo.startEvents[transferIdx];
hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx];
int subIteration = 0;
HIP_CALL(hipEventRecord(startEvent, stream));
if (transfer->numSrcs == 1 && transfer->numDsts == 1)
{
do {
HIP_CALL(hipMemcpyAsync(transfer->dstMem[0], transfer->srcMem[0],
transfer->numBytesActual, hipMemcpyDefault,
stream));
}
} while (++subIteration != ev.numSubIterations);
HIP_CALL(hipEventRecord(stopEvent, stream));
HIP_CALL(hipStreamSynchronize(stream));
......@@ -1772,6 +1642,7 @@ void RunTransfer(EnvVars const& ev, int const iteration,
// Record GPU timing
float gpuDeltaMsec;
HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
//gpuDeltaMsec /= (1.0 * ev.numSubIterations);
transfer->transferTime += gpuDeltaMsec;
if (ev.showIterations)
transfer->perIterationTime.push_back(gpuDeltaMsec);
......@@ -1784,23 +1655,27 @@ void RunTransfer(EnvVars const& ev, int const iteration,
exit(1);
#else
// Target specific DMA engine
auto cpuStart = std::chrono::high_resolution_clock::now();
// Atomically set signal to 1
HSA_CALL(hsa_signal_store_screlease(transfer->signal, 1));
int subIterations = 0;
do {
// Atomically set signal to 1
HSA_CALL(hsa_signal_store_screlease(transfer->signal, 1));
HSA_CALL(hsa_amd_memory_async_copy_on_engine(transfer->dstMem[0], transfer->dstAgent,
transfer->srcMem[0], transfer->srcAgent,
transfer->numBytesActual, 0, NULL,
transfer->signal,
transfer->sdmaEngineId, true));
// Wait for SDMA transfer to complete
// NOTE: "A wait operation can spuriously resume at any time sooner than the timeout
// (for example, due to system or other external factors) even when the
// condition has not been met.)
while(hsa_signal_wait_scacquire(transfer->signal,
HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
HSA_WAIT_STATE_ACTIVE) >= 1);
} while (++subIterations < ev.numSubIterations);
auto cpuStart = std::chrono::high_resolution_clock::now();
HSA_CALL(hsa_amd_memory_async_copy_on_engine(transfer->dstMem[0], transfer->dstAgent,
transfer->srcMem[0], transfer->srcAgent,
transfer->numBytesActual, 0, NULL,
transfer->signal,
transfer->sdmaEngineId, true));
// Wait for SDMA transfer to complete
// NOTE: "A wait operation can spuriously resume at any time sooner than the timeout
// (for example, due to system or other external factors) even when the
// condition has not been met.)
while(hsa_signal_wait_scacquire(transfer->signal,
HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
HSA_WAIT_STATE_ACTIVE) >= 1);
if (iteration >= 0)
{
// Record GPU timing
......@@ -1825,15 +1700,18 @@ void RunTransfer(EnvVars const& ev, int const iteration,
std::vector<std::thread> childThreads;
int subIteration = 0;
auto cpuStart = std::chrono::high_resolution_clock::now();
do {
// Launch each subExecutor in child-threads to perform memcopies
for (int i = 0; i < transfer->numSubExecs; ++i)
childThreads.push_back(std::thread(CpuReduceKernel, std::ref(transfer->subExecParam[i])));
// Launch each subExecutor in child-threads to perform memcopies
for (int i = 0; i < transfer->numSubExecs; ++i)
childThreads.push_back(std::thread(CpuReduceKernel, std::ref(transfer->subExecParam[i])));
// Wait for child-threads to finish
for (int i = 0; i < transfer->numSubExecs; ++i)
childThreads[i].join();
// Wait for child-threads to finish
for (int i = 0; i < transfer->numSubExecs; ++i)
childThreads[i].join();
childThreads.clear();
} while (++subIteration != ev.numSubIterations);
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
......@@ -3171,3 +3049,316 @@ std::string PtrVectorToStr(std::vector<float*> const& strVector, int const initO
}
return ss.str();
}
// Print bandwidth/timing results for a completed test.
// Displays one line per Executor, one line per Transfer it executed, optional
// per-iteration statistics (when SHOW_ITERATIONS is enabled), and a final
// CPU-side aggregate line.
// - ev:        environment settings (controls CSV separator, XCC filtering,
//              and per-iteration output)
// - transfers: the executed Transfers; indexed via ExeResult::transferIdx
// - results:   aggregated results collected by ExecuteTransfersImpl
void ReportResults(EnvVars const& ev, std::vector<Transfer> const& transfers, TestResults const results)
{
  // Column separator: comma for CSV output, pipe for terminal output
  char sep = ev.outputToCsv ? ',' : '|';
  size_t const numTimedIterations = results.numTimedIterations;

  // Loop over each executor (by const reference to avoid copying map entries)
  for (auto const& exeInfoPair : results.exeResults) {
    ExeResult const& exeResult = exeInfoPair.second;
    ExeType exeType = exeInfoPair.first.first;
    int exeIndex = exeInfoPair.first.second;

    printf(" Executor: %3s %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %-7.3f GB/s (sum)\n",
           ExeTypeName[exeType], exeIndex, sep, exeResult.bandwidthGbs, sep,
           exeResult.durationMsec, sep, exeResult.totalBytes, sep, exeResult.sumBandwidthGbs);

    // Loop over each Transfer this executor ran
    for (int idx : exeResult.transferIdx) {
      Transfer const& t = transfers[idx];

      // Executor sub-index suffix (XCC for GFX, engine for DMA); ".*" when unspecified
      char exeSubIndexStr[32] = "";
      if (ev.useXccFilter || t.exeType == EXE_GPU_DMA) {
        if (t.exeSubIndex == -1)
          snprintf(exeSubIndexStr, sizeof(exeSubIndexStr), ".*");
        else
          snprintf(exeSubIndexStr, sizeof(exeSubIndexStr), ".%d", t.exeSubIndex);
      }

      printf(" Transfer %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %s -> %s%02d%s:%03d -> %s\n",
             t.transferIndex, sep,
             t.transferBandwidth, sep,
             t.transferTime, sep,
             t.numBytesActual, sep,
             t.SrcToStr().c_str(),
             ExeTypeName[t.exeType], t.exeIndex,
             exeSubIndexStr,
             t.numSubExecs,
             t.DstToStr().c_str());

      // Show per-iteration timing information
      if (ev.showIterations) {
        // Collect (duration, 1-based iteration#) pairs sorted by duration, and
        // accumulate variance for standard-deviation reporting
        std::set<std::pair<double, int>> times;
        double stdDevTime = 0;
        double stdDevBw = 0;
        for (size_t i = 0; i < numTimedIterations; i++) {
          times.insert(std::make_pair(t.perIterationTime[i], static_cast<int>(i) + 1));
          double const varTime = fabs(t.transferTime - t.perIterationTime[i]);
          stdDevTime += varTime * varTime;
          double iterBandwidthGbs = (t.numBytesActual / 1.0E9) / t.perIterationTime[i] * 1000.0f;
          double const varBw = fabs(iterBandwidthGbs - t.transferBandwidth);
          stdDevBw += varBw * varBw;
        }
        stdDevTime = sqrt(stdDevTime / numTimedIterations);
        stdDevBw = sqrt(stdDevBw / numTimedIterations);

        for (auto const& time : times) {
          double iterDurationMsec = time.first;
          double iterBandwidthGbs = (t.numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f;
          printf(" Iter %03d %c %7.3f GB/s %c %8.3f ms %c", time.second, sep, iterBandwidthGbs, sep, iterDurationMsec, sep);

          // CU/XCC occupancy is only available for iterations that recorded it
          std::set<int> usedXccs;
          if (static_cast<size_t>(time.second - 1) < t.perIterationCUs.size()) {
            printf(" CUs:");
            for (auto const& x : t.perIterationCUs[time.second - 1]) {
              printf(" %02d:%02d", x.first, x.second);
              usedXccs.insert(x.first);
            }
          }
          printf(" XCCs:");
          for (auto x : usedXccs)
            printf(" %02d", x);
          printf("\n");
        }
        printf(" StandardDev %c %7.3f GB/s %c %8.3f ms %c\n", sep, stdDevBw, sep, stdDevTime, sep);
      }
    }
  }

  // Aggregate statistics measured from the CPU side (includes launch overhead)
  printf(" Aggregate (CPU) %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c Overhead: %.3f ms\n", sep,
         results.totalBandwidthCpu, sep, results.totalDurationMsec, sep, results.totalBytesTransferred, sep, results.overheadMsec);
}
// Run the "healthcheck" preset: quick bandwidth sanity checks for MI300-series
// GPUs. Executes CPU->GPU read, GPU->CPU write, bidirectional CPU<->GPU, and
// all-to-all XGMI tests, comparing measured bandwidth against pass limits that
// may be overridden via LIMIT_UDIR / LIMIT_BDIR / LIMIT_A2A.
// Exits the process with code 1 if any test fails, 0 otherwise.
// - ev: environment settings (taken by value intentionally: single-stream mode
//       and GFX unroll are overridden locally without affecting the caller)
void RunHealthCheck(EnvVars ev)
{
  // Check for supported platforms
#if defined(__NVCC__)
  printf("[WARN] healthcheck preset not supported on NVIDIA hardware\n");
  return;
#else
  bool hasFail = false;

  // Force use of single stream
  ev.useSingleStream = 1;

  // Require every device to be MI300-series (gfx940/941/942)
  for (int gpuId = 0; gpuId < ev.numGpuDevices; gpuId++) {
    hipDeviceProp_t prop;
    HIP_CALL(hipGetDeviceProperties(&prop, gpuId));
    std::string fullName = prop.gcnArchName;
    // Strip feature suffix (e.g. "gfx942:sramecc+:xnack-" -> "gfx942")
    std::string archName = fullName.substr(0, fullName.find(':'));
    if (!(archName == "gfx940" || archName == "gfx941" || archName == "gfx942"))
    {
      printf("[WARN] healthcheck preset is currently only supported on MI300 series hardware\n");
      exit(1);
    }
  }

  // Pass limits in GB/s (95% of nominal link bandwidth)
  // NOTE(review): the (int) cast truncates the defaults to whole numbers
  // (e.g. 48 * 0.95 -> 45, not 45.6) — confirm this is intentional
  double udirLimit = getenv("LIMIT_UDIR") ? atof(getenv("LIMIT_UDIR")) : (int)(48 * 0.95);
  double bdirLimit = getenv("LIMIT_BDIR") ? atof(getenv("LIMIT_BDIR")) : (int)(96 * 0.95);
  double a2aLimit  = getenv("LIMIT_A2A")  ? atof(getenv("LIMIT_A2A"))  : (int)(45 * 0.95);

  // Run unidirectional read from CPU to GPU
  printf("Testing unidirectional reads from CPU ");
  {
    std::vector<std::pair<int, double>> fails;
    for (int gpuId = 0; gpuId < ev.numGpuDevices; gpuId++) {
      printf("."); fflush(stdout);

      // Single GFX-executed Transfer reading from the closest NUMA node (no dst)
      std::vector<Transfer> transfers(1);
      Transfer& t = transfers[0];
      t.exeType = EXE_GPU_GFX;
      t.exeIndex = gpuId;
      t.numBytes = 64*1024*1024;
      t.numBytesActual = 64*1024*1024;
      t.numSrcs = 1;
      t.srcType.push_back(MEM_CPU);
      t.srcIndex.push_back(GetClosestNumaNode(gpuId));
      t.numDsts = 0;
      t.dstType.clear();
      t.dstIndex.clear();

      // Sweep a small range of CU counts; pass as soon as any count hits the limit
      bool passed = false;
      double bestResult = 0;
      for (int cu = 7; cu <= 10; cu++) {
        t.numSubExecs = cu;
        ExecuteTransfersImpl(ev, transfers);  // result recorded in t.transferBandwidth
        bestResult = std::max(bestResult, t.transferBandwidth);
        if (t.transferBandwidth >= udirLimit) {
          passed = true;
          break;
        }
      }
      if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
    }
    if (fails.size() == 0) {
      printf("PASS\n");
    } else {
      hasFail = true;
      printf("FAIL (%zu test(s))\n", fails.size());
      for (auto const& p : fails) {
        printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
      }
    }
  }

  // Run unidirectional write from GPU to CPU
  printf("Testing unidirectional writes to CPU ");
  {
    std::vector<std::pair<int, double>> fails;
    for (int gpuId = 0; gpuId < ev.numGpuDevices; gpuId++) {
      printf("."); fflush(stdout);

      // Single GFX-executed Transfer writing to the closest NUMA node (no src)
      std::vector<Transfer> transfers(1);
      Transfer& t = transfers[0];
      t.exeType = EXE_GPU_GFX;
      t.exeIndex = gpuId;
      t.numBytes = 64*1024*1024;
      t.numBytesActual = 64*1024*1024;
      t.numDsts = 1;
      t.dstType.push_back(MEM_CPU);
      t.dstIndex.push_back(GetClosestNumaNode(gpuId));
      t.numSrcs = 0;
      t.srcType.clear();
      t.srcIndex.clear();

      // Sweep a small range of CU counts; pass as soon as any count hits the limit
      bool passed = false;
      double bestResult = 0;
      for (int cu = 7; cu <= 10; cu++) {
        t.numSubExecs = cu;
        ExecuteTransfersImpl(ev, transfers);  // result recorded in t.transferBandwidth
        bestResult = std::max(bestResult, t.transferBandwidth);
        if (t.transferBandwidth >= udirLimit) {
          passed = true;
          break;
        }
      }
      if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
    }
    if (fails.size() == 0) {
      printf("PASS\n");
    } else {
      hasFail = true;
      printf("FAIL (%zu test(s))\n", fails.size());
      for (auto const& p : fails) {
        printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
      }
    }
  }

  // Run bidirectional tests (simultaneous read + write against closest NUMA node)
  printf("Testing bidirectional reads + writes ");
  {
    std::vector<std::pair<int, double>> fails;
    for (int gpuId = 0; gpuId < ev.numGpuDevices; gpuId++) {
      printf("."); fflush(stdout);

      // Transfer 0 reads from CPU, Transfer 1 writes to CPU; both on same GPU
      std::vector<Transfer> transfers(2);
      Transfer& t0 = transfers[0];
      Transfer& t1 = transfers[1];

      t0.exeType = EXE_GPU_GFX;
      t0.exeIndex = gpuId;
      t0.numBytes = 64*1024*1024;
      t0.numBytesActual = 64*1024*1024;
      t0.numSrcs = 1;
      t0.srcType.push_back(MEM_CPU);
      t0.srcIndex.push_back(GetClosestNumaNode(gpuId));
      t0.numDsts = 0;
      t0.dstType.clear();
      t0.dstIndex.clear();

      t1.exeType = EXE_GPU_GFX;
      t1.exeIndex = gpuId;
      t1.numBytes = 64*1024*1024;
      t1.numBytesActual = 64*1024*1024;
      t1.numDsts = 1;
      t1.dstType.push_back(MEM_CPU);
      t1.dstIndex.push_back(GetClosestNumaNode(gpuId));
      t1.numSrcs = 0;
      t1.srcType.clear();
      t1.srcIndex.clear();

      // Sweep CU counts; pass criteria is the combined (read + write) bandwidth
      bool passed = false;
      double bestResult = 0;
      for (int cu = 7; cu <= 10; cu++) {
        t0.numSubExecs = cu;
        t1.numSubExecs = cu;
        ExecuteTransfersImpl(ev, transfers);
        double sum = t0.transferBandwidth + t1.transferBandwidth;
        bestResult = std::max(bestResult, sum);
        if (sum >= bdirLimit) {
          passed = true;
          break;
        }
      }
      if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
    }
    if (fails.size() == 0) {
      printf("PASS\n");
    } else {
      hasFail = true;
      printf("FAIL (%zu test(s))\n", fails.size());
      for (auto const& p : fails) {
        printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, bdirLimit);
      }
    }
  }

  // Run XGMI tests: every GPU copies to every other GPU concurrently
  printf("Testing all-to-all XGMI copies "); fflush(stdout);
  {
    ev.gfxUnroll = 2;
    std::vector<Transfer> transfers;
    for (int i = 0; i < ev.numGpuDevices; i++) {
      for (int j = 0; j < ev.numGpuDevices; j++) {
        if (i == j) continue;
        // Fine-grained GPU-to-GPU copy executed by the source GPU with 8 CUs
        Transfer t;
        t.exeType = EXE_GPU_GFX;
        t.exeIndex = i;
        t.numBytes = t.numBytesActual = 64*1024*1024;
        t.numSrcs = 1;
        t.numDsts = 1;
        t.numSubExecs = 8;
        t.srcType.push_back(MEM_GPU_FINE);
        t.dstType.push_back(MEM_GPU_FINE);
        t.srcIndex.push_back(i);
        t.dstIndex.push_back(j);
        transfers.push_back(t);
      }
    }
    ExecuteTransfersImpl(ev, transfers);  // per-link results recorded in each Transfer

    // Check each GPU-GPU link against the per-link limit.
    // transferIdx tracks the (i,j) ordering used when building 'transfers' above
    std::vector<std::pair<std::pair<int,int>, double>> fails;
    int transferIdx = 0;
    for (int i = 0; i < ev.numGpuDevices; i++) {
      printf("."); fflush(stdout);
      for (int j = 0; j < ev.numGpuDevices; j++) {
        if (i == j) continue;
        Transfer const& t = transfers[transferIdx];
        if (t.transferBandwidth < a2aLimit) {
          fails.push_back(std::make_pair(std::make_pair(i,j), t.transferBandwidth));
        }
        transferIdx++;
      }
    }
    if (fails.size() == 0) {
      printf("PASS\n");
    } else {
      hasFail = true;
      printf("FAIL (%zu test(s))\n", fails.size());
      for (auto const& p : fails) {
        printf(" GPU %02d to GPU %02d: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first.first, p.first.second, p.second, a2aLimit);
      }
    }
  }

  exit(hasFail ? 1 : 0);
#endif
}
......@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.50"
#define TB_VERSION "1.51"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
......@@ -84,9 +84,12 @@ public:
int gfxUnroll; // GFX-kernel unroll factor
int gfxWaveOrder; // GFX-kernel wavefront ordering
int hideEnv; // Skip printing environment variable
int minNumVarSubExec; // Minimum # of subexecutors to use for variable subExec Transfers
int maxNumVarSubExec; // Maximum # of subexecutors to use for variable subExec Transfers (0 to use device limit)
int numCpuDevices; // Number of CPU devices to use (defaults to # NUMA nodes detected)
int numGpuDevices; // Number of GPU devices to use (defaults to # HIP devices detected)
int numIterations; // Number of timed iterations to perform. If negative, run for -numIterations seconds instead
int numSubIterations; // Number of subiterations to perform
int numWarmups; // Number of un-timed warmup iterations to perform
int outputToCsv; // Output in CSV format
int samplingFactor; // Affects how many different values of N are generated (when N set to 0)
......@@ -188,9 +191,12 @@ public:
gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll);
gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER" , 0);
hideEnv = GetEnvVar("HIDE_ENV" , 0);
minNumVarSubExec = GetEnvVar("MIN_VAR_SUBEXEC" , 1);
maxNumVarSubExec = GetEnvVar("MAX_VAR_SUBEXEC" , 0);
numCpuDevices = GetEnvVar("NUM_CPU_DEVICES" , numDetectedCpus);
numGpuDevices = GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus);
numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS);
numSubIterations = GetEnvVar("NUM_SUBITERATIONS" , 1);
numWarmups = GetEnvVar("NUM_WARMUPS" , DEFAULT_NUM_WARMUPS);
outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0);
samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR);
......@@ -299,6 +305,24 @@ public:
}
else fillPattern.clear();
// Figure out number of xccs per device
int maxNumXccs = 64;
xccIdsPerDevice.resize(numGpuDevices);
for (int i = 0; i < numGpuDevices; i++)
{
int* data;
HIP_CALL(hipSetDevice(i));
HIP_CALL(hipHostMalloc((void**)&data, maxNumXccs * sizeof(int)));
CollectXccIdsKernel<<<maxNumXccs, 1>>>(data);
HIP_CALL(hipDeviceSynchronize());
xccIdsPerDevice[i].clear();
for (int j = 0; j < maxNumXccs; j++)
xccIdsPerDevice[i].insert(data[j]);
HIP_CALL(hipHostFree(data));
}
// Check for CU mask
cuMask.clear();
char* cuMaskStr = getenv("CU_MASK");
......@@ -308,6 +332,7 @@ public:
printf("[WARN] CU_MASK is not supported in CUDA\n");
#else
std::vector<std::pair<int, int>> ranges;
int numXccs = (xccIdsPerDevice.size() > 0 ? xccIdsPerDevice[0].size() : 1);
int maxCU = 0;
char* token = strtok(cuMaskStr, ",");
while (token)
......@@ -330,36 +355,22 @@ public:
}
token = strtok(NULL, ",");
}
cuMask.resize(maxCU / 32 + 1, 0);
cuMask.resize(2 * numXccs, 0);
for (auto range : ranges)
{
for (int i = range.first; i <= range.second; i++)
{
cuMask[i / 32] |= (1 << (i % 32));
for (int x = 0; x < numXccs; x++)
{
int targetBit = i * numXccs + x;
cuMask[targetBit/32] |= (1<<(targetBit%32));
}
}
}
#endif
}
// Figure out number of xccs per device
int maxNumXccs = 64;
xccIdsPerDevice.resize(numGpuDevices);
for (int i = 0; i < numGpuDevices; i++)
{
int* data;
HIP_CALL(hipSetDevice(i));
HIP_CALL(hipHostMalloc((void**)&data, maxNumXccs * sizeof(int)));
CollectXccIdsKernel<<<maxNumXccs, 1>>>(data);
HIP_CALL(hipDeviceSynchronize());
xccIdsPerDevice[i].clear();
for (int j = 0; j < maxNumXccs; j++)
xccIdsPerDevice[i].insert(data[j]);
HIP_CALL(hipHostFree(data));
}
// Parse preferred XCC table (if provided
prefXccTable.resize(numGpuDevices);
for (int i = 0; i < numGpuDevices; i++)
......@@ -429,6 +440,11 @@ public:
printf("[ERROR] BLOCK_ORDER must be 0 (Sequential), 1 (Interleaved), or 2 (Random)\n");
exit(1);
}
if (minNumVarSubExec < 1)
{
printf("[ERROR] Minimum number of subexecutors for variable subexector transfers must be at least 1\n");
exit(1);
}
if (numWarmups < 0)
{
printf("[ERROR] NUM_WARMUPS must be set to a non-negative number\n");
......@@ -524,8 +540,10 @@ public:
// Determine how many CPUs exit per NUMA node (to avoid executing on NUMA without CPUs)
numCpusPerNuma.resize(numDetectedCpus);
int const totalCpus = numa_num_configured_cpus();
for (int i = 0; i < totalCpus; i++)
numCpusPerNuma[numa_node_of_cpu(i)]++;
for (int i = 0; i < totalCpus; i++) {
int node = numa_node_of_cpu(i);
if (node >= 0) numCpusPerNuma[node]++;
}
// Build array of wall clock rates per GPU device
wallClockPerDeviceMhz.resize(numDetectedGpus);
......@@ -583,9 +601,12 @@ public:
printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on individual disjoint subarrays\n");
printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n");
printf(" HIDE_ENV - Hide environment variable value listing\n");
printf(" MIN_VAR_SUBEXEC - Minumum # of subexecutors to use for variable subExec Transfers\n");
printf(" MAX_VAR_SUBEXEC - Maximum # of subexecutors to use for variable subExec Transfers (0 for device limits)\n");
printf(" NUM_CPU_DEVICES=X - Restrict number of CPUs to X. May not be greater than # detected NUMA nodes\n");
printf(" NUM_GPU_DEVICES=X - Restrict number of GPUs to X. May not be greater than # detected HIP devices\n");
printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n");
printf(" NUM_SUBITERATIONS=S - Perform S sub-iteration(s) per iteration. Must be non-negative\n");
printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
......@@ -649,6 +670,12 @@ public:
gfxWaveOrder == 3 ? "Wavefront,CU,Unroll" :
gfxWaveOrder == 4 ? "CU,Unroll,Wavefront" :
"CU,Wavefront,Unroll")));
PRINT_EV("MIN_VAR_SUBEXEC", minNumVarSubExec,
std::string("Using at least ") + std::to_string(minNumVarSubExec) + " subexecutor(s) for variable subExec tranfers");
PRINT_EV("MAX_VAR_SUBEXEC", maxNumVarSubExec,
maxNumVarSubExec ?
std::string("Using at most ") + std::to_string(maxNumVarSubExec) + " subexecutor(s) for variable subExec tranfers" :
"Using up to maximum device subexecutors for variable subExec tranfers");
PRINT_EV("NUM_CPU_DEVICES", numCpuDevices,
std::string("Using ") + std::to_string(numCpuDevices) + " CPU devices");
PRINT_EV("NUM_GPU_DEVICES", numGpuDevices,
......@@ -656,6 +683,8 @@ public:
PRINT_EV("NUM_ITERATIONS", numIterations,
std::string("Running ") + std::to_string(numIterations > 0 ? numIterations : -numIterations) + " "
+ (numIterations > 0 ? " timed iteration(s)" : "seconds(s) per Test"));
PRINT_EV("NUM_SUBITERATIONS", numSubIterations,
std::string("Running ") + (numSubIterations == 0 ? "infinite" : std::to_string(numSubIterations)) + " subiterations");
PRINT_EV("NUM_WARMUPS", numWarmups,
std::string("Running " + std::to_string(numWarmups) + " warmup iteration(s) per Test"));
PRINT_EV("SHARED_MEM_BYTES", sharedMemBytes,
......@@ -828,36 +857,27 @@ public:
std::string GetCuMaskDesc() const
{
std::vector<std::pair<int, int>> runs;
int numXccs = (xccIdsPerDevice.size() > 0 ? xccIdsPerDevice[0].size() : 1);
bool inRun = false;
std::pair<int, int> curr;
int used = 0;
for (int i = 0; i < cuMask.size(); i++)
{
for (int j = 0; j < 32; j++)
{
if (cuMask[i] & (1 << j))
{
used++;
if (!inRun)
{
inRun = true;
curr.first = i * 32 + j;
}
for (int targetBit = 0; targetBit < cuMask.size() * 32; targetBit += numXccs) {
if (cuMask[targetBit/32] & (1 << (targetBit%32))) {
used++;
if (!inRun) {
inRun = true;
curr.first = targetBit / numXccs;
}
else
{
if (inRun)
{
inRun = false;
curr.second = i * 32 + j - 1;
runs.push_back(curr);
}
} else {
if (inRun) {
inRun = false;
curr.second = targetBit / numXccs - 1;
runs.push_back(curr);
}
}
}
if (inRun)
curr.second = cuMask.size() * 32 - 1;
curr.second = (cuMask.size() * 32) / numXccs - 1;
std::string result = "CUs used: (" + std::to_string(used) + ") ";
for (int i = 0; i < runs.size(); i++)
......
......@@ -174,7 +174,7 @@ template <> __device__ __forceinline__ float4 MemsetVal(){ return make
template <int BLOCKSIZE, int UNROLL>
__global__ void __launch_bounds__(BLOCKSIZE)
GpuReduceKernel(SubExecParam* params, int waveOrder)
GpuReduceKernel(SubExecParam* params, int waveOrder, int numSubIterations)
{
int64_t startCycle;
if (threadIdx.x == 0) startCycle = GetTimestamp();
......@@ -216,84 +216,88 @@ __global__ void __launch_bounds__(BLOCKSIZE)
case 5: /* C,W,U */ teamStride = 1; waveStride = nTeams; unrlStride = nTeams * nWaves; teamStride2 = 1; waveStride2 = nTeams; break;
}
// First loop: Each wavefront in the team works on UNROLL float4s per thread
size_t const loop1Stride = nTeams * nWaves * UNROLL * warpSize;
size_t const loop1Limit = numFloat4 / loop1Stride * loop1Stride;
{
float4 val[UNROLL];
if (numSrcs == 0)
int subIterations = 0;
while (1) {
// First loop: Each wavefront in the team works on UNROLL float4s per thread
size_t const loop1Stride = nTeams * nWaves * UNROLL * warpSize;
size_t const loop1Limit = numFloat4 / loop1Stride * loop1Stride;
{
#pragma unroll
for (int u = 0; u < UNROLL; u++)
val[u] = MemsetVal<float4>();
}
for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx; idx < loop1Limit; idx += loop1Stride)
{
// Read sources into memory and accumulate in registers
if (numSrcs)
{
for (int u = 0; u < UNROLL; u++)
val[u] = srcFloat4[0][idx + u * unrlStride * warpSize];
for (int s = 1; s < numSrcs; s++)
for (int u = 0; u < UNROLL; u++)
val[u] += srcFloat4[s][idx + u * unrlStride * warpSize];
}
// Write accumulation to all outputs
for (int d = 0; d < numDsts; d++)
{
float4 val[UNROLL];
if (numSrcs == 0) {
#pragma unroll
for (int u = 0; u < UNROLL; u++)
dstFloat4[d][idx + u * unrlStride * warpSize] = val[u];
val[u] = MemsetVal<float4>();
}
}
}
// Second loop: Deal with remaining float4s
{
if (loop1Limit < numFloat4)
{
float4 val;
if (numSrcs == 0) val = MemsetVal<float4>();
size_t const loop2Stride = nTeams * nWaves * warpSize;
for (size_t idx = loop1Limit + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < numFloat4; idx += loop2Stride)
for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx; idx < loop1Limit; idx += loop1Stride)
{
// Read sources into memory and accumulate in registers
if (numSrcs)
{
val = srcFloat4[0][idx];
for (int u = 0; u < UNROLL; u++)
val[u] = srcFloat4[0][idx + u * unrlStride * warpSize];
for (int s = 1; s < numSrcs; s++)
val += srcFloat4[s][idx];
for (int u = 0; u < UNROLL; u++)
val[u] += srcFloat4[s][idx + u * unrlStride * warpSize];
}
// Write accumulation to all outputs
for (int d = 0; d < numDsts; d++)
dstFloat4[d][idx] = val;
{
#pragma unroll
for (int u = 0; u < UNROLL; u++)
dstFloat4[d][idx + u * unrlStride * warpSize] = val[u];
}
}
}
}
// Third loop; Deal with remaining floats
{
if (numFloat4 * 4 < p.N)
// Second loop: Deal with remaining float4s
{
float val;
if (numSrcs == 0) val = MemsetVal<float>();
size_t const loop3Stride = nTeams * nWaves * warpSize;
for( size_t idx = numFloat4 * 4 + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride)
if (loop1Limit < numFloat4)
{
if (numSrcs)
float4 val;
if (numSrcs == 0) val = MemsetVal<float4>();
size_t const loop2Stride = nTeams * nWaves * warpSize;
for (size_t idx = loop1Limit + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < numFloat4; idx += loop2Stride)
{
val = p.src[0][idx];
for (int s = 1; s < numSrcs; s++)
val += p.src[s][idx];
if (numSrcs)
{
val = srcFloat4[0][idx];
for (int s = 1; s < numSrcs; s++)
val += srcFloat4[s][idx];
}
for (int d = 0; d < numDsts; d++)
dstFloat4[d][idx] = val;
}
}
}
for (int d = 0; d < numDsts; d++)
p.dst[d][idx] = val;
// Third loop; Deal with remaining floats
{
if (numFloat4 * 4 < p.N)
{
float val;
if (numSrcs == 0) val = MemsetVal<float>();
size_t const loop3Stride = nTeams * nWaves * warpSize;
for( size_t idx = numFloat4 * 4 + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride)
{
if (numSrcs)
{
val = p.src[0][idx];
for (int s = 1; s < numSrcs; s++)
val += p.src[s][idx];
}
for (int d = 0; d < numDsts; d++)
p.dst[d][idx] = val;
}
}
}
if (++subIterations == numSubIterations) break;
}
// Wait for all threads to finish
......@@ -308,7 +312,7 @@ __global__ void __launch_bounds__(BLOCKSIZE)
}
}
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int);
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int);
#define GPU_KERNEL_UNROLL_DECL(BLOCKSIZE) \
{GpuReduceKernel<BLOCKSIZE, 1>, \
......
......@@ -158,6 +158,25 @@ struct ExecutorInfo
double totalTime;
};
// Aggregated per-Executor results for a single test run.
// One ExeResult is produced for each (ExeType, index) Executor that
// participated (see TestResults::exeResults below in this header).
struct ExeResult
{
  double bandwidthGbs;           // Effective bandwidth achieved by this Executor, in GB/s
  double durationMsec;           // Wall-clock duration of this Executor's work, in milliseconds
  double sumBandwidthGbs;        // Sum of the individual Transfer bandwidths for this Executor, in GB/s
  size_t totalBytes;             // Total number of bytes moved by this Executor
  std::vector<int> transferIdx;  // Indices into the Transfer list identifying which Transfers
                                 // this Executor ran — presumably indices into the vector passed
                                 // to ExecuteTransfersImpl; verify against caller
};
// Overall results for one invocation of ExecuteTransfersImpl: whole-run
// totals plus a per-Executor breakdown keyed by (ExeType, device index).
struct TestResults
{
  size_t numTimedIterations;     // Number of timed iterations actually performed
  size_t totalBytesTransferred;  // Total bytes moved across all Transfers/Executors
  double totalBandwidthCpu;      // Aggregate bandwidth as measured by CPU timing, in GB/s
  double totalDurationMsec;      // Total wall-clock duration of the timed run, in milliseconds
  double overheadMsec;           // Non-transfer overhead time (e.g. launch/sync), in milliseconds
                                 // — NOTE(review): exact accounting not visible here; confirm
  // Per-Executor results; the key pair is (executor type, executor index)
  // — equivalent to the Executor typedef declared below in this header
  std::map<std::pair<ExeType, int>, ExeResult> exeResults;
};
typedef std::pair<ExeType, int> Executor;
typedef std::map<Executor, ExecutorInfo> TransferMap;
......@@ -179,7 +198,8 @@ void ParseTransfers(EnvVars const& ev, char* line, std::vector<Transfer>& transf
void ExecuteTransfers(EnvVars const& ev, int const testNum, size_t const N,
std::vector<Transfer>& transfers, bool verbose = true,
double* totalBandwidthCpu = nullptr);
TestResults ExecuteTransfersImpl(EnvVars const& ev, std::vector<Transfer>& transfers);
void ReportResults(EnvVars const& ev, std::vector<Transfer> const& transfers, TestResults const results);
void EnablePeerAccess(int const deviceId, int const peerDeviceId);
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr);
void DeallocateMemory(MemType memType, void* memPtr, size_t const size = 0);
......@@ -192,6 +212,7 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
void RunSchmooBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs);
void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus);
void RunParallelCopyBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus);
void RunHealthCheck(EnvVars ev);
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment