Merge branch 'ROCm:develop' into swraw/docs

513ce1e3 · Swati Rawat · GitHub · ebad0b36 · fb713d03 · 513ce1e3
Unverified Commit 513ce1e3 authored Dec 11, 2024 by Swati Rawat Committed by GitHub Dec 11, 2024
8 changed files
--- a/src/client/Presets/Sweep.hpp
+++ b/src/client/Presets/Sweep.hpp
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ * Writes one test's Transfers to a sweep log file
+ *
+ * Output consists of a "# Test <N>" comment line followed by a single line holding the
+ * negated Transfer count and one "(SRC->EXE->DST numSubExecs numBytes)" entry per Transfer.
+ * The stream is flushed before returning.
+ *
+ * @param[in] fp         Open, writable log file
+ * @param[in] testNum    Test number written to the comment line
+ * @param[in] transfers  Transfers that make up this test
+ */
+void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers)
+{
+  fprintf(fp, "# Test %d\n", testNum);
+  fprintf(fp, "%d", -1 * (int)transfers.size());
+  for (auto const& transfer : transfers)
+  {
+    // numBytes is a size_t, so it must be printed with %zu (not %lu)
+    fprintf(fp, " (%s->%c%d->%s %d %zu)",
+            MemDevicesToStr(transfer.srcs).c_str(),
+            ExeTypeStr[transfer.exeDevice.exeType], transfer.exeDevice.exeIndex,
+            MemDevicesToStr(transfer.dsts).c_str(),
+            transfer.numSubExecs,
+            transfer.numBytes);
+  }
+  fprintf(fp, "\n");
+  fflush(fp);
+}
+
+/**
+ * Sweep preset: repeatedly runs sets of simultaneous Transfers chosen from all permitted
+ * (SRC memory, Executor, DST memory) triplets
+ *
+ * - presetName "rsweep" picks a random number of randomly chosen triplets for every test
+ * - any other presetName walks through every combination of M triplets, for M starting at
+ *   SWEEP_MIN and increasing up to the maximum number of simultaneous Transfers
+ *
+ * Behaviour is controlled by the environment variables collected at the top of the function.
+ * Each test is logged to "lastSweep.cfg" via LogTransfers() before it is executed.
+ * The sweep stops when the ordered sweep completes or when SWEEP_TEST_LIMIT / SWEEP_TIME_LIMIT
+ * is reached.  Invalid settings, and failed tests unless CONTINUE_ON_ERROR is set, terminate
+ * the process via exit(1).
+ *
+ * @param[in] ev                   Environment settings (displayed, then converted to ConfigOptions)
+ * @param[in] numBytesPerTransfer  Bytes per Transfer (upper bound when SWEEP_RAND_BYTES is set)
+ * @param[in] presetName           Preset name; "rsweep" selects the random sweep
+ */
+void SweepPreset(EnvVars&           ev,
+                 size_t      const  numBytesPerTransfer,
+                 std::string const  presetName)
+{
+  bool const isRandom = (presetName == "rsweep");
+
+  int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
+  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
+
+  // Collect env vars and set defaults
+  int         continueOnErr  = EnvVars::GetEnvVar("CONTINUE_ON_ERROR"   , 0);
+  int         numCpuDevices  = EnvVars::GetEnvVar("NUM_CPU_DEVICES"     , numDetectedCpus);
+  int         numCpuSubExecs = EnvVars::GetEnvVar("NUM_CPU_SE"          , 4);
+  int         numGpuDevices  = EnvVars::GetEnvVar("NUM_GPU_DEVICES"     , numDetectedGpus);
+  int         numGpuSubExecs = EnvVars::GetEnvVar("NUM_GPU_SE"          , 4);
+  std::string sweepDst       = EnvVars::GetEnvVar("SWEEP_DST"           , "CG");
+  std::string sweepExe       = EnvVars::GetEnvVar("SWEEP_EXE"           , "CDG");
+  int         sweepMax       = EnvVars::GetEnvVar("SWEEP_MAX"           , 24);
+  int         sweepMin       = EnvVars::GetEnvVar("SWEEP_MIN"           , 1);
+  int         sweepRandBytes = EnvVars::GetEnvVar("SWEEP_RAND_BYTES"    , 0);
+  int         sweepSeed      = EnvVars::GetEnvVar("SWEEP_SEED"          , time(NULL));
+  std::string sweepSrc       = EnvVars::GetEnvVar("SWEEP_SRC"           , "CG");
+  int         sweepTestLimit = EnvVars::GetEnvVar("SWEEP_TEST_LIMIT"    , 0);
+  int         sweepTimeLimit = EnvVars::GetEnvVar("SWEEP_TIME_LIMIT"    , 0);
+  int         sweepXgmiMin   = EnvVars::GetEnvVar("SWEEP_XGMI_MIN"      , 0);
+  int         sweepXgmiMax   = EnvVars::GetEnvVar("SWEEP_XGMI_MAX"      , -1);
+
+  // Random engine lives on the stack so that it is released on every return path
+  std::default_random_engine generator(sweepSeed);
+
+  // Display env var settings
+  ev.DisplayEnvVars();
+  if (!ev.hideEnv) {
+    int outputToCsv = ev.outputToCsv;
+    if (!outputToCsv) printf("[Sweep Related]\n");
+    ev.Print("CONTINUE_ON_ERROR", continueOnErr,    continueOnErr ? "Continue on mismatch error" : "Stop after first error");
+    ev.Print("NUM_CPU_DEVICES",   numCpuDevices,    "Using %d CPUs", numCpuDevices);
+    ev.Print("NUM_CPU_SE",        numCpuSubExecs,   "Using %d CPU threads per CPU executed Transfer", numCpuSubExecs);
+    ev.Print("NUM_GPU_DEVICES",   numGpuDevices,    "Using %d GPUs", numGpuDevices);
+    ev.Print("NUM_GPU_SE",        numGpuSubExecs,   "Using %d subExecutors/CUs per GPU executed Transfer", numGpuSubExecs);
+    ev.Print("SWEEP_DST",         sweepDst.c_str(), "Destination Memory Types to sweep");
+    ev.Print("SWEEP_EXE",         sweepExe.c_str(), "Executor Types to sweep");
+    ev.Print("SWEEP_MAX",         sweepMax,         "Max simultaneous transfers (0 = no limit)");
+    ev.Print("SWEEP_MIN",         sweepMin,         "Min simultaenous transfers");
+    ev.Print("SWEEP_RAND_BYTES",  sweepRandBytes,   "Using %s number of bytes per Transfer", (sweepRandBytes ? "random" : "constant"));
+    ev.Print("SWEEP_SEED",        sweepSeed,        "Random seed set to %d", sweepSeed);
+    ev.Print("SWEEP_SRC",         sweepSrc.c_str(), "Source Memory Types to sweep");
+    ev.Print("SWEEP_TEST_LIMIT",  sweepTestLimit,   "Max number of tests to run during sweep (0 = no limit)");
+    ev.Print("SWEEP_TIME_LIMIT",  sweepTimeLimit,   "Max number of seconds to run sweep for  (0 = no limit)");
+    ev.Print("SWEEP_XGMI_MAX",    sweepXgmiMax,     "Max number of XGMI hops for Transfers  (-1 = no limit)");
+    ev.Print("SWEEP_XGMI_MIN",    sweepXgmiMin,     "Min number of XGMI hops for Transfers");
+    printf("\n");
+  }
+
+  // Validate env vars
+  // A test needs at least one Transfer (a negative value would also produce an invalid bitmask size)
+  if (sweepMin < 1) {
+    printf("[ERROR] SWEEP_MIN must be at least 1\n");
+    exit(1);
+  }
+
+  for (auto ch : sweepSrc) {
+    if (!strchr(MemTypeStr, ch)) {
+      printf("[ERROR] Unrecognized memory type '%c' specified for sweep source\n", ch);
+      exit(1);
+    }
+    // First and last occurrence differ only if the character appears more than once
+    if (strchr(sweepSrc.c_str(), ch) != strrchr(sweepSrc.c_str(), ch)) {
+      printf("[ERROR] Duplicate memory type '%c' specified for sweep source\n", ch);
+      exit(1);
+    }
+  }
+
+  for (auto ch : sweepDst) {
+    if (!strchr(MemTypeStr, ch)) {
+      printf("[ERROR] Unrecognized memory type '%c' specified for sweep destination\n", ch);
+      exit(1);
+    }
+    if (strchr(sweepDst.c_str(), ch) != strrchr(sweepDst.c_str(), ch)) {
+      printf("[ERROR] Duplicate memory type '%c' specified for sweep destination\n", ch);
+      exit(1);
+    }
+  }
+
+  for (auto ch : sweepExe) {
+    if (!strchr(ExeTypeStr, ch)) {
+      printf("[ERROR] Unrecognized executor type '%c' specified for sweep executor\n", ch);
+      exit(1);
+    }
+    if (strchr(sweepExe.c_str(), ch) != strrchr(sweepExe.c_str(), ch)) {
+      printf("[ERROR] Duplicate executor type '%c' specified for sweep executor\n", ch);
+      exit(1);
+    }
+  }
+
+  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
+  TransferBench::TestResults results;
+
+  // Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets)
+  std::vector<ExeDevice> exeList;
+  for (auto exe : sweepExe) {
+    ExeType exeType;
+    CharToExeType(exe, exeType);
+    if (IsGpuExeType(exeType)) {
+      for (int exeIndex = 0; exeIndex < numGpuDevices; ++exeIndex)
+        exeList.push_back({exeType, exeIndex});
+    }
+    else if (IsCpuExeType(exeType)) {
+      for (int exeIndex = 0; exeIndex < numCpuDevices; ++exeIndex) {
+        // Skip NUMA nodes that have no CPUs (e.g. CXL)
+        if (TransferBench::GetNumSubExecutors({EXE_CPU, exeIndex}) == 0) continue;
+        exeList.push_back({exeType, exeIndex});
+      }
+    }
+  }
+  int numExes = exeList.size();
+
+  std::vector<MemDevice> srcList;
+  for (auto src : sweepSrc) {
+    MemType srcType;
+    CharToMemType(src, srcType);
+    // NULL memory has a single entry; otherwise one entry per GPU or per CPU device
+    int const numDevices = (srcType == MEM_NULL) ? 1 : IsGpuMemType(srcType) ? numGpuDevices : numCpuDevices;
+
+    for (int srcIndex = 0; srcIndex < numDevices; ++srcIndex)
+      srcList.push_back({srcType, srcIndex});
+  }
+  int numSrcs = srcList.size();
+
+  std::vector<MemDevice> dstList;
+  for (auto dst : sweepDst) {
+    MemType dstType;
+    CharToMemType(dst, dstType);
+    int const numDevices = (dstType == MEM_NULL) ? 1 : IsGpuMemType(dstType) ? numGpuDevices : numCpuDevices;
+
+    for (int dstIndex = 0; dstIndex < numDevices; ++dstIndex)
+      dstList.push_back({dstType, dstIndex});
+  }
+  int numDsts = dstList.size();
+
+  // Build array of possibilities, respecting any additional restrictions (e.g. XGMI hop count)
+  struct TransferInfo
+  {
+    MemDevice srcMem;
+    ExeDevice exeDevice;
+    MemDevice dstMem;
+  };
+
+  // If either XGMI minimum is non-zero, or XGMI maximum is specified and non-zero then both links must be XGMI
+  bool const useXgmiOnly = (sweepXgmiMin > 0 || sweepXgmiMax > 0);
+
+  std::vector<TransferInfo> possibleTransfers;
+  TransferInfo tinfo;
+  for (int i = 0; i < numExes; ++i) {
+    // Skip CPU executors if XGMI link must be used
+    if (useXgmiOnly && !IsGpuExeType(exeList[i].exeType)) continue;
+    tinfo.exeDevice = exeList[i];
+
+    for (int j = 0; j < numSrcs; ++j) {
+      // Link state belongs to this (EXE, SRC) pair only, so it is reset for every SRC
+      // (otherwise the hop count / XGMI flag of a previous SRC would leak into this one)
+      bool isXgmiSrc  = false;
+      int  numHopsSrc = 0;
+      if (IsGpuExeType(exeList[i].exeType) && IsGpuMemType(srcList[j].memType)) {
+        if (exeList[i].exeIndex != srcList[j].memIndex) {
+#if defined(__NVCC__)
+          isXgmiSrc = false;
+#else
+          uint32_t exeToSrcLinkType, exeToSrcHopCount;
+          HIP_CALL(hipExtGetLinkTypeAndHopCount(exeList[i].exeIndex,
+                                                srcList[j].memIndex,
+                                                &exeToSrcLinkType,
+                                                &exeToSrcHopCount));
+          isXgmiSrc = (exeToSrcLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
+          if (isXgmiSrc) numHopsSrc = exeToSrcHopCount;
+#endif
+        } else {
+          // Local memory counts as XGMI with zero hops
+          isXgmiSrc = true;
+          numHopsSrc = 0;
+        }
+
+        // Skip this SRC if it is not XGMI but only XGMI links may be used
+        if (useXgmiOnly && !isXgmiSrc) continue;
+
+        // Skip this SRC if XGMI distance is already past limit
+        if (sweepXgmiMax >= 0 && isXgmiSrc && numHopsSrc > sweepXgmiMax) continue;
+      } else if (srcList[j].memType != MEM_NULL && useXgmiOnly) continue;
+
+      tinfo.srcMem = srcList[j];
+
+      for (int k = 0; k < numDsts; ++k) {
+        // Link state belongs to this (EXE, DST) pair only, so it is reset for every DST
+        bool isXgmiDst  = false;
+        int  numHopsDst = 0;
+        if (IsGpuExeType(exeList[i].exeType) && IsGpuMemType(dstList[k].memType)) {
+          if (exeList[i].exeIndex != dstList[k].memIndex) {
+#if defined(__NVCC__)
+            isXgmiDst = false;
+#else
+            uint32_t exeToDstLinkType, exeToDstHopCount;
+            HIP_CALL(hipExtGetLinkTypeAndHopCount(exeList[i].exeIndex,
+                                                  dstList[k].memIndex,
+                                                  &exeToDstLinkType,
+                                                  &exeToDstHopCount));
+            isXgmiDst = (exeToDstLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
+            if (isXgmiDst) numHopsDst = exeToDstHopCount;
+#endif
+          } else {
+            isXgmiDst = true;
+            numHopsDst = 0;
+          }
+        }
+
+        // Skip this DST if it is not XGMI but only XGMI links may be used
+        if (dstList[k].memType != MEM_NULL && useXgmiOnly && !isXgmiDst) continue;
+
+        // Skip this DST if total XGMI distance (SRC + DST) is less than min limit
+        if (sweepXgmiMin > 0 && (numHopsSrc + numHopsDst < sweepXgmiMin)) continue;
+
+        // Skip this DST if total XGMI distance (SRC + DST) is greater than max limit
+        if (sweepXgmiMax >= 0 && (numHopsSrc + numHopsDst) > sweepXgmiMax) continue;
+
+#if defined(__NVCC__)
+        // Skip CPU executors on GPU memory on NVIDIA platform
+        // (j indexes srcList, k indexes dstList)
+        if (IsCpuExeType(exeList[i].exeType) && (IsGpuMemType(srcList[j].memType) || IsGpuMemType(dstList[k].memType)))
+          continue;
+#endif
+
+        tinfo.dstMem = dstList[k];
+
+        // Skip if there is no src and dst
+        if (tinfo.srcMem.memType == MEM_NULL && tinfo.dstMem.memType == MEM_NULL) continue;
+
+        possibleTransfers.push_back(tinfo);
+      }
+    }
+  }
+
+  int const numPossible = (int)possibleTransfers.size();
+
+  // SWEEP_MAX of 0 means no limit.  A test can never hold more Transfers than there are
+  // candidates, so clamp to avoid re-running the identical full set for larger values
+  int maxParallelTransfers = (sweepMax == 0 ? numPossible : sweepMax);
+  if (maxParallelTransfers > numPossible) maxParallelTransfers = numPossible;
+
+  if (sweepMin > numPossible) {
+    printf("No valid test configurations exist\n");
+    return;
+  }
+
+  if (ev.outputToCsv) {
+    printf("\nTest#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),"
+           "ExeToSrcLinkType,ExeToDstLinkType,SrcAddr,DstAddr\n");
+  }
+
+  int numTestsRun = 0;
+  int M = sweepMin;
+
+  // uniform_int_distribution requires (lower <= upper), so guard both upper bounds
+  size_t const maxFloats    = numBytesPerTransfer / sizeof(float);
+  int    const randSizeMax  = (maxFloats < 1) ? 1 : (int)maxFloats;
+  int    const randCountMax = (maxParallelTransfers < sweepMin) ? sweepMin : maxParallelTransfers;
+  std::uniform_int_distribution<int> randSize(1, randSizeMax);
+  std::uniform_int_distribution<int> distribution(sweepMin, randCountMax);
+
+  // Log sweep to configuration file
+  FILE *fp = fopen("lastSweep.cfg", "w");
+  if (!fp) {
+    printf("[ERROR] Unable to open lastSweep.cfg.  Check permissions\n");
+    exit(1);
+  }
+
+  // Create bitmask of numPossible triplets, of which M will be chosen
+  std::string bitmask(M, 1);  bitmask.resize(numPossible, 0);
+  auto cpuStart = std::chrono::high_resolution_clock::now();
+  while (1)  {
+    if (isRandom) {
+      // Pick random number of simultaneous transfers to execute
+      // NOTE: This currently skews distribution due to some #s having more possibilities than others
+      M = distribution(generator);
+
+      // Generate a random bitmask
+      for (int i = 0; i < numPossible; i++)
+        bitmask[i] = (i < M) ? 1 : 0;
+      std::shuffle(bitmask.begin(), bitmask.end(), generator);
+    }
+
+    // Convert bitmask to list of Transfers
+    std::vector<Transfer> transfers;
+    for (int value = 0; value < numPossible; ++value) {
+      if (bitmask[value]) {
+        // Convert integer value to (SRC->EXE->DST) triplet
+        // MEM_NULL entries stand for "no source" / "no destination" and are not added
+        Transfer transfer;
+        if (possibleTransfers[value].srcMem.memType != MEM_NULL)
+          transfer.srcs.push_back(possibleTransfers[value].srcMem);
+        transfer.exeDevice      = possibleTransfers[value].exeDevice;
+        if (possibleTransfers[value].dstMem.memType != MEM_NULL)
+          transfer.dsts.push_back(possibleTransfers[value].dstMem);
+        transfer.exeSubIndex    = -1;
+        transfer.numSubExecs    = IsGpuExeType(transfer.exeDevice.exeType) ? numGpuSubExecs : numCpuSubExecs;
+        transfer.numBytes       = sweepRandBytes ? randSize(generator) * sizeof(float) : numBytesPerTransfer;
+        transfers.push_back(transfer);
+      }
+    }
+
+    LogTransfers(fp, ++numTestsRun, transfers);
+
+    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
+      PrintErrors(results.errResults);
+      if (!continueOnErr) exit(1);
+    } else {
+      PrintResults(ev, numTestsRun, transfers, results);
+    }
+
+    // Check for test limit (a limit of 0 never matches because numTestsRun is at least 1)
+    if (numTestsRun == sweepTestLimit) {
+      printf("Test limit reached\n");
+      break;
+    }
+
+    // Check for time limit
+    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
+    double totalCpuTime = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
+    if (sweepTimeLimit && totalCpuTime > sweepTimeLimit) {
+      printf("Time limit exceeded\n");
+      break;
+    }
+
+    // Increment bitmask if not random sweep
+    // prev_permutation returns false once all arrangements of M set bits have been visited
+    if (!isRandom && !std::prev_permutation(bitmask.begin(), bitmask.end())) {
+      M++;
+      // Check for completion
+      if (M > maxParallelTransfers) {
+        printf("Sweep complete\n");
+        break;
+      }
+      for (int i = 0; i < numPossible; i++)
+        bitmask[i] = (i < M) ? 1 : 0;
+    }
+  }
+  fclose(fp);
+}
--- a/src/client/Topology.hpp
+++ b/src/client/Topology.hpp
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+
+#include "TransferBench.hpp"
+
+/**
+ * Maps a position in the list of allowed NUMA nodes to the actual NUMA node number
+ *
+ * The list is built on first use from the nodes reported by numa_get_mems_allowed(),
+ * in increasing node order, and cached for subsequent calls.
+ *
+ * @param[in] origIdx  0-based position in the list of allowed NUMA nodes
+ * @returns NUMA node number stored at that position
+ * @note No bounds check is performed on origIdx
+ */
+static int RemappedCpuIndex(int origIdx)
+{
+  static std::vector<int> remappingCpu;
+
+  // Build CPU remapping on first use
+  // Skip numa nodes that are not configured
+  if (remappingCpu.empty()) {
+    // numa_get_mems_allowed() hands back a newly allocated bitmask on each call, so query
+    // it once and release it afterwards rather than calling it (and leaking) per node
+    struct bitmask* allowedNodes = numa_get_mems_allowed();
+    if (allowedNodes) {
+      int const maxNode = numa_max_node();
+      for (int node = 0; node <= maxNode; node++)
+        if (numa_bitmask_isbitset(allowedNodes, node))
+          remappingCpu.push_back(node);
+      numa_bitmask_free(allowedNodes);
+    }
+  }
+  return remappingCpu[origIdx];
+}
+
+/**
+ * Prints the detected CPU / GPU topology to stdout
+ *
+ * First a NUMA table is printed: pairwise NUMA distances, the number of CPU cores per node
+ * and the GPUs whose closest NUMA node is that node.  The GPU section follows.  When built
+ * with NVCC only the GPU names are listed; otherwise a table of inter-GPU link type and hop
+ * count is printed together with PCIe bus ID, subexecutor count, closest NUMA node and the
+ * number of DMA / GFX executor sub-indices.
+ *
+ * @param[in] outputToCsv  When true, columns are separated by ',' and decorative rules and
+ *                         the architecture header row are omitted; otherwise '|' is used
+ */
+void DisplayTopology(bool outputToCsv)
+{
+  int  const cpuCount = TransferBench::GetNumExecutors(EXE_CPU);
+  int  const gpuCount = TransferBench::GetNumExecutors(EXE_GPU_GFX);
+  char const delim    = outputToCsv ? ',' : '|';
+
+  // Summary of what was detected
+  if (!outputToCsv) {
+    printf("\nDetected Topology:\n");
+    printf("==================\n");
+    printf("  %d configured CPU NUMA node(s) [%d total]\n", cpuCount, numa_max_node() + 1);
+    printf("  %d GPU device(s)\n", gpuCount);
+  } else {
+    printf("NumCpus,%d\n", cpuCount);
+    printf("NumGpus,%d\n", gpuCount);
+  }
+
+  // CPU table: column headings
+  printf("\n            %c", delim);
+  for (int col = 0; col < cpuCount; ++col)
+    printf("NUMA %02d%c", col, delim);
+  printf(" #Cpus %c Closest GPU(s)\n", delim);
+
+  // Horizontal rule: one segment per NUMA column plus one for the #Cpus column
+  if (!outputToCsv) {
+    printf("------------+");
+    for (int col = 0; col < cpuCount + 1; ++col)
+      printf("-------+");
+    printf("---------------\n");
+  }
+
+  // CPU table: one row per configured NUMA node
+  for (int row = 0; row < cpuCount; ++row) {
+    int const rowNode = RemappedCpuIndex(row);
+    printf("NUMA %02d (%02d)%c", row, rowNode, delim);
+
+    // NUMA distance from this node to each configured node
+    for (int col = 0; col < cpuCount; ++col) {
+      int const colNode = RemappedCpuIndex(col);
+      printf(" %5d %c", numa_distance(rowNode, colNode), delim);
+    }
+
+    // Count the CPUs that belong to this node
+    int coreCount = 0;
+    for (int cpu = 0; cpu < numa_num_configured_cpus(); ++cpu)
+      coreCount += (numa_node_of_cpu(cpu) == rowNode) ? 1 : 0;
+    printf(" %5d %c", coreCount, delim);
+
+    // List the GPUs whose closest NUMA node is this one
+    for (int gpu = 0; gpu < gpuCount; ++gpu) {
+      if (TransferBench::GetClosestCpuNumaToGpu(gpu) != rowNode) continue;
+      printf(" %d", gpu);
+    }
+    printf("\n");
+  }
+  printf("\n");
+
+  // GPU section
+
+#if defined(__NVCC__)
+  for (int gpu = 0; gpu < gpuCount; ++gpu) {
+    hipDeviceProp_t devProp;
+    HIP_CALL(hipGetDeviceProperties(&devProp, gpu));
+    printf(" GPU %02d | %s\n", gpu, devProp.name);
+  }
+  // NVIDIA platforms get no further topology detection
+  return;
+#else
+  // Architecture name row (text mode only); anything after the first ':' is dropped
+  if (!outputToCsv) {
+    printf("        |");
+    for (int gpu = 0; gpu < gpuCount; ++gpu) {
+      hipDeviceProp_t devProp;
+      HIP_CALL(hipGetDeviceProperties(&devProp, gpu));
+      std::string const gcnName  = devProp.gcnArchName;
+      std::string const baseName = gcnName.substr(0, gcnName.find(':'));
+      printf(" %6s |", baseName.c_str());
+    }
+    printf("\n");
+  }
+
+  // Column headings
+  printf("        %c", delim);
+  for (int gpu = 0; gpu < gpuCount; ++gpu)
+    printf(" GPU %02d %c", gpu, delim);
+  printf(" PCIe Bus ID  %c #CUs %c NUMA %c #DMA %c #XCC\n", delim, delim, delim, delim);
+
+  // Horizontal rule (text mode only)
+  if (!outputToCsv) {
+    for (int col = 0; col < gpuCount + 1; ++col)
+      printf("--------+");
+    printf("--------------+------+------+------+------\n");
+  }
+
+  // Four-character label for a link type
+  auto linkLabel = [](uint32_t type) -> char const* {
+    if (type == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT) return "  HT";
+    if (type == HSA_AMD_LINK_INFO_TYPE_QPI)            return " QPI";
+    if (type == HSA_AMD_LINK_INFO_TYPE_PCIE)           return "PCIE";
+    if (type == HSA_AMD_LINK_INFO_TYPE_INFINBAND)      return "INFB";
+    if (type == HSA_AMD_LINK_INFO_TYPE_XGMI)           return "XGMI";
+    return "????";
+  };
+
+  // One row per GPU
+  for (int src = 0; src < gpuCount; ++src) {
+    printf(" GPU %02d %c", src, delim);
+
+    // Link type and hop count towards every other GPU
+    for (int dst = 0; dst < gpuCount; ++dst) {
+      if (src == dst) {
+        printf("    N/A %c", delim);
+        continue;
+      }
+      uint32_t linkType, hopCount;
+      HIP_CALL(hipExtGetLinkTypeAndHopCount(src, dst, &linkType, &hopCount));
+      printf(" %s-%d %c", linkLabel(linkType), hopCount, delim);
+    }
+
+    // Per-device details
+    char pciBusId[20];
+    HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, src));
+    int const numSubExecs   = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, src});
+    int const closestNuma   = TransferBench::GetClosestCpuNumaToGpu(src);
+    int const numDmaIndices = TransferBench::GetNumExecutorSubIndices({EXE_GPU_DMA, src});
+    int const numGfxIndices = TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, src});
+    printf(" %11s %c %4d %c %4d %c %4d %c %4d\n",
+           pciBusId, delim,
+           numSubExecs, delim,
+           closestNuma, delim,
+           numDmaIndices, delim,
+           numGfxIndices);
+  }
+#endif
+}
--- a/src/header/TransferBench.hpp
+++ b/src/header/TransferBench.hpp
+/*
+Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+#include <cstring>
+#include <future>
+#include <map>
+#include <numa.h> // If not found, try installing libnuma-dev (e.g apt-get install libnuma-dev)
+#include <numaif.h>
+#include <set>
+#include <sstream>
+#include <stdarg.h>
+#include <thread>
+#include <unistd.h>
+#include <vector>
+
+#if defined(__NVCC__)
+#include <cuda_runtime.h>
+#else
+#include <hip/hip_ext.h>
+#include <hip/hip_runtime.h>
+#include <hsa/hsa.h>
+#include <hsa/hsa_ext_amd.h>
+#endif
+
+namespace TransferBench
+{
+  using std::map;
+  using std::pair;
+  using std::set;
+  using std::vector;
+
+  constexpr char VERSION[] = "1.58";
+
+  /**
+   * Enumeration of supported Executor types
+   *
+   * @note The Executor is the device used to perform a Transfer
+   * @note The IBVerbs executor is not implemented yet
+   * @note The numeric values are also used as indices into ExeTypeStr
+   */
+  enum ExeType
+  {
+    EXE_CPU          = 0,                       ///<  CPU executor              (subExecutor = CPU thread)
+    EXE_GPU_GFX      = 1,                       ///<  GPU kernel-based executor (subExecutor = threadblock/CU)
+    EXE_GPU_DMA      = 2,                       ///<  GPU SDMA executor         (subExecutor = not supported)
+    EXE_IBV          = 3,                       ///<  IBVerbs executor          (subExecutor = queue pair)
+  };
+  char const ExeTypeStr[5] = "CGDI";            ///< One-character code per ExeType, in enum order
+  /// Returns true only when the executor type is the CPU executor
+  inline bool IsCpuExeType(ExeType e)
+  {
+    switch (e) {
+      case EXE_CPU: return true;
+      default:      return false;
+    }
+  }
+  /// Returns true for the GPU executors (kernel-based GFX and DMA)
+  inline bool IsGpuExeType(ExeType e)
+  {
+    switch (e) {
+      case EXE_GPU_GFX:
+      case EXE_GPU_DMA: return true;
+      default:          return false;
+    }
+  }
+
+  /**
+   * An ExeDevice identifies one specific Executor: its type plus an index
+   */
+  struct ExeDevice
+  {
+    ExeType exeType;                            ///< Executor type
+    int32_t exeIndex;                           ///< Executor index
+
+    /// Orders by executor type first, then by executor index
+    bool operator<(ExeDevice const& other) const {
+      if (exeType != other.exeType) return exeType < other.exeType;
+      return exeIndex < other.exeIndex;
+    }
+  };
+
+  /**
+   * Enumeration of supported memory types
+   *
+   * @note These are possible types of memory to be used as sources/destinations
+   * @note MEM_MANAGED is classified as GPU memory by IsGpuMemType()
+   * @note MEM_NULL is classified as neither CPU nor GPU memory
+   */
+  enum MemType
+  {
+    MEM_CPU          = 0,                       ///< Coarse-grained pinned CPU memory
+    MEM_GPU          = 1,                       ///< Coarse-grained global GPU memory
+    MEM_CPU_FINE     = 2,                       ///< Fine-grained pinned CPU memory
+    MEM_GPU_FINE     = 3,                       ///< Fine-grained global GPU memory
+    MEM_CPU_UNPINNED = 4,                       ///< Unpinned CPU memory
+    MEM_NULL         = 5,                       ///< NULL memory - used for empty
+    MEM_MANAGED      = 6                        ///< Managed memory
+  };
+  char const MemTypeStr[8] = "CGBFUNM";         ///< One-character memory type codes (presumably in MemType order - TODO confirm)
+  /// Returns true for CPU memory types (coarse-grained, fine-grained and unpinned)
+  inline bool IsCpuMemType(MemType m)
+  {
+    switch (m) {
+      case MEM_CPU:
+      case MEM_CPU_FINE:
+      case MEM_CPU_UNPINNED: return true;
+      default:               return false;
+    }
+  }
+  /// Returns true for GPU memory types (coarse-grained, fine-grained and managed)
+  inline bool IsGpuMemType(MemType m)
+  {
+    switch (m) {
+      case MEM_GPU:
+      case MEM_GPU_FINE:
+      case MEM_MANAGED: return true;
+      default:          return false;
+    }
+  }
+
+  /**
+   * A MemDevice pairs a memory type with the index of the device it lives on
+   */
+  struct MemDevice
+  {
+    MemType memType;                            ///< Memory type
+    int32_t memIndex;                           ///< Device index
+
+    /// Orders by memory type first, then by device index
+    bool operator<(MemDevice const& other) const {
+      if (memType != other.memType) return memType < other.memType;
+      return memIndex < other.memIndex;
+    }
+  };
+
+  /**
+   * A Transfer adds together data from zero or more sources then writes the sum to zero or more destinations
+   *
+   * @note Every field has a default (see the in-class initializers below)
+   */
+  struct Transfer
+  {
+    size_t            numBytes    = (1<<26);    ///< # of bytes to Transfer
+    vector<MemDevice> srcs        = {};         ///< List of source memory devices
+    vector<MemDevice> dsts        = {};         ///< List of destination memory devices
+    ExeDevice         exeDevice   = {};         ///< Executor to use
+    int32_t           exeDstIndex = -1;         ///< Destination executor index (for RDMA executor only)
+    int32_t           exeSubIndex = -1;         ///< Executor subindex
+    int               numSubExecs = 0;          ///< Number of subExecutors to use for this Transfer
+  };
+
+  /**
+   * General options: iteration and warmup counts, per-iteration recording and interactive mode
+   */
+  struct GeneralOptions
+  {
+    int numIterations      = 10;                ///< # of timed iterations to perform. If negative, run for -numIterations seconds instead
+    int numSubIterations   = 1;                 ///< # of sub-iterations per iteration
+    int numWarmups         = 3;                 ///< Number of un-timed warmup iterations to perform
+    int recordPerIteration = 0;                 ///< Record per-iteration timing information
+    int useInteractive     = 0;                 ///< Pause for user-input before starting transfer loop
+  };
+
+  /**
+   * Data options: validation behaviour, block granularity, allocation offset and source fill pattern
+   */
+  struct DataOptions
+  {
+    int           alwaysValidate   = 0;         ///< Validate after each iteration instead of once at end
+    int           blockBytes       = 256;       ///< Each subexecutor works on a multiple of this many bytes
+    int           byteOffset       = 0;         ///< Byte-offset for memory allocations
+    vector<float> fillPattern      = {};        ///< Pattern of floats used to fill source data
+    int           validateDirect   = 0;         ///< Validate GPU results directly instead of copying to host
+    int           validateSource   = 0;         ///< Validate src GPU memory immediately after preparation
+  };
+
+  /**
+   * DMA Executor options: timing method and choice of copy API
+   */
+  struct DmaOptions
+  {
+    int useHipEvents = 1;                       ///< Use HIP events for timing DMA Executor
+    int useHsaCopy   = 0;                       ///< Use HSA copy instead of HIP copy to perform DMA
+  };
+
+  /**
+   * GFX Executor options: kernel launch shape, CU / XCC selection, timing method and stream usage
+   */
+  struct GfxOptions
+  {
+    int                 blockSize      = 256;   ///< Size of each threadblock (must be multiple of 64)
+    vector<uint32_t>    cuMask         = {};    ///< Bit-vector representing the CU mask
+    vector<vector<int>> prefXccTable   = {};    ///< 2D table with preferred XCD to use for a specific [src][dst] GPU device
+    int                 unrollFactor   = 4;     ///< GFX-kernel unroll factor
+    int                 useHipEvents   = 1;     ///< Use HIP events for timing GFX Executor
+    int                 useMultiStream = 0;     ///< Use multiple streams for GFX
+    int                 useSingleTeam  = 0;     ///< Team all subExecutors across the data array
+    int                 waveOrder      = 0;     ///< GFX-kernel wavefront ordering
+  };
+
+  /**
+   * Configuration options for performing Transfers
+   *
+   * Groups the general, data and per-executor (GFX / DMA) option sets; passed to RunTransfers()
+   */
+  struct ConfigOptions
+  {
+    GeneralOptions general;                     ///< General options
+    DataOptions    data;                        ///< Data options
+
+    GfxOptions     gfx;                         ///< GFX executor options
+    DmaOptions     dma;                         ///< DMA executor options
+  };
+
+  /**
+   * Enumeration of possible error types
+   *
+   * @note Numeric values increase with severity (none < warning < fatal)
+   */
+  enum ErrType
+  {
+    ERR_NONE  = 0,                              ///< No errors
+    ERR_WARN  = 1,                              ///< Warning - results may not be accurate
+    ERR_FATAL = 2,                              ///< Fatal error - results are invalid
+  };
+
+  /**
+   * ErrResult consists of error type and error message
+   *
+   * Constructors accept a platform status code (CUDA when compiled with NVCC, HIP / HSA
+   * otherwise), a bare ErrType, or an ErrType with a format string and variadic arguments.
+   * The constructors are only declared here.
+   */
+  struct ErrResult
+  {
+    ErrType     errType;                        ///< Error type
+    std::string errMsg;                         ///< Error details
+
+    ErrResult() = default;
+#if defined(__NVCC__)
+    ErrResult(cudaError_t  err);
+#else
+    ErrResult(hipError_t   err);
+    ErrResult(hsa_status_t err);
+#endif
+    ErrResult(ErrType      err);
+    ErrResult(ErrType      errType, const char* format, ...);
+  };
+
+  /**
+   * Results for a single Executor
+   */
+  struct ExeResult
+  {
+    size_t      numBytes;                       ///< Total bytes transferred by this Executor
+    double      avgDurationMsec;                ///< Averaged duration for all the Transfers for this Executor
+    double      avgBandwidthGbPerSec;           ///< Average bandwidth for this Executor
+    double      sumBandwidthGbPerSec;           ///< Naive sum of individual Transfer average bandwidths
+    vector<int> transferIdx;                    ///< Indices of Transfers this Executor executed
+  };
+
+  /**
+   * Results for a single Transfer
+   *
+   * @note The per-iteration vectors are only filled in when recordPerIteration = 1
+   */
+  struct TransferResult
+  {
+    size_t numBytes;                            ///< Number of bytes transferred by this Transfer
+    double avgDurationMsec;                     ///< Duration for this Transfer, averaged over all timed iterations
+    double avgBandwidthGbPerSec;                ///< Bandwidth for this Transfer based on averaged duration
+
+    // Only filled in if recordPerIteration = 1
+    vector<double> perIterMsec;                 ///< Duration for each individual iteration
+    vector<set<pair<int,int>>> perIterCUs;      ///< GFX-Executor only. XCC:CU used per iteration
+  };
+
+  /**
+   * TestResults contain timing results for a set of Transfers as a group as well as per Executor and per Transfer
+   * timing information
+   *
+   * @note This is the output parameter of RunTransfers()
+   */
+  struct TestResults
+  {
+    int    numTimedIterations;                  ///< Number of iterations executed
+    size_t totalBytesTransferred;               ///< Total bytes transferred per iteration
+    double avgTotalDurationMsec;                ///< Wall-time (msec) to finish all Transfers (averaged across all timed iterations)
+    double avgTotalBandwidthGbPerSec;           ///< Bandwidth based on all Transfers and average wall time
+    double overheadMsec;                        ///< Difference between total wall time and slowest executor
+
+    map<ExeDevice, ExeResult> exeResults;       ///< Per Executor results
+    vector<TransferResult>    tfrResults;       ///< Per Transfer results
+    vector<ErrResult>         errResults;       ///< List of any errors/warnings that occurred
+  };
+
+  /**
+   * Run a set of Transfers
+   *
+   * @note TestResults::errResults is documented as the list of errors/warnings that occurred;
+   *       presumably it is populated by this call - confirm against the implementation
+   *
+   * @param[in]  config     Configuration options
+   * @param[in]  transfers  Set of Transfers to execute
+   * @param[out] results    Timing results
+   * @returns true if and only if Transfers were run successfully without any fatal errors
+   */
+  bool RunTransfers(ConfigOptions    const& config,
+                    vector<Transfer> const& transfers,
+                    TestResults&            results);
+
+  /**
+   * Enumeration of integer-valued implementation attributes
+   *
+   * @note Values are passed to GetIntAttribute()
+   */
+  enum IntAttribute
+  {
+    ATR_GFX_MAX_BLOCKSIZE,                      ///< Maximum blocksize for GFX executor
+    ATR_GFX_MAX_UNROLL,                         ///< Maximum unroll factor for GFX executor
+  };
+
+  /**
+   * Enumeration of string-valued implementation attributes
+   *
+   * @note Values are passed to GetStrAttribute()
+   */
+  enum StrAttribute
+  {
+    ATR_SRC_PREP_DESCRIPTION                    ///< Description of how source memory is prepared
+  };
+
+  /**
+   * Query attributes (integer)
+   *
+   * @note This allows querying of implementation information such as limits
+   *
+   * @param[in] attribute   Attribute to query (one of the IntAttribute values)
+   * @returns Value of the attribute
+   */
+  int GetIntAttribute(IntAttribute attribute);
+
+  /**
+   * Query attributes (string)
+   *
+   * @note This allows querying of implementation details that are described as text
+   *
+   * @param[in] attribute   Attribute to query (one of the StrAttribute values)
+   * @returns Value of the attribute
+   */
+  std::string GetStrAttribute(StrAttribute attribute);
+
+  /**
+   * Returns the number of available Executors of a given type
+   *
+   * @param[in] exeType    Executor type to query
+   * @returns Number of detected Executors of exeType
+   */
+  int GetNumExecutors(ExeType exeType);
+
+  /**
+   * Returns the number of possible Executor subindices
+   *
+   * @note For CPU, this is 0
+   * @note For GFX, this refers to the number of XCDs
+   * @note For DMA, this refers to the number of DMA engines
+   *
+   * @param[in] exeDevice   The specific Executor to query
+   * @returns Number of detected Executor subindices
+   */
+  int GetNumExecutorSubIndices(ExeDevice exeDevice);
+
+  /**
+   * Returns number of subExecutors for a given ExeDevice
+   *
+   * @param[in] exeDevice   The specific Executor to query
+   * @returns Number of detected subExecutors for the given ExeDevice
+   */
+  int GetNumSubExecutors(ExeDevice exeDevice);
+
+  /**
+   * Returns the index of the NUMA node closest to the given GPU
+   *
+   * @param[in] gpuIndex   Index of the GPU to query
+   * @returns Index of the NUMA node closest to GPU gpuIndex, or -1 if unable to detect
+   */
+  int GetClosestCpuNumaToGpu(int gpuIndex);
+
+  /**
+   * Helper function to parse a line containing Transfers into a vector of Transfers
+   *
+   * @param[in]  str       String containing description of Transfers
+   * @param[out] transfers List of Transfers described by 'str'
+   * @returns Information about any error that may have occurred
+   */
+  ErrResult ParseTransfers(std::string str,
+                           std::vector<Transfer>& transfers);
+
+};
+//==========================================================================================
+// End of TransferBench API
+//==========================================================================================
+
+// Redefinitions for CUDA compatibility
+// When compiled with NVCC, the HIP identifiers used by this file are mapped by the
+// preprocessor onto the corresponding CUDA identifiers listed below
+//==========================================================================================
+#if defined(__NVCC__)
+
+  // ROCm specific names mapped onto their CUDA counterparts
+  #define wall_clock64                                       clock64
+  #define gcnArchName                                        name
+
+  // Datatypes
+  #define hipDeviceProp_t                                    cudaDeviceProp
+  #define hipError_t                                         cudaError_t
+  #define hipEvent_t                                         cudaEvent_t
+  #define hipStream_t                                        cudaStream_t
+
+  // Enumerations
+  #define hipDeviceAttributeClockRate                        cudaDevAttrClockRate
+  #define hipDeviceAttributeMultiprocessorCount              cudaDevAttrMultiProcessorCount
+  #define hipErrorPeerAccessAlreadyEnabled                   cudaErrorPeerAccessAlreadyEnabled
+  #define hipFuncCachePreferShared                           cudaFuncCachePreferShared
+  #define hipMemcpyDefault                                   cudaMemcpyDefault
+  #define hipMemcpyDeviceToHost                              cudaMemcpyDeviceToHost
+  #define hipMemcpyHostToDevice                              cudaMemcpyHostToDevice
+  #define hipSuccess                                         cudaSuccess
+
+  // Functions
+  #define hipDeviceCanAccessPeer                             cudaDeviceCanAccessPeer
+  #define hipDeviceEnablePeerAccess                          cudaDeviceEnablePeerAccess
+  #define hipDeviceGetAttribute                              cudaDeviceGetAttribute
+  #define hipDeviceGetPCIBusId                               cudaDeviceGetPCIBusId
+  #define hipDeviceSetCacheConfig                            cudaDeviceSetCacheConfig
+  #define hipDeviceSynchronize                               cudaDeviceSynchronize
+  #define hipEventCreate                                     cudaEventCreate
+  #define hipEventDestroy                                    cudaEventDestroy
+  #define hipEventElapsedTime                                cudaEventElapsedTime
+  #define hipEventRecord                                     cudaEventRecord
+  #define hipFree                                            cudaFree
+  #define hipGetDeviceCount                                  cudaGetDeviceCount
+  #define hipGetDeviceProperties                             cudaGetDeviceProperties
+  #define hipGetErrorString                                  cudaGetErrorString
+  #define hipHostFree                                        cudaFreeHost
+  #define hipHostMalloc                                      cudaMallocHost
+  #define hipMalloc                                          cudaMalloc
+  #define hipMallocManaged                                   cudaMallocManaged
+  #define hipMemcpy                                          cudaMemcpy
+  #define hipMemcpyAsync                                     cudaMemcpyAsync
+  #define hipMemset                                          cudaMemset
+  #define hipMemsetAsync                                     cudaMemsetAsync
+  #define hipSetDevice                                       cudaSetDevice
+  #define hipStreamCreate                                    cudaStreamCreate
+  #define hipStreamDestroy                                   cudaStreamDestroy
+  #define hipStreamSynchronize                               cudaStreamSynchronize
+
+  // Component-wise in-place accumulation of one float4 into another
+  // (only compiled for the NVIDIA platform). Returns a reference to the accumulated value
+  __device__ inline float4& operator +=(float4& a, const float4& b)
+  {
+    a.x = a.x + b.x;
+    a.y = a.y + b.y;
+    a.z = a.z + b.z;
+    a.w = a.w + b.w;
+    return a;
+  }
+#endif
+
+// Helper macro functions
+//==========================================================================================
+
+// Macro for collecting CU/SM GFX kernel is running on
+// - On the gfx11xx / gfx12xx targets listed below, hwId is simply set to 0
+// - On NVCC, hwId is read from the %smid special register via inline PTX
+// - Otherwise, hwId is read from the HW_REG_HW_ID hardware register via s_getreg_b32
+// NOTE(review): The last variant expands with a trailing ';', so 'GetHwId(x);' produces an extra
+//               empty statement there (unlike the other two variants)
+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1200__) || defined(__gfx1201__)
+#define GetHwId(hwId) hwId = 0
+#elif defined(__NVCC__)
+#define GetHwId(hwId) asm("mov.u32 %0, %smid;" : "=r"(hwId))
+#else
+#define GetHwId(hwId) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (hwId));
+#endif
+
+// Macro for collecting XCC GFX kernel is running on
+// - On gfx940 / gfx941 / gfx942, val is read from the HW_REG_XCC_ID hardware register via s_getreg_b32
+// - On every other target, val is simply set to 0
+// NOTE(review): The first variant expands with a trailing ';', so 'GetXccId(x);' produces an extra
+//               empty statement there (unlike the fallback variant)
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#define GetXccId(val) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val));
+#else
+#define GetXccId(val) val = 0
+#endif
+
+// Error check macro (NOTE: This will return even for ERR_WARN)
+// - Evaluates cmd exactly once and converts its result into an ErrResult
+// - If the resulting errType is anything other than ERR_NONE, returns that ErrResult from the
+//   enclosing function (which therefore must have a return type constructible from ErrResult)
+// - Wrapped in do { } while (0) so that it behaves like a single statement
+#define ERR_CHECK(cmd)            \
+  do {                            \
+    ErrResult err = (cmd);        \
+    if (err.errType != ERR_NONE)  \
+      return err;                 \
+  } while (0)
+
+// Appends warn/fatal errors to a list, return false if fatal
+// - Evaluates cmd exactly once and converts its result into an ErrResult
+// - Any result other than ERR_NONE is appended to list via push_back
+// - If the result is ERR_FATAL, returns false from the enclosing function
+//   (which therefore must have a return type constructible from false)
+// - Wrapped in do { } while (0) so that it behaves like a single statement
+#define ERR_APPEND(cmd, list)     \
+  do {                            \
+    ErrResult err = (cmd);        \
+    if (err.errType != ERR_NONE)  \
+      list.push_back(err);        \
+    if (err.errType == ERR_FATAL) \
+      return false;               \
+  } while (0)
+
+namespace TransferBench
+{
+// Helper functions ('hidden' in anonymous namespace)
+//========================================================================================
+namespace {
+
+// Constants
+//========================================================================================
+  int   constexpr MAX_BLOCKSIZE  = 512;                       // Max threadblock size
+  int   constexpr MAX_WAVEGROUPS = MAX_BLOCKSIZE / 64;        // Max wavegroups/warps (MAX_BLOCKSIZE in units of 64 threads)
+  int   constexpr MAX_UNROLL     = 8;                         // Max unroll factor
+  int   constexpr MAX_SRCS       = 8;                         // Max # srcs per Transfer
+  int   constexpr MAX_DSTS       = 8;                         // Max # dsts per Transfer
+  int   constexpr MEMSET_CHAR    = 75;                        // Value to memset (char)
+  float constexpr MEMSET_VAL     = 13323083.0f;               // Value to memset (float)
+
+// Parsing-related functions
+//========================================================================================
+
+  // Convert a memory type character (case-insensitive) into a MemType
+  // - The resulting MemType is the position of the (upper-cased) character within MemTypeStr
+  // - memType is only written on success
+  // - Returns ERR_FATAL if the character does not appear in MemTypeStr
+  static ErrResult CharToMemType(char const c, MemType& memType)
+  {
+    // strchr also matches the terminating '\0' of MemTypeStr, which would otherwise be
+    // accepted and produce an out-of-range MemType
+    if (c != '\0') {
+      // toupper requires a value representable as unsigned char, so avoid passing negative chars
+      char const* val = strchr(MemTypeStr, toupper((unsigned char)c));
+      if (val) {
+        memType = (MemType)(val - MemTypeStr);
+        return ERR_NONE;
+      }
+    }
+    return {ERR_FATAL, "Unexpected memory type (%c)", c};
+  }
+
+  // Convert an executor type character (case-insensitive) into an ExeType
+  // - The resulting ExeType is the position of the (upper-cased) character within ExeTypeStr
+  // - exeType is only written on success
+  // - Returns ERR_FATAL if the character does not appear in ExeTypeStr
+  static ErrResult CharToExeType(char const c, ExeType& exeType)
+  {
+    // strchr also matches the terminating '\0' of ExeTypeStr, which would otherwise be
+    // accepted and produce an out-of-range ExeType
+    if (c != '\0') {
+      // toupper requires a value representable as unsigned char, so avoid passing negative chars
+      char const* val = strchr(ExeTypeStr, toupper((unsigned char)c));
+      if (val) {
+        exeType = (ExeType)(val - ExeTypeStr);
+        return ERR_NONE;
+      }
+    }
+    return {ERR_FATAL, "Unexpected executor type (%c)", c};
+  }
+
+  // Parse a token consisting of one or more (memory type character, index) pairs
+  // - memDevices is cleared first, then receives one entry per parsed pair, except that
+  //   pairs whose type is MEM_NULL are counted as parsed but not stored
+  // - Parsing stops at the first position where a (character, integer) pair cannot be read
+  // - Returns ERR_FATAL if a memory type character is not recognized, or if no pair was parsed
+  static ErrResult ParseMemType(std::string const& token,
+                                std::vector<MemDevice>& memDevices)
+  {
+    memDevices.clear();
+
+    char const* cursor    = token.c_str();
+    bool        parsedAny = false;
+
+    for (;;) {
+      char typeChar;
+      int  index, numCharsRead;
+      if (sscanf(cursor, " %c %d%n", &typeChar, &index, &numCharsRead) != 2)
+        break;
+
+      // Advance past the pair that was just consumed
+      cursor += numCharsRead;
+
+      MemType parsedType;
+      ErrResult const status = CharToMemType(typeChar, parsedType);
+      if (status.errType != ERR_NONE) return status;
+
+      if (parsedType != MEM_NULL)
+        memDevices.push_back({parsedType, index});
+      parsedAny = true;
+    }
+
+    if (!parsedAny) {
+      return {ERR_FATAL,
+              "Unable to parse memory type token %s.  Expected one of %s followed by an index",
+              token.c_str(), MemTypeStr};
+    }
+    return ERR_NONE;
+  }
+
+  // Parse an executor token of the form <type character><index>[.<subindex>]
+  // - exeSubIndex is set to -1 unless a subindex is present in the token
+  // - exeDevice.exeIndex receives the parsed index
+  // - exeDevice.exeType is only written if the type character is recognized
+  // - Returns ERR_FATAL if the type character and index cannot both be parsed, or if the
+  //   type character is not recognized
+  static ErrResult ParseExeType(std::string const& token,
+                                ExeDevice& exeDevice,
+                                int& exeSubIndex)
+  {
+    char exeTypeChar;
+    exeSubIndex = -1;
+
+    int numTokensParsed = sscanf(token.c_str(),
+                                 " %c%d.%d", &exeTypeChar, &exeDevice.exeIndex, &exeSubIndex);
+    if (numTokensParsed < 2) {
+      // Adjacent string literals are concatenated, so the separating space must be explicit
+      return {ERR_FATAL,
+              "Unable to parse valid executor token (%s). "
+              "Expected one of %s followed by an index",
+              token.c_str(), ExeTypeStr};
+    }
+    return CharToExeType(exeTypeChar, exeDevice.exeType);
+  }
+
+// Memory-related functions
+//========================================================================================
+  // Enable peer access between two GPUs
+  // - Checks that deviceId is able to access peerDeviceId, then enables access from
+  //   deviceId to peerDeviceId
+  // - Peer access that has already been enabled is not treated as an error
+  // - Leaves deviceId selected as the current device (via hipSetDevice)
+  // - Returns ERR_FATAL if peer access is not possible or could not be enabled
+  static ErrResult EnablePeerAccess(int const deviceId, int const peerDeviceId)
+  {
+    int canAccess = 0;
+    ERR_CHECK(hipDeviceCanAccessPeer(&canAccess, deviceId, peerDeviceId));
+    if (!canAccess)
+      return {ERR_FATAL,
+              "Unable to enable peer access from GPU devices %d to %d", deviceId, peerDeviceId};
+
+    ERR_CHECK(hipSetDevice(deviceId));
+    hipError_t error = hipDeviceEnablePeerAccess(peerDeviceId, 0);
+    if (error != hipSuccess && error != hipErrorPeerAccessAlreadyEnabled) {
+      return {ERR_FATAL,
+              "Unable to enable peer to peer access from %d to %d (%s)",
+              deviceId, peerDeviceId, hipGetErrorString(error)};
+    }
+    return ERR_NONE;
+  }
+
+  // Check that CPU memory array of numBytes has been allocated on targetId NUMA node
+  // - Builds the list of page addresses covering [array, array + numBytes) and queries the
+  //   status of each page with move_pages (passing NULL for the target nodes)
+  // - Returns ERR_FATAL if the query fails, if any page reports a negative status, or if any
+  //   page reports a node other than targetId
+  // - An empty range (numBytes == 0) has no pages to check and returns ERR_NONE
+  static ErrResult CheckPages(char* array, size_t numBytes, int targetId)
+  {
+    size_t const pageSize = getpagesize();
+    size_t const numPages = (numBytes + pageSize - 1) / pageSize;
+
+    // Nothing to verify (also avoids indexing into empty vectors below)
+    if (numPages == 0) return ERR_NONE;
+
+    std::vector<void *> pages(numPages);
+    std::vector<int> status(numPages);
+
+    for (size_t i = 0; i < numPages; i++) {
+      pages[i] = array + i * pageSize;
+    }
+
+    long const retCode = move_pages(0, numPages, pages.data(), NULL, status.data(), 0);
+    if (retCode)
+      return {ERR_FATAL,
+              "Unable to collect page table information for allocated memory. "
+              "Ensure NUMA library is installed properly"};
+
+    size_t mistakeCount = 0;
+    for (size_t i = 0; i < numPages; i++) {
+      if (status[i] < 0)
+        return {ERR_FATAL,
+                "Unexpected page status (%d) for page %lu", status[i], i};
+      if (status[i] != targetId) mistakeCount++;
+    }
+    if (mistakeCount > 0) {
+      return {ERR_FATAL,
+              "%lu out of %lu pages for memory allocation were not on NUMA node %d."
+              " This could be due to hardware memory issues",
+              mistakeCount, numPages, targetId};
+    }
+    return ERR_NONE;
+  }
+
+  // Allocate memory
+  // - Allocates numBytes of the memory type / index described by memDevice and stores the
+  //   resulting pointer in *memPtr (*memPtr is reset to nullptr before allocating)
+  // - CPU memory is allocated with the NUMA preferred policy set to memDevice.memIndex,
+  //   then zeroed and checked (via CheckPages) to reside on that NUMA node.  The NUMA policy
+  //   is reset to default afterwards, regardless of whether the allocation succeeded
+  // - GPU memory is allocated on device memDevice.memIndex (which is left selected as the
+  //   current device) and then cleared
+  // - Returns ERR_FATAL for 0-byte requests, unsupported memory types and failed allocations
+  // NOTE: If the NUMA placement check fails, the memory that was allocated is not released here
+  static ErrResult AllocateMemory(MemDevice memDevice, size_t numBytes, void** memPtr)
+  {
+    if (numBytes == 0) {
+      return {ERR_FATAL, "Unable to allocate 0 bytes"};
+    }
+    *memPtr = nullptr;
+
+    MemType const& memType = memDevice.memType;
+
+    if (IsCpuMemType(memType)) {
+      // Performs the CPU allocation, so that the NUMA policy can be restored on every exit path
+      auto allocateOnHost = [&]() -> ErrResult {
+        // Allocate host-pinned memory (should respect NUMA mem policy)
+        if (memType == MEM_CPU_FINE) {
+#if defined (__NVCC__)
+          return {ERR_FATAL, "Fine-grained CPU memory not supported on NVIDIA platform"};
+#else
+          ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser));
+#endif
+        } else if (memType == MEM_CPU) {
+#if defined (__NVCC__)
+          ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, 0));
+#else
+          ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent));
+#endif
+        } else if (memType == MEM_CPU_UNPINNED) {
+          *memPtr = numa_alloc_onnode(numBytes, memDevice.memIndex);
+        }
+
+        // Guard against failed allocations (or unhandled CPU memory types) prior to touching memory
+        if (*memPtr == nullptr) {
+          return {ERR_FATAL,
+                  "Unable to allocate %lu bytes of CPU memory on NUMA node %d",
+                  numBytes, memDevice.memIndex};
+        }
+
+        // Check that the allocated pages are actually on the correct NUMA node
+        memset(*memPtr, 0, numBytes);
+        return CheckPages((char*)*memPtr, numBytes, memDevice.memIndex);
+      };
+
+      // Set numa policy prior to call to hipHostMalloc
+      numa_set_preferred(memDevice.memIndex);
+      ErrResult const status = allocateOnHost();
+
+      // Reset to default numa mem policy (even if the allocation failed)
+      numa_set_preferred(-1);
+      if (status.errType != ERR_NONE) return status;
+    } else if (IsGpuMemType(memType)) {
+      // Switch to the appropriate GPU
+      ERR_CHECK(hipSetDevice(memDevice.memIndex));
+
+      if (memType == MEM_GPU) {
+        // Allocate GPU memory on appropriate device
+        ERR_CHECK(hipMalloc((void**)memPtr, numBytes));
+      } else if (memType == MEM_GPU_FINE) {
+#if defined (__NVCC__)
+        return {ERR_FATAL, "Fine-grained GPU memory not supported on NVIDIA platform"};
+#else
+        int flag = hipDeviceMallocUncached;
+        ERR_CHECK(hipExtMallocWithFlags((void**)memPtr, numBytes, flag));
+#endif
+      } else if (memType == MEM_MANAGED) {
+        ERR_CHECK(hipMallocManaged((void**)memPtr, numBytes));
+      }
+
+      // Clear the memory
+      ERR_CHECK(hipMemset(*memPtr, 0, numBytes));
+      ERR_CHECK(hipDeviceSynchronize());
+    } else {
+      return {ERR_FATAL, "Unsupported memory type (%d)", memType};
+    }
+    return ERR_NONE;
+  }
+
+  // Deallocate memory
+  // - Releases memPtr using the routine that matches memType:
+  //   hipHostFree for MEM_CPU / MEM_CPU_FINE, numa_free for MEM_CPU_UNPINNED (which needs the
+  //   allocation size in bytes), and hipFree for MEM_GPU / MEM_GPU_FINE / MEM_MANAGED
+  // - Returns ERR_FATAL for null pointers and for any other memory type
+  static ErrResult DeallocateMemory(MemType memType, void *memPtr, size_t const bytes)
+  {
+    // Avoid deallocating nullptr
+    if (memPtr == nullptr)
+      return {ERR_FATAL, "Attempted to free null pointer for %lu bytes", bytes};
+
+    bool const isPinnedHost  = (memType == MEM_CPU || memType == MEM_CPU_FINE);
+    bool const isHipAllocGpu = (memType == MEM_GPU || memType == MEM_GPU_FINE || memType == MEM_MANAGED);
+
+    if (isPinnedHost) {
+      ERR_CHECK(hipHostFree(memPtr));
+    } else if (memType == MEM_CPU_UNPINNED) {
+      numa_free(memPtr, bytes);
+    } else if (isHipAllocGpu) {
+      ERR_CHECK(hipFree(memPtr));
+    } else {
+      return {ERR_FATAL, "Attempting to deallocate unrecognized memory type (%d)", memType};
+    }
+    return ERR_NONE;
+  }
+
+// HSA-related functions
+//========================================================================================
+
+#if !defined(__NVCC__)
+  // Get the hsa_agent_t associated with a ExeDevice
+  // - On first use, builds per-CPU and per-GPU agent tables by allocating a small buffer on
+  //   each device, querying which agent owns it, and releasing the buffer again
+  // - The tables are kept in function-local statics and reused by subsequent calls
+  // - EXE_GPU_GFX and EXE_GPU_DMA executors both map to the GPU agent table
+  // - Returns ERR_FATAL if the index is out of range or the executor type is not supported
+  static ErrResult GetHsaAgent(ExeDevice const& exeDevice, hsa_agent_t& agent)
+  {
+    static bool isInitialized = false;
+    static std::vector<hsa_agent_t> cpuAgents;
+    static std::vector<hsa_agent_t> gpuAgents;
+
+    int const numCpus = GetNumExecutors(EXE_CPU);
+    int const numGpus = GetNumExecutors(EXE_GPU_GFX);
+
+    // Initialize results on first use
+    if (!isInitialized) {
+      size_t const probeBytes = 1024;
+
+      hsa_amd_pointer_info_t ptrInfo;
+      ptrInfo.size = sizeof(ptrInfo);
+
+      int32_t* probe;
+
+      // Index CPU agents
+      cpuAgents.clear();
+      for (int cpu = 0; cpu < numCpus; cpu++) {
+        ERR_CHECK(AllocateMemory({MEM_CPU, cpu}, probeBytes, (void**)&probe));
+        ERR_CHECK(hsa_amd_pointer_info(probe, &ptrInfo, NULL, NULL, NULL));
+        cpuAgents.push_back(ptrInfo.agentOwner);
+        ERR_CHECK(DeallocateMemory(MEM_CPU, probe, probeBytes));
+      }
+
+      // Index GPU agents
+      gpuAgents.clear();
+      for (int gpu = 0; gpu < numGpus; gpu++) {
+        ERR_CHECK(AllocateMemory({MEM_GPU, gpu}, probeBytes, (void**)&probe));
+        ERR_CHECK(hsa_amd_pointer_info(probe, &ptrInfo, NULL, NULL, NULL));
+        gpuAgents.push_back(ptrInfo.agentOwner);
+        ERR_CHECK(DeallocateMemory(MEM_GPU, probe, probeBytes));
+      }
+      isInitialized = true;
+    }
+
+    int const idx = exeDevice.exeIndex;
+
+    if (exeDevice.exeType == EXE_CPU) {
+      if (idx < 0 || idx >= numCpus)
+        return {ERR_FATAL, "CPU index must be between 0 and %d inclusively", numCpus - 1};
+      agent = cpuAgents[idx];
+      return ERR_NONE;
+    }
+
+    if (exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA) {
+      if (idx < 0 || idx >= numGpus)
+        return {ERR_FATAL, "GPU index must be between 0 and %d inclusively", numGpus - 1};
+      agent = gpuAgents[idx];
+      return ERR_NONE;
+    }
+
+    return {ERR_FATAL,
+            "Attempting to get HSA agent of unknown or unsupported executor type (%d)",
+            exeDevice.exeType};
+  }
+
+  // Get the hsa_agent_t associated with a MemDevice
+  // - CPU memory types resolve to the agent of the EXE_CPU executor with the same index
+  // - GPU memory types resolve to the agent of the EXE_GPU_GFX executor with the same index
+  // - Returns ERR_FATAL for any other memory type
+  static ErrResult GetHsaAgent(MemDevice const& memDevice, hsa_agent_t& agent)
+  {
+    bool const onCpu = IsCpuMemType(memDevice.memType);
+    if (!onCpu && !IsGpuMemType(memDevice.memType)) {
+      return {ERR_FATAL,
+              "Unable to get HSA agent for memDevice (%d,%d)",
+              memDevice.memType, memDevice.memIndex};
+    }
+
+    ExeDevice const owner = {onCpu ? EXE_CPU : EXE_GPU_GFX, memDevice.memIndex};
+    return GetHsaAgent(owner, agent);
+  }
+#endif
+
+// Setup validation-related functions
+//========================================================================================
+
+  // Validate that MemDevice exists
+  // - MEM_NULL is always accepted
+  // - CPU memory types must have an index within [0, number of CPU executors)
+  // - GPU memory types must have an index within [0, number of GFX executors)
+  // - Returns ERR_FATAL for out-of-range indices and for any other memory type
+  static ErrResult CheckMemDevice(MemDevice const& memDevice)
+  {
+    if (memDevice.memType == MEM_NULL)
+      return ERR_NONE;
+
+    if (IsCpuMemType(memDevice.memType)) {
+      int const cpuCount = GetNumExecutors(EXE_CPU);
+      bool const inRange = (memDevice.memIndex >= 0 && memDevice.memIndex < cpuCount);
+      if (inRange) return ERR_NONE;
+      return {ERR_FATAL,
+              "CPU index must be between 0 and %d (instead of %d)", cpuCount - 1, memDevice.memIndex};
+    }
+
+    if (IsGpuMemType(memDevice.memType)) {
+      int const gpuCount = GetNumExecutors(EXE_GPU_GFX);
+      bool const inRange = (memDevice.memIndex >= 0 && memDevice.memIndex < gpuCount);
+      if (inRange) return ERR_NONE;
+      return {ERR_FATAL,
+              "GPU index must be between 0 and %d (instead of %d)", gpuCount - 1, memDevice.memIndex};
+    }
+
+    return {ERR_FATAL, "Unsupported memory type (%d)", memDevice.memType};
+  }
+
+  // Validate configuration options
+  // - Every problem detected is appended to 'errors' (validation continues after a failure)
+  // - Returns true if and only if 'errors' holds at least one ERR_FATAL entry once validation
+  //   has finished (this scan covers the entire list, not only the entries appended here)
+  static bool ConfigOptionsHaveErrors(ConfigOptions const&    cfg,
+                                      std::vector<ErrResult>& errors)
+  {
+    // General options
+    bool const warmupsValid = (cfg.general.numWarmups >= 0);
+    if (!warmupsValid)
+      errors.push_back({ERR_FATAL, "[general.numWarmups] must be a non-negative number"});
+
+    // Data options
+    bool const blockBytesValid = (cfg.data.blockBytes != 0 && cfg.data.blockBytes % 4 == 0);
+    if (!blockBytesValid)
+      errors.push_back({ERR_FATAL, "[data.blockBytes] must be positive multiple of %lu", sizeof(float)});
+
+    bool const byteOffsetInvalid = (cfg.data.byteOffset < 0 || cfg.data.byteOffset % sizeof(float));
+    if (byteOffsetInvalid)
+      errors.push_back({ERR_FATAL, "[data.byteOffset] must be positive multiple of %lu", sizeof(float)});
+
+    // GFX options: threadblock size
+    int const blockSizeLimit = GetIntAttribute(ATR_GFX_MAX_BLOCKSIZE);
+    bool const blockSizeValid = (cfg.gfx.blockSize >= 0 &&
+                                 cfg.gfx.blockSize % 64 == 0 &&
+                                 cfg.gfx.blockSize <= blockSizeLimit);
+    if (!blockSizeValid) {
+      errors.push_back({ERR_FATAL,
+                        "[gfx.blockSize] must be positive multiple of 64 less than or equal to %d",
+                        blockSizeLimit});
+    }
+
+    // GFX options: unroll factor
+    int const unrollLimit = GetIntAttribute(ATR_GFX_MAX_UNROLL);
+    bool const unrollValid = (cfg.gfx.unrollFactor >= 0 && cfg.gfx.unrollFactor <= unrollLimit);
+    if (!unrollValid) {
+      errors.push_back({ERR_FATAL,
+                        "[gfx.unrollFactor] must be non-negative and less than or equal to %d",
+                        unrollLimit});
+    }
+
+    // GFX options: wave order
+    bool const waveOrderValid = (cfg.gfx.waveOrder >= 0 && cfg.gfx.waveOrder < 6);
+    if (!waveOrderValid) {
+      errors.push_back({ERR_FATAL,
+                        "[gfx.waveOrder] must be non-negative and less than 6"});
+    }
+
+    // GFX options: preferred XCC table (optional, but must be numGpus x numGpus if provided)
+    int numGpus = GetNumExecutors(EXE_GPU_GFX);
+    int numXccs = GetNumExecutorSubIndices({EXE_GPU_GFX, 0});
+    vector<vector<int>> const& table = cfg.gfx.prefXccTable;
+
+    if (!table.empty()) {
+      if (table.size() != numGpus) {
+        errors.push_back({ERR_FATAL, "[gfx.prefXccTable] must be have size %dx%d", numGpus, numGpus});
+      } else {
+        for (auto const& row : table) {
+          // A wrongly-sized row stops validation of the remaining rows
+          if (row.size() != numGpus) {
+            errors.push_back({ERR_FATAL, "[gfx.prefXccTable] must be have size %dx%d", numGpus, numGpus});
+            break;
+          }
+
+          // Report at most one out-of-range value per row
+          for (auto xcc : row) {
+            if (xcc < 0 || xcc >= numXccs) {
+              errors.push_back({ERR_FATAL, "[gfx.prefXccTable] must contain values between 0 and %d",
+                  numXccs - 1});
+              break;
+            }
+          }
+        }
+      }
+    }
+
+    // NVIDIA specific
+#if defined(__NVCC__)
+    if (cfg.data.validateDirect)
+      errors.push_back({ERR_FATAL, "[data.validateDirect] is not supported on NVIDIA hardware"});
+#else
+    // AMD specific
+    // Check for largeBar enablement on GPUs
+    for (int gpu = 0; gpu < numGpus; gpu++) {
+      int largeBarEnabled = 0;
+      if (hipDeviceGetAttribute(&largeBarEnabled, hipDeviceAttributeIsLargeBar, gpu) != hipSuccess) {
+        errors.push_back({ERR_FATAL, "Unable to query if GPU %d has largeBAR enabled", gpu});
+        continue;
+      }
+      if (!largeBarEnabled) {
+        errors.push_back({ERR_WARN,
+                          "Large BAR is not enabled for GPU %d in BIOS. "
+                          "Large BAR is required to enable multi-gpu data access", gpu});
+      }
+    }
+#endif
+
+    // Check for fatal errors
+    bool hasFatal = false;
+    for (size_t i = 0; i < errors.size() && !hasFatal; i++)
+      hasFatal = (errors[i].errType == ERR_FATAL);
+    return hasFatal;
+  }
+
+  // Validate Transfers to execute - returns true if and only if fatal error detected
+  static bool TransfersHaveErrors(ConfigOptions         const& cfg,
+                                  std::vector<Transfer> const& transfers,
+                                  std::vector<ErrResult>&      errors)
+  {
+    int numCpus = GetNumExecutors(EXE_CPU);
+    int numGpus = GetNumExecutors(EXE_GPU_GFX);
+
+    std::set<ExeDevice>      executors;
+    std::map<ExeDevice, int> transferCount;
+    std::map<ExeDevice, int> useSubIndexCount;
+    std::map<ExeDevice, int> totalSubExecs;
+
+    // Per-Transfer checks
+    for (size_t i = 0; i < transfers.size(); i++) {
+      Transfer const& t = transfers[i];
+
+      if (t.numBytes == 0)
+        errors.push_back({ERR_FATAL, "Transfer %d: Cannot perform 0-byte transfers", i});
+
+      if (t.exeDevice.exeType == EXE_GPU_GFX || t.exeDevice.exeType == EXE_CPU) {
+        size_t const N               = t.numBytes / sizeof(float);
+        int    const targetMultiple  = cfg.data.blockBytes / sizeof(float);
+        int    const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple,
+                                                (size_t)t.numSubExecs);
+
+        if (maxSubExecToUse < t.numSubExecs)
+          errors.push_back({ERR_WARN,
+                            "Transfer %d data size is too small - will only use %d of %d subexecutors",
+                            i, maxSubExecToUse, t.numSubExecs});
+      }
+
+      // Check sources and destinations
+      if (t.srcs.empty() && t.dsts.empty())
+        errors.push_back({ERR_FATAL, "Transfer %d: Must have at least one source or destination", i});
+
+      for (int j = 0; j < t.srcs.size(); j++) {
+        ErrResult err = CheckMemDevice(t.srcs[j]);
+        if (err.errType != ERR_NONE)
+          errors.push_back({ERR_FATAL, "Transfer %d: SRC %d: %s", i, j, err.errMsg.c_str()});
+      }
+      for (int j = 0; j < t.dsts.size(); j++) {
+        ErrResult err = CheckMemDevice(t.dsts[j]);
+        if (err.errType != ERR_NONE)
+          errors.push_back({ERR_FATAL, "Transfer %d: DST %d: %s", i, j, err.errMsg.c_str()});
+      }
+
+      // Check executor
+      executors.insert(t.exeDevice);
+      transferCount[t.exeDevice]++;
+      switch (t.exeDevice.exeType) {
+      case EXE_CPU:
+        if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numCpus)
+          errors.push_back({ERR_FATAL,
+                            "Transfer %d: CPU index must be between 0 and %d (instead of %d)",
+                            i, numCpus - 1, t.exeDevice.exeIndex});
+        break;
+      case EXE_GPU_GFX:
+        if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numGpus) {
+          errors.push_back({ERR_FATAL,
+                            "Transfer %d: GFX index must be between 0 and %d (instead of %d)",
+                            i, numGpus - 1, t.exeDevice.exeIndex});
+        } else {
+          if (t.exeSubIndex != -1) {
+#if defined(__NVCC__)
+            errors.push_back({ERR_FATAL,
+                              "Transfer %d: GFX executor subindex not supported on NVIDIA hardware", i});
+#else
+            useSubIndexCount[t.exeDevice]++;
+            int numSubIndices = GetNumExecutorSubIndices(t.exeDevice);
+            if (t.exeSubIndex >= numSubIndices)
+              errors.push_back({ERR_FATAL,
+                                "Transfer %d: GFX subIndex (XCC) must be between 0 and %d", i, numSubIndices - 1});
+#endif
+          }
+        }
+        break;
+      case EXE_GPU_DMA:
+        if (t.srcs.size() != 1 || t.dsts.size() != 1) {
+          errors.push_back({ERR_FATAL,
+                            "Transfer %d: DMA executor must have exactly 1 source and 1 destination", i});
+        }
+
+        if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numGpus) {
+          errors.push_back({ERR_FATAL,
+                            "Transfer %d: DMA index must be between 0 and %d (instead of %d)",
+                            i, numGpus - 1, t.exeDevice.exeIndex});
+          // Cannot proceed with any further checks
+          continue;
+        }
+
+        if (t.exeSubIndex != -1) {
+#if defined(__NVCC__)
+          errors.push_back({ERR_FATAL,
+                            "Transfer %d: DMA executor subindex not supported on NVIDIA hardware", i});
+#else
+          useSubIndexCount[t.exeDevice]++;
+          int numSubIndices = GetNumExecutorSubIndices(t.exeDevice);
+          if (t.exeSubIndex >= numSubIndices)
+            errors.push_back({ERR_FATAL,
+                              "Transfer %d: DMA subIndex (engine) must be between 0 and %d",
+                              i, numSubIndices - 1});
+
+          // Check that engine Id exists between agents
+          hsa_agent_t srcAgent, dstAgent;
+          ErrResult err;
+          err = GetHsaAgent(t.srcs[0], srcAgent);
+          if (err.errType != ERR_NONE) {
+            errors.push_back(err);
+            if (err.errType == ERR_FATAL) break;
+          }
+          err = GetHsaAgent(t.dsts[0], dstAgent);
+          if (err.errType != ERR_NONE) {
+            errors.push_back(err);
+            if (err.errType == ERR_FATAL) break;
+          }
+
+          uint32_t engineIdMask = 0;
+          err = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &engineIdMask);
+          if (err.errType != ERR_NONE) {
+            errors.push_back(err);
+            if (err.errType == ERR_FATAL) break;
+          }
+          hsa_amd_sdma_engine_id_t sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << t.exeSubIndex);
+          if (!(sdmaEngineId & engineIdMask)) {
+            errors.push_back({ERR_FATAL,
+                "Transfer %d: DMA %d.%d does not exist or cannot copy between src/dst",
+                i, t.exeDevice.exeIndex, t.exeSubIndex});
+          }
+#endif
+        }
+
+        if (!IsGpuMemType(t.srcs[0].memType) && !IsGpuMemType(t.dsts[0].memType)) {
+          errors.push_back({ERR_WARN,
+              "Transfer %d: No GPU memory for source or destination.  Copy might not execute on DMA %d",
+              i, t.exeDevice.exeIndex});
+        } else {
+          // Currently HIP will use src agent if source memory is GPU, otherwise dst agent
+          if (IsGpuMemType(t.srcs[0].memType)) {
+            if (t.srcs[0].memIndex != t.exeDevice.exeIndex) {
+              errors.push_back({ERR_WARN,
+                  "Transfer %d: DMA executor will automatically switch to using the source memory device (%d) not (%d)",
+                  i, t.srcs[0].memIndex, t.exeDevice.exeIndex});
+            }
+          } else if (t.dsts[0].memIndex != t.exeDevice.exeIndex) {
+            errors.push_back({ERR_WARN,
+                "Transfer %d: DMA executor will automatically switch to using the destination memory device (%d) not (%d)",
+                i, t.dsts[0].memIndex, t.exeDevice.exeIndex});
+          }
+        }
+        break;
+      case EXE_IBV:
+        errors.push_back({ERR_FATAL, "Transfer %d: IBV executor currently not supported", i});
+        break;
+      }
+
+      // Check subexecutors
+      if (t.numSubExecs <= 0)
+        errors.push_back({ERR_FATAL, "Transfer %d: # of subexecutors must be positive", i});
+      else
+        totalSubExecs[t.exeDevice] += t.numSubExecs;
+    }
+
+    int gpuMaxHwQueues = 4;
+    if (getenv("GPU_MAX_HW_QUEUES"))
+      gpuMaxHwQueues = atoi(getenv("GPU_MAX_HW_QUEUES"));
+
+    // Aggregate checks
+    for (auto const& exeDevice : executors) {
+      switch (exeDevice.exeType) {
+      case EXE_CPU:
+      {
+        // Check total number of subexecutors requested
+        int numCpuSubExec = GetNumSubExecutors(exeDevice);
+        if (totalSubExecs[exeDevice] > numCpuSubExec)
+          errors.push_back({ERR_WARN,
+                            "CPU %d requests %d total cores however only %d available. "
+                            "Serialization will occur",
+                            exeDevice.exeIndex, totalSubExecs[exeDevice], numCpuSubExec});
+        break;
+      }
+      case EXE_GPU_GFX:
+      {
+        // Check total number of subexecutors requested
+        int numGpuSubExec = GetNumSubExecutors(exeDevice);
+        if (totalSubExecs[exeDevice] > numGpuSubExec)
+          errors.push_back({ERR_WARN,
+                            "GPU %d requests %d total CUs however only %d available. "
+                            "Serialization will occur",
+                            exeDevice.exeIndex, totalSubExecs[exeDevice], numGpuSubExec});
+        // Check that if executor subindices are used, all Transfers specify executor subindices
+        if (useSubIndexCount[exeDevice] > 0 && useSubIndexCount[exeDevice] != transferCount[exeDevice]) {
+          errors.push_back({ERR_FATAL,
+                            "GPU %d specifies XCC on only %d of %d Transfers. "
+                            "Must either specific none or all",
+                            exeDevice.exeIndex, useSubIndexCount[exeDevice], transferCount[exeDevice]});
+        }
+
+        if (cfg.gfx.useMultiStream && transferCount[exeDevice] > gpuMaxHwQueues) {
+          errors.push_back({ERR_WARN,
+                            "GPU %d attempting %d parallel transfers, however GPU_MAX_HW_QUEUES only set to %d",
+                            exeDevice.exeIndex, transferCount[exeDevice], gpuMaxHwQueues});
+        }
+        break;
+      }
+      case EXE_GPU_DMA:
+      {
+        // Check that if executor subindices are used, all Transfers specify executor subindices
+        if (useSubIndexCount[exeDevice] > 0 && useSubIndexCount[exeDevice] != transferCount[exeDevice]) {
+          errors.push_back({ERR_FATAL,
+                            "DMA %d specifies engine on only %d of %d Transfers. "
+                            "Must either specific none or all",
+                            exeDevice.exeIndex, useSubIndexCount[exeDevice], transferCount[exeDevice]});
+        }
+        if (transferCount[exeDevice] > gpuMaxHwQueues) {
+          errors.push_back({ERR_WARN,
+                           "DMA %d attempting %d parallel transfers, however GPU_MAX_HW_QUEUES only set to %d",
+                           exeDevice.exeIndex, transferCount[exeDevice], gpuMaxHwQueues});
+        }
+
+        char* enableSdma = getenv("HSA_ENABLE_SDMA");
+        if (enableSdma && !strcmp(enableSdma, "0"))
+          errors.push_back({ERR_WARN,
+                            "DMA functionality disabled due to environment variable HSA_ENABLE_SDMA=0. "
+                            "DMA %d copies will fallback to blit (GFX) kernels", exeDevice.exeIndex});
+        break;
+      }
+      default:
+        break;
+      }
+    }
+
+
+    // Check for fatal errors
+    for (auto const& err : errors)
+      if (err.errType == ERR_FATAL) return true;
+    return false;
+  }
+
+// Internal data structures
+//========================================================================================
+
+  // Parameters for each SubExecutor (copied to the GPU for the GFX executor, whose kernel writes back the output fields)
+  struct SubExecParam
+  {
+    // Inputs
+    size_t                     N;                 ///< Number of floats this subExecutor works on
+    int                        numSrcs;           ///< Number of source arrays
+    int                        numDsts;           ///< Number of destination arrays
+    float*                     src[MAX_SRCS];     ///< Source array pointers
+    float*                     dst[MAX_DSTS];     ///< Destination array pointers
+    int32_t                    preferredXccId;    ///< XCC ID to execute on, or -1 for no XCC filtering (GFX only)
+
+    // Prepared
+    int                        teamSize;          ///< Size of team this sub executor is part of
+    int                        teamIdx;           ///< Index of this sub executor amongst team
+
+    // Outputs
+    long long                  startCycle;        ///< Start timestamp for in-kernel timing (GPU-GFX executor)
+    long long                  stopCycle;         ///< Stop  timestamp for in-kernel timing (GPU-GFX executor)
+    uint32_t                   hwId;              ///< Hardware ID
+    uint32_t                   xccId;             ///< XCC ID
+  };
+
+  // Internal resources allocated per Transfer (filled in by PrepareExecutor, released by TeardownExecutor)
+  struct TransferResources
+  {
+    int                        transferIdx;       ///< The associated Transfer (index into the list of Transfers)
+    size_t                     numBytes;          ///< Number of bytes to Transfer
+    vector<float*>             srcMem;            ///< Source memory (one allocation per source)
+    vector<float*>             dstMem;            ///< Destination memory (one allocation per destination)
+    vector<SubExecParam>       subExecParamCpu;   ///< Defines subarrays for each subexecutor
+    vector<int>                subExecIdx;        ///< Indices into the executor-wide subexecutor parameter array
+
+    // For GFX executor
+    SubExecParam*              subExecParamGpuPtr; ///< First entry for this Transfer within the executor's GPU parameter array
+
+    // For targeted-SDMA
+#if !defined(__NVCC__)
+    hsa_agent_t                dstAgent;          ///< DMA destination memory agent
+    hsa_agent_t                srcAgent;          ///< DMA source memory agent
+    hsa_signal_t               signal;            ///< HSA signal for completion
+    hsa_amd_sdma_engine_id_t   sdmaEngineId;      ///< DMA engine ID (single bit selected by the Transfer's executor subindex)
+#endif
+
+    // Counters
+    double                     totalDurationMsec; ///< Total duration for all iterations for this Transfer
+    vector<double>             perIterMsec;       ///< Duration for each individual iteration
+    vector<set<pair<int,int>>> perIterCUs;        ///< GFX-Executor only. XCC:CU used per iteration
+  };
+
+  // Internal resources allocated per Executor (aggregates the resources of every Transfer it runs)
+  struct ExeInfo
+  {
+    size_t                     totalBytes;        ///< Total bytes this executor transfers
+    double                     totalDurationMsec; ///< Total duration for all iterations for this Executor
+    int                        totalSubExecs;     ///< Total number of subExecutors to use
+    bool                       useSubIndices;     ///< Use subexecutor indices
+    int                        numSubIndices;     ///< Number of subindices this ExeDevice has
+    int                        wallClockRate;     ///< (GFX-only) Device wall clock rate
+    vector<SubExecParam>       subExecParamCpu;   ///< Subexecutor parameters for this executor (host copy, all Transfers concatenated)
+    vector<TransferResources>  resources;         ///< Per-Transfer resources
+
+    // For GPU-Executors
+    SubExecParam*              subExecParamGpu;   ///< GPU copy of subExecutor parameters
+    vector<hipStream_t>        streams;           ///< HIP streams to launch on (one per Transfer, or a single shared stream)
+    vector<hipEvent_t>         startEvents;       ///< HIP start timing event (one per stream when HIP events are enabled)
+    vector<hipEvent_t>         stopEvents;        ///< HIP stop timing event (one per stream when HIP events are enabled)
+  };
+
+// Data validation-related functions
+//========================================================================================
+
+  // Deterministic pseudo-random value for element #idx of source buffer #srcBufferIdx
+  // The base pattern repeats every 383 elements and is scaled by (srcBufferIdx + 1)
+  static __host__ float PrepSrcValue(int srcBufferIdx, size_t idx)
+  {
+    size_t const scrambled = ((idx % 383) * 517) % 383;
+    size_t const pattern   = scrambled + 31;
+    return pattern * (srcBufferIdx + 1);
+  }
+
+  // Fills a pre-sized CPU buffer with reference data
+  // - bufferIdx >= 0: Contents of source buffer #bufferIdx
+  //                   (cfg.data.fillPattern tiled across the buffer if specified, otherwise PrepSrcValue)
+  // - bufferIdx <  0: Expected destination contents for a Transfer with (-bufferIdx - 1) sources
+  //                   (memset pattern when there are no sources, otherwise the element-wise sum of all sources)
+  static void PrepareReference(ConfigOptions const& cfg, std::vector<float>& cpuBuffer, int bufferIdx)
+  {
+    size_t const N = cpuBuffer.size();
+
+    // Source buffer
+    if (bufferIdx >= 0) {
+      // Use fill pattern if specified
+      size_t const patternLen = cfg.data.fillPattern.size();
+      if (patternLen > 0) {
+        size_t const copies   = N / patternLen;
+        size_t const leftOver = N % patternLen;
+        float* cpuBufferPtr = cpuBuffer.data();
+
+        // Whole copies of the pattern, followed by a truncated copy for the remainder
+        for (size_t i = 0; i < copies; i++) {
+          memcpy(cpuBufferPtr, cfg.data.fillPattern.data(), patternLen * sizeof(float));
+          cpuBufferPtr += patternLen;
+        }
+        if (leftOver)
+          memcpy(cpuBufferPtr, cfg.data.fillPattern.data(), leftOver * sizeof(float));
+      } else {
+        for (size_t i = 0; i < N; ++i)
+          cpuBuffer[i] = PrepSrcValue(bufferIdx, i);
+      }
+    } else { // Destination buffer
+      int const numSrcs = -bufferIdx - 1;
+
+      if (numSrcs == 0) {
+        // No sources: destination is expected to hold the byte-wise memset pattern
+        // NOTE(review): the previous comment here read "0x75757575 = 13323083.0" - confirm against MEMSET_CHAR / MEMSET_VAL
+        memset(cpuBuffer.data(), MEMSET_CHAR, N * sizeof(float));
+      } else {
+        // Start from source 0, then accumulate each additional source element by element
+        PrepareReference(cfg, cpuBuffer, 0);
+        if (numSrcs > 1) {
+          std::vector<float> temp(N);
+          for (int i = 1; i < numSrcs; i++) {
+            PrepareReference(cfg, temp, i);
+            // Index by element (j), not by source (i), so that every element receives the sum
+            for (size_t j = 0; j < N; j++) {
+              cpuBuffer[j] += temp[j];
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Compares every destination buffer of every Transfer against its expected contents
+  // - dstReference is indexed by the number of sources of the Transfer
+  // - CPU destinations (or any destination when data.validateDirect is set) are compared in place,
+  //   all others are first copied into outputBuffer
+  // Returns ERR_NONE if everything matches, otherwise a fatal error describing the first mismatch found
+  static ErrResult ValidateAllTransfers(ConfigOptions              const& cfg,
+                                        vector<Transfer>           const& transfers,
+                                        vector<TransferResources*> const& transferResources,
+                                        vector<vector<float>>      const& dstReference,
+                                        vector<float>&                    outputBuffer)
+  {
+    size_t const offsetInFloats = cfg.data.byteOffset / sizeof(float);
+
+    for (TransferResources* resource : transferResources) {
+      int const       transferIdx = resource->transferIdx;
+      Transfer const& transfer    = transfers[transferIdx];
+      size_t const    numFloats   = transfer.numBytes / sizeof(float);
+      float const*    reference   = dstReference[transfer.srcs.size()].data();
+
+      for (int dstIdx = 0; dstIdx < resource->dstMem.size(); dstIdx++) {
+        float* const dstStart = (resource->dstMem[dstIdx]) + offsetInFloats;
+        float*       actual   = dstStart;
+
+        // Stage the destination through the host buffer unless it can be read in place
+        bool const readInPlace = IsCpuMemType(transfer.dsts[dstIdx].memType) || cfg.data.validateDirect;
+        if (!readInPlace) {
+          ERR_CHECK(hipMemcpy(outputBuffer.data(), dstStart, transfer.numBytes, hipMemcpyDefault));
+          ERR_CHECK(hipDeviceSynchronize());
+          actual = outputBuffer.data();
+        }
+
+        // Common case: the whole buffer matches
+        if (memcmp(actual, reference, transfer.numBytes) == 0)
+          continue;
+
+        // Difference found - report the first mismatching element
+        for (size_t i = 0; i < numFloats; i++) {
+          if (actual[i] != reference[i]) {
+            return {ERR_FATAL, "Transfer %d: Unexpected mismatch at index %lu of destination %d: Expected %10.5f Actual: %10.5f",
+              transferIdx, i, dstIdx, reference[i], actual[i]};
+          }
+        }
+        return {ERR_FATAL, "Transfer %d: Unexpected output mismatch for destination %d", transferIdx, dstIdx};
+      }
+    }
+    return ERR_NONE;
+  }
+
+// Preparation-related functions
+//========================================================================================
+
+  // Builds the work description for every subexecutor of a single Transfer
+  // - In single team mode (GFX executor only) every subexecutor covers the whole array as a member of one team
+  // - Otherwise the array is cut into consecutive subarrays whose sizes are multiples of data.blockBytes
+  //   wherever possible; subexecutors beyond what the data can occupy receive no work
+  // - Selects the preferred XCC (the Transfer's executor subindex, or a lookup in gfx.prefXccTable)
+  // - Resets the Transfer's duration counter
+  static ErrResult PrepareSubExecParams(ConfigOptions const& cfg,
+                                        Transfer      const& transfer,
+                                        TransferResources&   resources)
+  {
+    size_t const numFloats    = transfer.numBytes / sizeof(float);
+    int    const offsetFloats = cfg.data.byteOffset / sizeof(float);
+    int    const blockFloats  = cfg.data.blockBytes / sizeof(float);
+
+    // Small Transfers may not contain enough blocks to give every subexecutor some work
+    size_t const totalBlocks    = (numFloats + blockFloats - 1) / blockFloats;
+    int    const usableSubExecs = std::min(totalBlocks, (size_t)transfer.numSubExecs);
+
+    bool const isGfx      = (transfer.exeDevice.exeType == EXE_GPU_GFX);
+    bool const singleTeam = cfg.gfx.useSingleTeam && isGfx;
+
+    vector<vector<int>> const& xccTable = cfg.gfx.prefXccTable;
+    vector<SubExecParam>&      params   = resources.subExecParamCpu;
+    params.clear();
+    params.resize(transfer.numSubExecs);
+
+    size_t numAssigned = 0;
+    for (int subExecIdx = 0; subExecIdx < transfer.numSubExecs; ++subExecIdx) {
+      SubExecParam& p = params[subExecIdx];
+      p.numSrcs    = resources.srcMem.size();
+      p.numDsts    = resources.dstMem.size();
+      p.startCycle = 0;
+      p.stopCycle  = 0;
+      p.hwId       = 0;
+      p.xccId      = 0;
+
+      // Offset (in floats, past the initial byte offset) at which this subexecutor starts
+      size_t startOffset = 0;
+      if (singleTeam) {
+        // The whole team stripes across the entire array
+        p.N        = numFloats;
+        p.teamSize = transfer.numSubExecs;
+        p.teamIdx  = subExecIdx;
+      } else {
+        // Spread the unassigned blocks over the subexecutors that still need work
+        int    const subExecsLeft = std::max(0, usableSubExecs - subExecIdx);
+        size_t const floatsLeft   = numFloats - numAssigned;
+        size_t const blocksLeft   = (floatsLeft + blockFloats - 1) / blockFloats;
+
+        if (subExecsLeft)
+          p.N = std::min(floatsLeft, ((blocksLeft / subExecsLeft) * blockFloats));
+        else
+          p.N = 0;
+        p.teamSize = 1;
+        p.teamIdx  = 0;
+
+        startOffset  = numAssigned;
+        numAssigned += p.N;
+      }
+
+      for (int iSrc = 0; iSrc < p.numSrcs; ++iSrc)
+        p.src[iSrc] = resources.srcMem[iSrc] + offsetFloats + startOffset;
+      for (int iDst = 0; iDst < p.numDsts; ++iDst)
+        p.dst[iDst] = resources.dstMem[iDst] + offsetFloats + startOffset;
+
+      // Default to the executor subindex; the XCC table only applies to GFX Transfers without an
+      // explicit subindex that write to exactly one GPU destination
+      p.preferredXccId = transfer.exeSubIndex;
+      bool const useXccTable = isGfx && transfer.exeSubIndex == -1 && !xccTable.empty() &&
+                               transfer.dsts.size() == 1 && IsGpuMemType(transfer.dsts[0].memType);
+      if (useXccTable) {
+        if (xccTable.size() <= transfer.exeDevice.exeIndex ||
+            xccTable[transfer.exeDevice.exeIndex].size() <= transfer.dsts[0].memIndex) {
+          return {ERR_FATAL, "[gfx.xccPrefTable] is too small"};
+        }
+        p.preferredXccId = xccTable[transfer.exeDevice.exeIndex][transfer.dsts[0].memIndex];
+        if (p.preferredXccId < 0 || p.preferredXccId >= GetNumExecutorSubIndices(transfer.exeDevice)) {
+          return {ERR_FATAL, "[gfx.xccPrefTable] defines out-of-bound XCC index %d", p.preferredXccId};
+        }
+      }
+    }
+
+    // Clear counters
+    resources.totalDurationMsec = 0.0;
+
+    return ERR_NONE;
+  }
+
+  // Prepare each executor
+  // - Allocates source / destination memory for every Transfer run by this executor
+  //   (enabling peer access first when a GFX executor touches memory on another GPU)
+  // - Collects HSA agents and creates a completion signal for DMA Transfers that target a
+  //   specific engine or use HSA copies
+  // - Prepares subexecutor parameters for every Transfer
+  // - GPU executors: creates streams (CU-masked if requested) and, if enabled, HIP timing events
+  // - GFX executor : builds the executor-wide subexecutor parameter array and copies it to the GPU
+  // Returns ERR_NONE on success; failures are reported through ERR_CHECK / explicit ERR_FATAL results
+  static ErrResult PrepareExecutor(ConfigOptions    const& cfg,
+                                   vector<Transfer> const& transfers,
+                                   ExeDevice        const& exeDevice,
+                                   ExeInfo&                exeInfo)
+  {
+    exeInfo.totalDurationMsec = 0.0;
+
+    // Loop over each transfer this executor is involved in
+    for (auto& resources : exeInfo.resources) {
+      Transfer const& t = transfers[resources.transferIdx];
+      resources.numBytes = t.numBytes;
+
+      // Allocate source memory
+      resources.srcMem.resize(t.srcs.size());
+      for (int iSrc = 0; iSrc < t.srcs.size(); ++iSrc) {
+        MemDevice const& srcMemDevice = t.srcs[iSrc];
+
+        // Ensure executing GPU can access source memory
+        if (exeDevice.exeType == EXE_GPU_GFX && IsGpuMemType(srcMemDevice.memType) &&
+            srcMemDevice.memIndex != exeDevice.exeIndex) {
+          ERR_CHECK(EnablePeerAccess(exeDevice.exeIndex, srcMemDevice.memIndex));
+        }
+        // Allocation includes room for the initial byte offset
+        ERR_CHECK(AllocateMemory(srcMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&resources.srcMem[iSrc]));
+      }
+
+      // Allocate destination memory
+      resources.dstMem.resize(t.dsts.size());
+      for (int iDst = 0; iDst < t.dsts.size(); ++iDst) {
+        MemDevice const& dstMemDevice = t.dsts[iDst];
+
+        // Ensure executing GPU can access destination memory
+        if (exeDevice.exeType == EXE_GPU_GFX && IsGpuMemType(dstMemDevice.memType) &&
+            dstMemDevice.memIndex != exeDevice.exeIndex) {
+          ERR_CHECK(EnablePeerAccess(exeDevice.exeIndex, dstMemDevice.memIndex));
+        }
+        // Allocation includes room for the initial byte offset
+        ERR_CHECK(AllocateMemory(dstMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&resources.dstMem[iDst]));
+      }
+
+      if (exeDevice.exeType == EXE_GPU_DMA && (t.exeSubIndex != -1 || cfg.dma.useHsaCopy)) {
+#if !defined(__NVCC__)
+        // Collect HSA agent information (only the first source / destination is consulted)
+        hsa_amd_pointer_info_t info;
+        info.size = sizeof(info);
+        ERR_CHECK(hsa_amd_pointer_info(resources.dstMem[0], &info, NULL, NULL, NULL));
+        resources.dstAgent = info.agentOwner;
+
+        ERR_CHECK(hsa_amd_pointer_info(resources.srcMem[0], &info, NULL, NULL, NULL));
+        resources.srcAgent = info.agentOwner;
+
+        // Create HSA completion signal
+        ERR_CHECK(hsa_signal_create(1, 0, NULL, &resources.signal));
+
+        // The executor subindex selects a single engine, expressed as a one-bit mask
+        if (t.exeSubIndex != -1)
+          resources.sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << t.exeSubIndex);
+#endif
+      }
+
+      // Prepare subexecutor parameters
+      ERR_CHECK(PrepareSubExecParams(cfg, t, resources));
+    }
+
+    // Prepare additional requirements for GPU-based executors
+    if (exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA) {
+      ERR_CHECK(hipSetDevice(exeDevice.exeIndex));
+
+      // Determine how many streams to use
+      // (one per Transfer for DMA and multi-stream GFX, otherwise a single shared stream)
+      int const numStreamsToUse = (exeDevice.exeType == EXE_GPU_DMA ||
+                                   (exeDevice.exeType == EXE_GPU_GFX && cfg.gfx.useMultiStream))
+        ? exeInfo.resources.size() : 1;
+      exeInfo.streams.resize(numStreamsToUse);
+
+      // Create streams
+      for (int i = 0; i < numStreamsToUse; ++i) {
+        if (cfg.gfx.cuMask.size()) {
+#if !defined(__NVCC__)
+          ERR_CHECK(hipExtStreamCreateWithCUMask(&exeInfo.streams[i], cfg.gfx.cuMask.size(),
+                                                 cfg.gfx.cuMask.data()));
+#else
+          return {ERR_FATAL, "CU Masking in not supported on NVIDIA hardware"};
+#endif
+        } else {
+          ERR_CHECK(hipStreamCreate(&exeInfo.streams[i]));
+        }
+      }
+
+      // One start / stop event pair per stream
+      if (cfg.gfx.useHipEvents || cfg.dma.useHipEvents) {
+        exeInfo.startEvents.resize(numStreamsToUse);
+        exeInfo.stopEvents.resize(numStreamsToUse);
+        for (int i = 0; i < numStreamsToUse; ++i) {
+          ERR_CHECK(hipEventCreate(&exeInfo.startEvents[i]));
+          ERR_CHECK(hipEventCreate(&exeInfo.stopEvents[i]));
+        }
+      }
+    }
+
+    // Prepare for GPU GFX executor
+    if (exeDevice.exeType == EXE_GPU_GFX) {
+      // Allocate one contiguous chunk of GPU memory for threadblock parameters
+      // This allows support for executing one transfer per stream, or all transfers in a single stream
+#if !defined(__NVCC__)
+      MemType memType = MEM_GPU;      // AMD hardware can directly access GPU memory from host
+#else
+      MemType memType = MEM_MANAGED;  // NVIDIA hardware requires managed memory to access from host
+#endif
+      ERR_CHECK(AllocateMemory({memType, exeDevice.exeIndex}, exeInfo.totalSubExecs * sizeof(SubExecParam),
+                               (void**)&exeInfo.subExecParamGpu));
+
+      // Create subexecutor parameter array for entire executor
+      exeInfo.subExecParamCpu.clear();
+      exeInfo.numSubIndices = GetNumExecutorSubIndices(exeDevice);
+#if defined(__NVCC__)
+      exeInfo.wallClockRate = 1000000;
+#else
+      ERR_CHECK(hipDeviceGetAttribute(&exeInfo.wallClockRate, hipDeviceAttributeWallClockRate,
+                                      exeDevice.exeIndex));
+#endif
+      // Concatenate the parameters of every Transfer, remembering where each Transfer starts
+      int transferOffset = 0;
+      for (auto& resources : exeInfo.resources) {
+        resources.subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset;
+        // Iterate by reference: SubExecParam is large and is copied once by push_back anyway
+        for (auto const& p : resources.subExecParamCpu) {
+          resources.subExecIdx.push_back(exeInfo.subExecParamCpu.size());
+          exeInfo.subExecParamCpu.push_back(p);
+          transferOffset++;
+        }
+      }
+
+      // Copy sub executor parameters to GPU
+      ERR_CHECK(hipSetDevice(exeDevice.exeIndex));
+      ERR_CHECK(hipMemcpy(exeInfo.subExecParamGpu,
+                          exeInfo.subExecParamCpu.data(),
+                          exeInfo.totalSubExecs * sizeof(SubExecParam),
+                          hipMemcpyHostToDevice));
+      ERR_CHECK(hipDeviceSynchronize());
+    }
+
+    return ERR_NONE;
+  }
+
+// Teardown-related functions
+//========================================================================================
+
+  // Releases everything that was set up for this executor
+  // - Source / destination memory of every Transfer, plus the HSA completion signal where one was created
+  // - Streams and timing events of GPU-based executors
+  // - The GPU-side subexecutor parameter array of the GFX executor
+  static ErrResult TeardownExecutor(ConfigOptions    const& cfg,
+                                    ExeDevice        const& exeDevice,
+                                    vector<Transfer> const& transfers,
+                                    ExeInfo&                exeInfo)
+  {
+    bool const isGfxExecutor = (exeDevice.exeType == EXE_GPU_GFX);
+    bool const isDmaExecutor = (exeDevice.exeType == EXE_GPU_DMA);
+
+    // Per-Transfer resources
+    for (auto& resources : exeInfo.resources) {
+      Transfer const& t = transfers[resources.transferIdx];
+
+      // Every buffer was allocated with room for the initial byte offset
+      auto const allocBytes = t.numBytes + cfg.data.byteOffset;
+
+      for (int iSrc = 0; iSrc < t.srcs.size(); ++iSrc) {
+        ERR_CHECK(DeallocateMemory(t.srcs[iSrc].memType, resources.srcMem[iSrc], allocBytes));
+      }
+      for (int iDst = 0; iDst < t.dsts.size(); ++iDst) {
+        ERR_CHECK(DeallocateMemory(t.dsts[iDst].memType, resources.dstMem[iDst], allocBytes));
+      }
+
+#if !defined(__NVCC__)
+      // A completion signal only exists for DMA Transfers on a specific engine or using HSA copies
+      bool const hasHsaSignal = isDmaExecutor && (t.exeSubIndex != -1 || cfg.dma.useHsaCopy);
+      if (hasHsaSignal) {
+        ERR_CHECK(hsa_signal_destroy(resources.signal));
+      }
+#endif
+    }
+
+    // Streams and events of GPU-based executors
+    if (isGfxExecutor || isDmaExecutor) {
+      for (auto stream : exeInfo.streams) {
+        ERR_CHECK(hipStreamDestroy(stream));
+      }
+      if (cfg.gfx.useHipEvents || cfg.dma.useHipEvents) {
+        for (auto startEvent : exeInfo.startEvents) {
+          ERR_CHECK(hipEventDestroy(startEvent));
+        }
+        for (auto stopEvent : exeInfo.stopEvents) {
+          ERR_CHECK(hipEventDestroy(stopEvent));
+        }
+      }
+    }
+
+    // GPU copy of the subexecutor parameters (GFX executor only)
+    if (isGfxExecutor) {
+#if defined(__NVCC__)
+      MemType memType = MEM_MANAGED;
+#else
+      MemType memType = MEM_GPU;
+#endif
+      ERR_CHECK(DeallocateMemory(memType, exeInfo.subExecParamGpu, exeInfo.totalSubExecs * sizeof(SubExecParam)));
+    }
+
+    return ERR_NONE;
+  }
+
+// CPU Executor-related functions
+//========================================================================================
+
+  // Kernel for CPU execution (run by a single subexecutor)
+  // - 0 sources           : Fills every destination with the MEMSET_CHAR byte pattern
+  // - 1 source,  0 dsts   : Read-only pass over the source
+  // - 1 source, >=1 dsts  : Copies the source into every destination
+  // - 2+ sources          : Writes the element-wise sum of all sources into every destination
+  static void CpuReduceKernel(SubExecParam const& p)
+  {
+    if (p.N == 0) return;
+
+    int const numSrcs = p.numSrcs;
+    int const numDsts = p.numDsts;
+
+    // Element indices are size_t to match p.N; an int index would overflow for very large subarrays
+    size_t const N = p.N;
+
+    if (numSrcs == 0) {
+      for (int i = 0; i < numDsts; ++i)
+        memset(p.dst[i], MEMSET_CHAR, N * sizeof(float));
+    } else if (numSrcs == 1) {
+      float const* __restrict__ src = p.src[0];
+      if (numDsts == 0) {
+        float sum = 0.0f;
+        for (size_t j = 0; j < N; j++)
+          sum += src[j];
+
+        // Add a dummy check to ensure the read is not optimized out
+        if (sum != sum) {
+          printf("[ERROR] Nan detected\n");
+        }
+      } else {
+        for (int i = 0; i < numDsts; ++i)
+          memcpy(p.dst[i], src, N * sizeof(float));
+      }
+    } else {
+      for (size_t j = 0; j < N; j++) {
+        float sum = p.src[0][j];
+        for (int i = 1; i < numSrcs; i++) sum += p.src[i][j];
+        for (int i = 0; i < numDsts; i++) p.dst[i][j] = sum;
+      }
+    }
+  }
+
+  // Execution of a single CPU Transfer
+  // - Each sub-iteration launches one thread per subexecutor and waits for all of them to finish
+  // - The elapsed time is only recorded for non-negative iterations
+  static void ExecuteCpuTransfer(int           const  iteration,
+                                 ConfigOptions const& cfg,
+                                 int           const  exeIndex,
+                                 TransferResources&   resources)
+  {
+    using Clock = std::chrono::high_resolution_clock;
+    auto const startTime = Clock::now();
+
+    vector<std::thread> workers;
+    for (int subIteration = 1; ; ++subIteration) {
+      // Launch one worker per subexecutor
+      for (auto const& subExecParam : resources.subExecParamCpu)
+        workers.emplace_back(CpuReduceKernel, std::cref(subExecParam));
+
+      // Wait for this sub-iteration to complete
+      for (auto& worker : workers)
+        worker.join();
+      workers.clear();
+
+      if (subIteration == cfg.general.numSubIterations) break;
+    }
+
+    std::chrono::duration<double> const elapsedSec = Clock::now() - startTime;
+    double const elapsedMsec = elapsedSec.count() * 1000.0;
+
+    if (iteration < 0) return;
+
+    resources.totalDurationMsec += elapsedMsec;
+    if (cfg.general.recordPerIteration)
+      resources.perIterMsec.push_back(elapsedMsec);
+  }
+
+  // Execution of a single CPU executor
+  // - Restricts execution to NUMA node exeIndex
+  // - Runs every Transfer of this executor in its own thread and waits for all of them
+  // - The elapsed time is only accumulated for non-negative iterations
+  static ErrResult RunCpuExecutor(int           const  iteration,
+                                  ConfigOptions const& cfg,
+                                  int           const  exeIndex,
+                                  ExeInfo&             exeInfo)
+  {
+    using Clock = std::chrono::high_resolution_clock;
+
+    numa_run_on_node(exeIndex);
+    auto const startTime = Clock::now();
+
+    // Launch all Transfers, then wait for every one of them
+    vector<std::thread> transferThreads;
+    for (auto& resource : exeInfo.resources)
+      transferThreads.emplace_back(ExecuteCpuTransfer, iteration, std::cref(cfg), exeIndex, std::ref(resource));
+
+    for (auto& transferThread : transferThreads)
+      transferThread.join();
+
+    std::chrono::duration<double> const elapsedSec = Clock::now() - startTime;
+    double const elapsedMsec = elapsedSec.count() * 1000.0;
+
+    if (iteration >= 0)
+      exeInfo.totalDurationMsec += elapsedMsec;
+    return ERR_NONE;
+  }
+
+// GFX Executor-related functions
+//========================================================================================
+
+  // Converts register value to a CU/SM index
+  // - NVIDIA builds: the register value is returned unchanged
+  // - Otherwise    : the shader-array, CU and shader-engine bit-fields are packed into one index
+  static uint32_t GetId(uint32_t hwId)
+  {
+    // Note: the guard must be __NVCC__ (two trailing underscores) to ever select the NVIDIA branch
+#if defined(__NVCC__)
+    return hwId;
+#else
+    // Based on instinct-mi200-cdna2-instruction-set-architecture.pdf
+    int const shId = (hwId >> 12) &  1;  // bit  12
+    int const cuId = (hwId >>  8) & 15;  // bits  8-11
+    int const seId = (hwId >> 13) &  3;  // bits 13-14
+    return (shId << 5) + (cuId << 2) + seId;
+#endif
+  }
+
+  // Device level timestamp function used for in-kernel timing
+  // - NVIDIA builds read the %globaltimer register
+  // - Otherwise wall_clock64() is used
+  __device__ int64_t GetTimestamp()
+  {
+#if !defined(__NVCC__)
+    return wall_clock64();
+#else
+    int64_t globalTimer;
+    asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(globalTimer));
+    return globalTimer;
+#endif
+  }
+
+  // Helper function for memset: returns the fill value for element type T
+  // Specializations are provided for float and float4
+  template <typename T>
+  __device__ __forceinline__ T MemsetVal();
+
+  template <>
+  __device__ __forceinline__ float MemsetVal<float>()
+  {
+    return MEMSET_VAL;
+  }
+
+  template <>
+  __device__ __forceinline__ float4 MemsetVal<float4>()
+  {
+    float const fill = MEMSET_VAL;
+    return make_float4(fill, fill, fill, fill);
+  }
+
+  // Kernel for GFX execution: the threadblock with index blockIdx.y processes params[blockIdx.y], writing the sum of all sources (or the memset value if there are none) to every destination
+  template <int BLOCKSIZE, int UNROLL>
+  __global__ void __launch_bounds__(BLOCKSIZE)
+    GpuReduceKernel(SubExecParam* params, int waveOrder, int numSubIterations)
+  {
+    int64_t startCycle;
+    if (threadIdx.x == 0) startCycle = GetTimestamp(); // Only thread 0 records (and later stores) the start timestamp
+
+    SubExecParam& p = params[blockIdx.y];
+
+    // Filter by XCC: threadblocks not running on the preferred XCC exit without doing any work (-1 disables the filter)
+#if !defined(__NVCC__)
+    int32_t xccId;
+    GetXccId(xccId);
+    if (p.preferredXccId != -1 && xccId != p.preferredXccId) return;
+#endif
+
+    // Collect data information (float4 views of the source / destination pointers are used by the first two loops)
+    int32_t const  numSrcs  = p.numSrcs;
+    int32_t const  numDsts  = p.numDsts;
+    float4  const* __restrict__ srcFloat4[MAX_SRCS];
+    float4*        __restrict__ dstFloat4[MAX_DSTS];
+    for (int i = 0; i < numSrcs; i++) srcFloat4[i] = (float4*)p.src[i];
+    for (int i = 0; i < numDsts; i++) dstFloat4[i] = (float4*)p.dst[i];
+
+    // Operate on wavefront granularity
+    int32_t const nTeams   = p.teamSize;             // Number of threadblocks working together on this subarray
+    int32_t const teamIdx  = p.teamIdx;              // Index of this threadblock within the team
+    int32_t const nWaves   = BLOCKSIZE   / warpSize; // Number of wavefronts within this threadblock
+    int32_t const waveIdx  = threadIdx.x / warpSize; // Index of this wavefront within the threadblock
+    int32_t const tIdx     = threadIdx.x % warpSize; // Thread index within wavefront
+
+    size_t  const numFloat4 = p.N / 4; // Number of whole float4 elements; any remaining floats are handled by the third loop
+    // Strides in units of warpSize float4s; in each case label the first letter has stride 1 (U = unroll, W = wavefront, C = team). NOTE(review): no default case, so waveOrder outside 0-5 leaves the strides uninitialized
+    int32_t teamStride, waveStride, unrlStride, teamStride2, waveStride2;
+    switch (waveOrder) {
+    case 0: /* U,W,C */ unrlStride = 1; waveStride = UNROLL; teamStride = UNROLL * nWaves;  teamStride2 = nWaves; waveStride2 = 1     ; break;
+    case 1: /* U,C,W */ unrlStride = 1; teamStride = UNROLL; waveStride = UNROLL * nTeams;  teamStride2 = 1;      waveStride2 = nTeams; break;
+    case 2: /* W,U,C */ waveStride = 1; unrlStride = nWaves; teamStride = nWaves * UNROLL;  teamStride2 = nWaves; waveStride2 = 1     ; break;
+    case 3: /* W,C,U */ waveStride = 1; teamStride = nWaves; unrlStride = nWaves * nTeams;  teamStride2 = nWaves; waveStride2 = 1     ; break;
+    case 4: /* C,U,W */ teamStride = 1; unrlStride = nTeams; waveStride = nTeams * UNROLL;  teamStride2 = 1;      waveStride2 = nTeams; break;
+    case 5: /* C,W,U */ teamStride = 1; waveStride = nTeams; unrlStride = nTeams * nWaves;  teamStride2 = 1;      waveStride2 = nTeams; break;
+    }
+
+    int subIterations = 0; // Number of completed passes over the data
+    while (1) {
+      // First loop: Each wavefront in the team works on UNROLL float4s per thread
+      size_t const loop1Stride = nTeams * nWaves * UNROLL * warpSize; // float4s consumed by the whole team per pass of the first loop
+      size_t const loop1Limit  = numFloat4 / loop1Stride * loop1Stride; // Largest multiple of loop1Stride not exceeding numFloat4
+      {
+        float4 val[UNROLL];
+        if (numSrcs == 0) {
+          #pragma unroll
+          for (int u = 0; u < UNROLL; u++)
+            val[u] = MemsetVal<float4>();
+        }
+
+        for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx; idx < loop1Limit; idx += loop1Stride) {
+          // Read sources into memory and accumulate in registers
+          if (numSrcs) {
+            for (int u = 0; u < UNROLL; u++)
+              val[u] = srcFloat4[0][idx + u * unrlStride * warpSize];
+            for (int s = 1; s < numSrcs; s++)
+              for (int u = 0; u < UNROLL; u++)
+                val[u] += srcFloat4[s][idx + u * unrlStride * warpSize];
+          }
+
+          // Write accumulation to all outputs
+          for (int d = 0; d < numDsts; d++) {
+            #pragma unroll
+            for (int u = 0; u < UNROLL; u++)
+              dstFloat4[d][idx + u * unrlStride * warpSize] = val[u];
+          }
+        }
+      }
+
+      // Second loop: Deal with remaining float4s (one float4 per thread per pass, no unrolling)
+      {
+        if (loop1Limit < numFloat4) {
+          float4 val;
+          if (numSrcs == 0) val = MemsetVal<float4>();
+
+          size_t const loop2Stride = nTeams * nWaves * warpSize;
+          for (size_t idx = loop1Limit + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx;
+               idx < numFloat4; idx += loop2Stride) {
+            if (numSrcs) {
+              val = srcFloat4[0][idx];
+              for (int s = 1; s < numSrcs; s++)
+                val += srcFloat4[s][idx];
+            }
+            for (int d = 0; d < numDsts; d++)
+              dstFloat4[d][idx] = val;
+          }
+        }
+      }
+
+      // Third loop; Deal with remaining floats (the tail when p.N is not a multiple of 4, indexed in floats)
+      {
+        if (numFloat4 * 4 < p.N) {
+          float val;
+          if (numSrcs == 0) val = MemsetVal<float>();
+
+          size_t const loop3Stride = nTeams * nWaves * warpSize;
+          for( size_t idx = numFloat4 * 4 + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride) {
+            if (numSrcs) {
+              val = p.src[0][idx];
+              for (int s = 1; s < numSrcs; s++)
+                val += p.src[s][idx];
+            }
+
+            for (int d = 0; d < numDsts; d++)
+              p.dst[d][idx] = val;
+          }
+        }
+      }
+
+      if (++subIterations == numSubIterations) break; // NOTE(review): numSubIterations <= 0 never matches here - presumably intended to loop indefinitely; confirm
+    }
+
+    // Wait for all threads to finish
+    __syncthreads();
+    if (threadIdx.x == 0) { // Thread 0 publishes the timing / placement outputs for this threadblock
+      __threadfence_system();
+      p.stopCycle  = GetTimestamp();
+      p.startCycle = startCycle;
+      GetHwId(p.hwId);
+      GetXccId(p.xccId);
+    }
+  }
+
+#define GPU_KERNEL_UNROLL_DECL(BLOCKSIZE)   \
+    {GpuReduceKernel<BLOCKSIZE, 1>,         \
+     GpuReduceKernel<BLOCKSIZE, 2>,         \
+     GpuReduceKernel<BLOCKSIZE, 3>,         \
+     GpuReduceKernel<BLOCKSIZE, 4>,         \
+     GpuReduceKernel<BLOCKSIZE, 5>,         \
+     GpuReduceKernel<BLOCKSIZE, 6>,         \
+     GpuReduceKernel<BLOCKSIZE, 7>,         \
+     GpuReduceKernel<BLOCKSIZE, 8>}
+
+  // Table of all GPU Reduction kernel instantiations: one row per block size (64..512 in steps of 64), one column per unroll factor (1..8); callers index it as [blockSize/64 - 1][unrollFactor - 1]
+  typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int);
+  GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL] =
+  {
+    GPU_KERNEL_UNROLL_DECL(64),
+    GPU_KERNEL_UNROLL_DECL(128),
+    GPU_KERNEL_UNROLL_DECL(192),
+    GPU_KERNEL_UNROLL_DECL(256),
+    GPU_KERNEL_UNROLL_DECL(320),
+    GPU_KERNEL_UNROLL_DECL(384),
+    GPU_KERNEL_UNROLL_DECL(448),
+    GPU_KERNEL_UNROLL_DECL(512)
+  };
+  #undef GPU_KERNEL_UNROLL_DECL
+
+  // Launch the reduction kernel for one Transfer on its own stream and block
+  // until that stream has finished (used when each Transfer gets its own stream).
+  //
+  //   iteration   - iteration number; negative values are warm-ups and are not recorded
+  //   stream      - stream the kernel is launched on
+  //   startEvent  - event recorded around the launch for timing, or NULL to fall
+  //   stopEvent     back to host-side (CPU) timing
+  //   xccDim      - x-dimension of the launch grid
+  //   cfg         - configuration (block size, unroll factor, wave order, ...)
+  //   resources   - per-Transfer state; timing results are accumulated into it
+  //
+  // Returns ERR_NONE on success, otherwise the first error reported by ERR_CHECK.
+  static ErrResult ExecuteGpuTransfer(int           const  iteration,
+                                      hipStream_t   const  stream,
+                                      hipEvent_t    const  startEvent,
+                                      hipEvent_t    const  stopEvent,
+                                      int           const  xccDim,
+                                      ConfigOptions const& cfg,
+                                      TransferResources&   resources)
+  {
+    auto const hostBegin = std::chrono::high_resolution_clock::now();
+
+    // One row of the grid per subexecutor of this Transfer
+    int const subExecCount = resources.subExecParamCpu.size();
+    dim3 const gridSize(xccDim, subExecCount, 1);
+    dim3 const blockSize(cfg.gfx.blockSize, 1);
+
+    // Select the kernel instantiation matching the requested block size / unroll factor
+    GpuKernelFuncPtr const kernel =
+      GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1];
+
+#if defined(__NVCC__)
+    if (startEvent != NULL)
+      ERR_CHECK(hipEventRecord(startEvent, stream));
+
+    kernel<<<gridSize, blockSize, 0, stream>>>
+      (resources.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations);
+
+    if (stopEvent != NULL)
+      ERR_CHECK(hipEventRecord(stopEvent, stream));
+#else
+    // The extended launch records the start/stop events as part of the launch itself
+    hipExtLaunchKernelGGL(kernel,
+                          gridSize, blockSize, 0, stream, startEvent, stopEvent,
+                          0, resources.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations);
+#endif
+
+    ERR_CHECK(hipStreamSynchronize(stream));
+
+    auto const hostElapsed = std::chrono::high_resolution_clock::now() - hostBegin;
+    double const cpuDeltaMsec =
+      std::chrono::duration_cast<std::chrono::duration<double>>(hostElapsed).count() * 1000.0;
+
+    // Warm-up iterations are executed but never recorded
+    if (iteration < 0)
+      return ERR_NONE;
+
+    // Prefer the event-based measurement whenever events were supplied
+    double deltaMsec = cpuDeltaMsec;
+    if (startEvent != NULL) {
+      float gpuDeltaMsec;
+      ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
+      deltaMsec = gpuDeltaMsec;
+    }
+    resources.totalDurationMsec += deltaMsec;
+
+    if (cfg.general.recordPerIteration) {
+      resources.perIterMsec.push_back(deltaMsec);
+
+      // Collect the distinct (XCC, hardware id) pairs the subexecutors reported
+      std::set<std::pair<int,int>> usedCUs;
+      for (int subExec = 0; subExec < subExecCount; subExec++) {
+        auto const& param = resources.subExecParamGpuPtr[subExec];
+        usedCUs.insert(std::make_pair(param.xccId, GetId(param.hwId)));
+      }
+      resources.perIterCUs.push_back(usedCUs);
+    }
+    return ERR_NONE;
+  }
+
+  // Run all Transfers assigned to one GFX (kernel-based) executor for one iteration.
+  //
+  // Two launch strategies are supported:
+  //  - multi-stream : every Transfer is launched asynchronously on its own stream
+  //                   via ExecuteGpuTransfer, which also records per-Transfer timing
+  //  - single launch: all Transfers are combined into one kernel launch on stream 0;
+  //                   per-Transfer timing is then derived from the start/stop cycle
+  //                   counters each subexecutor wrote into subExecParamGpu
+  //
+  //   iteration - iteration number; negative values are warm-ups and are not recorded
+  //   cfg       - configuration options
+  //   exeIndex  - device index passed to hipSetDevice
+  //   exeInfo   - executor state; timing results are accumulated into it
+  //
+  // Returns ERR_NONE on success, otherwise the first error encountered.
+  static ErrResult RunGpuExecutor(int           const  iteration,
+                                  ConfigOptions const& cfg,
+                                  int           const  exeIndex,
+                                  ExeInfo&             exeInfo)
+  {
+    auto const hostBegin = std::chrono::high_resolution_clock::now();
+    ERR_CHECK(hipSetDevice(exeIndex));
+
+    int const xccDim = exeInfo.useSubIndices ? exeInfo.numSubIndices : 1;
+
+    if (!cfg.gfx.useMultiStream) {
+      // Combine all the Transfers into a single kernel launch
+      int const totalSubExecs = exeInfo.totalSubExecs;
+      dim3 const gridSize(xccDim, totalSubExecs, 1);
+      dim3 const blockSize(cfg.gfx.blockSize, 1);
+      hipStream_t stream = exeInfo.streams[0];
+
+      GpuKernelFuncPtr const kernel =
+        GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1];
+
+#if defined(__NVCC__)
+      if (cfg.gfx.useHipEvents)
+        ERR_CHECK(hipEventRecord(exeInfo.startEvents[0], stream));
+
+      kernel<<<gridSize, blockSize, 0 , stream>>>
+        (exeInfo.subExecParamGpu, cfg.gfx.waveOrder, cfg.general.numSubIterations);
+
+      if (cfg.gfx.useHipEvents)
+        ERR_CHECK(hipEventRecord(exeInfo.stopEvents[0], stream));
+#else
+      hipExtLaunchKernelGGL(kernel,
+                            gridSize, blockSize, 0, stream,
+                            cfg.gfx.useHipEvents ? exeInfo.startEvents[0] : NULL,
+                            cfg.gfx.useHipEvents ? exeInfo.stopEvents[0] : NULL, 0,
+                            exeInfo.subExecParamGpu, cfg.gfx.waveOrder, cfg.general.numSubIterations);
+#endif
+      ERR_CHECK(hipStreamSynchronize(stream));
+    } else {
+      // Launch each Transfer separately in its own stream
+      vector<std::future<ErrResult>> pending;
+      for (size_t s = 0; s < exeInfo.streams.size(); s++) {
+        pending.emplace_back(std::async(std::launch::async,
+                                        ExecuteGpuTransfer,
+                                        iteration,
+                                        exeInfo.streams[s],
+                                        cfg.gfx.useHipEvents ? exeInfo.startEvents[s] : NULL,
+                                        cfg.gfx.useHipEvents ? exeInfo.stopEvents[s] : NULL,
+                                        xccDim,
+                                        std::cref(cfg),
+                                        std::ref(exeInfo.resources[s])));
+      }
+      for (auto& task : pending)
+        ERR_CHECK(task.get());
+    }
+
+    auto const hostElapsed = std::chrono::high_resolution_clock::now() - hostBegin;
+    double const cpuDeltaMsec =
+      std::chrono::duration_cast<std::chrono::duration<double>>(hostElapsed).count() * 1000.0;
+
+    // Warm-up iterations are executed but never recorded
+    if (iteration < 0)
+      return ERR_NONE;
+
+    // Executor-level duration: event timing is only meaningful for the single launch
+    if (cfg.gfx.useHipEvents && !cfg.gfx.useMultiStream) {
+      float gpuDeltaMsec;
+      ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, exeInfo.startEvents[0], exeInfo.stopEvents[0]));
+      exeInfo.totalDurationMsec += gpuDeltaMsec;
+    } else {
+      exeInfo.totalDurationMsec += cpuDeltaMsec;
+    }
+
+    // In multi-stream mode each Transfer already recorded its own timing
+    if (cfg.gfx.useMultiStream)
+      return ERR_NONE;
+
+    // Determine timing for each of the individual transfers that were part of this launch
+    for (auto& resources : exeInfo.resources) {
+      long long earliestStart = std::numeric_limits<long long>::max();
+      long long latestStop    = std::numeric_limits<long long>::min();
+      std::set<std::pair<int, int>> usedCUs;
+
+      for (auto subExecIdx : resources.subExecIdx) {
+        auto const& param = exeInfo.subExecParamGpu[subExecIdx];
+        earliestStart = std::min(earliestStart, param.startCycle);
+        latestStop    = std::max(latestStop,    param.stopCycle);
+        if (cfg.general.recordPerIteration)
+          usedCUs.insert(std::make_pair(param.xccId, GetId(param.hwId)));
+      }
+
+      // Convert the cycle span into milliseconds using the executor's wall clock rate
+      double const deltaMsec = (latestStop - earliestStart) / (double)(exeInfo.wallClockRate);
+
+      resources.totalDurationMsec += deltaMsec;
+      if (cfg.general.recordPerIteration) {
+        resources.perIterMsec.push_back(deltaMsec);
+        resources.perIterCUs.push_back(usedCUs);
+      }
+    }
+    return ERR_NONE;
+  }
+
+
+// DMA Executor-related functions
+//========================================================================================
+
+  // Execute a single DMA Transfer and block until it has completed.
+  //
+  // The copy is repeated cfg.general.numSubIterations times (always at least once).
+  // Two back ends are available:
+  //  - hipMemcpyAsync on the given stream (when no executor subindex is requested
+  //    and cfg.dma.useHsaCopy is off), optionally timed with HIP events
+  //  - HSA async copy (not available when compiled with NVCC), using a specific
+  //    SDMA engine when useSubIndices is set; always timed on the host
+  //
+  //   iteration     - iteration number; negative values are warm-ups and are not recorded
+  //   useSubIndices - true if a specific DMA engine (resources.sdmaEngineId) must be used
+  //   stream        - stream used for the hipMemcpyAsync path
+  //   startEvent    - events used for timing on the hipMemcpyAsync path when
+  //   stopEvent       cfg.dma.useHipEvents is set
+  //   cfg           - configuration options
+  //   resources     - per-Transfer state; timing results are accumulated into it
+  //
+  // Returns ERR_NONE on success, otherwise the first error encountered.
+  static ErrResult ExecuteDmaTransfer(int           const  iteration,
+                                      bool          const  useSubIndices,
+                                      hipStream_t   const  stream,
+                                      hipEvent_t    const  startEvent,
+                                      hipEvent_t    const  stopEvent,
+                                      ConfigOptions const& cfg,
+                                      TransferResources&   resources)
+  {
+    auto cpuStart = std::chrono::high_resolution_clock::now();
+
+    int subIterations = 0;
+    if (!useSubIndices && !cfg.dma.useHsaCopy) {
+      if (cfg.dma.useHipEvents)
+        ERR_CHECK(hipEventRecord(startEvent, stream));
+
+      // Use hipMemcpy
+      do {
+        ERR_CHECK(hipMemcpyAsync(resources.dstMem[0], resources.srcMem[0], resources.numBytes,
+                                 hipMemcpyDefault, stream));
+      } while (++subIterations != cfg.general.numSubIterations);
+
+      if (cfg.dma.useHipEvents)
+        ERR_CHECK(hipEventRecord(stopEvent, stream));
+      ERR_CHECK(hipStreamSynchronize(stream));
+    } else {
+#if defined(__NVCC__)
+      return {ERR_FATAL, "HSA copy not supported on NVIDIA hardware"};
+#else
+      // Use HSA async copy
+      do {
+        // Arm the completion signal; the copy sets it below 1 when it finishes
+        hsa_signal_store_screlease(resources.signal, 1);
+        if (!useSubIndices) {
+          ERR_CHECK(hsa_amd_memory_async_copy(resources.dstMem[0], resources.dstAgent,
+                                              resources.srcMem[0], resources.srcAgent,
+                                              resources.numBytes, 0, NULL,
+                                              resources.signal));
+        } else {
+          // Report failures through ErrResult, the same way as the copy above,
+          // so the caller decides how to handle them
+          ERR_CHECK(hsa_amd_memory_async_copy_on_engine(resources.dstMem[0], resources.dstAgent,
+                                                        resources.srcMem[0], resources.srcAgent,
+                                                        resources.numBytes, 0, NULL,
+                                                        resources.signal,
+                                                        resources.sdmaEngineId, true));
+        }
+        // Wait for SDMA transfer to complete
+        while(hsa_signal_wait_scacquire(resources.signal,
+                                        HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
+                                        HSA_WAIT_STATE_ACTIVE) >= 1);
+      } while (++subIterations != cfg.general.numSubIterations);
+#endif
+    }
+    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
+    double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
+
+    // Only timed (non warm-up) iterations contribute to the results
+    if (iteration >= 0) {
+      double deltaMsec = cpuDeltaMsec;
+      // Event timing only exists on the hipMemcpyAsync path
+      if (!useSubIndices && !cfg.dma.useHsaCopy && cfg.dma.useHipEvents) {
+        float gpuDeltaMsec;
+        ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
+        deltaMsec = gpuDeltaMsec;
+      }
+      resources.totalDurationMsec += deltaMsec;
+      if (cfg.general.recordPerIteration)
+        resources.perIterMsec.push_back(deltaMsec);
+    }
+    return ERR_NONE;
+  }
+
+  // Run all Transfers assigned to one DMA executor for one iteration.
+  //
+  // Every Transfer is dispatched asynchronously through ExecuteDmaTransfer (which
+  // records per-Transfer timing); this function waits for all of them and adds the
+  // host-measured wall time to the executor total.
+  //
+  //   iteration - iteration number; negative values are warm-ups and are not recorded
+  //   cfg       - configuration options
+  //   exeIndex  - device index passed to hipSetDevice
+  //   exeInfo   - executor state (streams, events, per-Transfer resources)
+  //
+  // Returns ERR_NONE on success, otherwise the first error encountered.
+  static ErrResult RunDmaExecutor(int           const  iteration,
+                                  ConfigOptions const& cfg,
+                                  int           const  exeIndex,
+                                  ExeInfo&             exeInfo)
+  {
+    auto const hostBegin = std::chrono::high_resolution_clock::now();
+    ERR_CHECK(hipSetDevice(exeIndex));
+
+    // Kick off every Transfer in its own thread
+    vector<std::future<ErrResult>> pending;
+    for (size_t t = 0; t < exeInfo.resources.size(); t++) {
+      pending.emplace_back(std::async(std::launch::async,
+                                      ExecuteDmaTransfer,
+                                      iteration,
+                                      exeInfo.useSubIndices,
+                                      exeInfo.streams[t],
+                                      cfg.dma.useHipEvents ? exeInfo.startEvents[t] : NULL,
+                                      cfg.dma.useHipEvents ? exeInfo.stopEvents[t]  : NULL,
+                                      std::cref(cfg),
+                                      std::ref(exeInfo.resources[t])));
+    }
+
+    // Collect the results; the first failure aborts this executor
+    for (auto& task : pending)
+      ERR_CHECK(task.get());
+
+    auto const hostElapsed = std::chrono::high_resolution_clock::now() - hostBegin;
+    double const deltaMsec =
+      std::chrono::duration_cast<std::chrono::duration<double>>(hostElapsed).count() * 1000.0;
+
+    // Warm-up iterations do not count towards the total
+    if (iteration >= 0)
+      exeInfo.totalDurationMsec += deltaMsec;
+    return ERR_NONE;
+  }
+
+// Executor-related functions
+//========================================================================================
+  // Dispatch one iteration to the executor implementation matching exeDevice.exeType.
+  // Returns the result of that implementation, or a fatal error for unknown types.
+  static ErrResult RunExecutor(int           const  iteration,
+                               ConfigOptions const& cfg,
+                               ExeDevice     const& exeDevice,
+                               ExeInfo&             exeInfo)
+  {
+    auto const exeType = exeDevice.exeType;
+
+    if (exeType == EXE_CPU)
+      return RunCpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
+    if (exeType == EXE_GPU_GFX)
+      return RunGpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
+    if (exeType == EXE_GPU_DMA)
+      return RunDmaExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
+
+    return {ERR_FATAL, "Unsupported executor (%d)", exeDevice.exeType};
+  }
+
+} // End of anonymous namespace
+//========================================================================================
+
+  ErrResult::ErrResult(ErrType err) : errType(err), errMsg("") {};
+
+  // Convert a HIP status code: hipSuccess becomes ERR_NONE with an empty message,
+  // anything else becomes ERR_FATAL with the HIP error string attached
+  ErrResult::ErrResult(hipError_t err)
+  {
+    bool const succeeded = (err == hipSuccess);
+
+    this->errType = succeeded ? ERR_NONE : ERR_FATAL;
+    if (succeeded)
+      this->errMsg = "";
+    else
+      this->errMsg = std::string("HIP Error: ") + hipGetErrorString(err);
+  }
+
+#if !defined(__NVCC__)
+  // Convert an HSA status code: HSA_STATUS_SUCCESS becomes ERR_NONE with an empty
+  // message, anything else becomes ERR_FATAL with the HSA status string attached
+  ErrResult::ErrResult(hsa_status_t err)
+  {
+    if (err == HSA_STATUS_SUCCESS) {
+      this->errType = ERR_NONE;
+      this->errMsg  = "";
+    } else {
+      // hsa_status_string may fail (e.g. for a status it does not recognise) and
+      // leave errString untouched; never append a null pointer to a std::string
+      const char *errString = NULL;
+      hsa_status_t const lookup = hsa_status_string(err, &errString);
+      if (lookup != HSA_STATUS_SUCCESS || errString == NULL)
+        errString = "(unknown HSA status)";
+      this->errType = ERR_FATAL;
+      this->errMsg  = std::string("HSA Error: ") + errString;
+    }
+  }
+#endif
+
+  ErrResult::ErrResult(ErrType errType, const char* format, ...)
+  {
+    this->errType = errType;
+    va_list args, args_temp;
+    va_start(args, format);
+    va_copy(args_temp, args);
+
+    int len = vsnprintf(nullptr, 0, format, args);
+    if (len < 0) {
+      va_end(args_temp);
+      va_end(args);
+    } else {
+      this->errMsg.resize(len);
+      vsnprintf(this->errMsg.data(), len+1, format, args_temp);
+    }
+    va_end(args_temp);
+    va_end(args);
+  }
+
+  // Execute a set of Transfers and collect timing / bandwidth results.
+  //
+  // Steps:
+  //  1. Validate the configuration and the Transfers
+  //  2. Group Transfers by executor and prepare each executor
+  //  3. Build reference data and initialise all source buffers
+  //  4. Run warm-up and timed iterations, with all executors running in parallel
+  //  5. Validate the destination buffers
+  //  6. Summarise per-executor and per-Transfer results, then tear down executors
+  //
+  //   cfg       - configuration options
+  //   transfers - Transfers to execute
+  //   results   - receives timing results; results.errResults receives all
+  //               errors and warnings that were encountered
+  //
+  // Returns true when the run completed, false when validation failed or an
+  // error reported through ERR_APPEND aborted the run.
+  bool RunTransfers(ConfigOptions         const& cfg,
+                    std::vector<Transfer> const& transfers,
+                    TestResults&                 results)
+  {
+    // Clear all errors;
+    auto& errResults = results.errResults;
+    errResults.clear();
+
+    // Check for valid configuration
+    if (ConfigOptionsHaveErrors(cfg, errResults)) return false;
+
+    // Check for valid transfers
+    if (TransfersHaveErrors(cfg, transfers, errResults)) return false;
+
+    // Collect up transfers by executor
+    int minNumSrcs = MAX_SRCS + 1;
+    int maxNumSrcs = 0;
+    size_t maxNumBytes = 0;
+    std::map<ExeDevice, ExeInfo> executorMap;
+    for (size_t i = 0; i < transfers.size(); i++) {
+      Transfer const& t = transfers[i];
+
+      ExeInfo& exeInfo = executorMap[t.exeDevice];
+      exeInfo.totalBytes    += t.numBytes;
+      exeInfo.totalSubExecs += t.numSubExecs;
+      exeInfo.useSubIndices |= (t.exeSubIndex != -1);
+
+      TransferResources resource = {};
+      resource.transferIdx = (int)i;
+      exeInfo.resources.push_back(resource);
+
+      minNumSrcs  = std::min(minNumSrcs, (int)t.srcs.size());
+      maxNumSrcs  = std::max(maxNumSrcs, (int)t.srcs.size());
+      maxNumBytes = std::max(maxNumBytes, t.numBytes);
+    }
+
+    // Loop over each executor and prepare
+    // - Allocates memory for each Transfer
+    // - Set up work for subexecutors
+    // NOTE: transferResources ends up ordered by executor, not by Transfer index;
+    //       use TransferResources::transferIdx to map an entry back to its Transfer
+    vector<TransferResources*> transferResources;
+    for (auto& exeInfoPair : executorMap) {
+      ExeDevice const& exeDevice = exeInfoPair.first;
+      ExeInfo&         exeInfo   = exeInfoPair.second;
+      ERR_APPEND(PrepareExecutor(cfg, transfers, exeDevice, exeInfo), errResults);
+
+      for (auto& resource : exeInfo.resources) {
+        transferResources.push_back(&resource);
+      }
+    }
+
+    // Prepare reference src/dst arrays - only once for largest size
+    // dstReference[k] holds the expected output for a Transfer with k sources
+    // (k == 0 is the memset pattern, otherwise the sum of the first k sources)
+    size_t maxN = maxNumBytes / sizeof(float);
+    vector<float> outputBuffer(maxN);
+    vector<vector<float>> dstReference(maxNumSrcs + 1, vector<float>(maxN));
+    {
+      vector<vector<float>> srcReference(maxNumSrcs, vector<float>(maxN));
+      // Fill exactly the allocated elements so the write can never exceed the buffer
+      memset(dstReference[0].data(), MEMSET_CHAR, maxN * sizeof(float));
+
+      for (int numSrcs = 0; numSrcs < maxNumSrcs; numSrcs++) {
+        PrepareReference(cfg, srcReference[numSrcs], numSrcs);
+        for (size_t i = 0; i < maxN; i++) {
+          dstReference[numSrcs+1][i] = (numSrcs == 0 ? 0 : dstReference[numSrcs][i]) + srcReference[numSrcs][i];
+        }
+      }
+      // Release un-used partial sums (swap with an empty vector so that the
+      // storage is actually returned, which clear() alone does not guarantee)
+      for (int numSrcs = 0; numSrcs < minNumSrcs; numSrcs++)
+        vector<float>().swap(dstReference[numSrcs]);
+
+      // Initialize all src memory buffers
+      for (auto resource : transferResources) {
+        for (size_t srcIdx = 0; srcIdx < resource->srcMem.size(); srcIdx++) {
+          ERR_APPEND(hipMemcpy(resource->srcMem[srcIdx], srcReference[srcIdx].data(), resource->numBytes,
+                               hipMemcpyDefault), errResults);
+        }
+      }
+    }
+
+    // Pause before starting when running in iteractive mode
+    if (cfg.general.useInteractive) {
+      printf("Memory prepared:\n");
+
+      // transferResources is ordered by executor, so build a lookup keyed by
+      // Transfer index before printing the buffers of Transfer i
+      vector<TransferResources const*> resourcesByTransfer(transfers.size(), nullptr);
+      for (auto resource : transferResources)
+        resourcesByTransfer[resource->transferIdx] = resource;
+
+      for (size_t i = 0; i < transfers.size(); i++) {
+        TransferResources const* resource = resourcesByTransfer[i];
+        printf("Transfer %03d:\n", (int)i);
+        if (resource == nullptr) continue;
+        for (size_t iSrc = 0; iSrc < resource->srcMem.size(); ++iSrc)
+          printf("  SRC %0d: %p\n", (int)iSrc, resource->srcMem[iSrc]);
+        for (size_t iDst = 0; iDst < resource->dstMem.size(); ++iDst)
+          printf("  DST %0d: %p\n", (int)iDst, resource->dstMem[iDst]);
+      }
+      printf("Hit <Enter> to continue: ");
+      if (scanf("%*c") != 0) {
+        printf("[ERROR] Unexpected input\n");
+        exit(1);
+      }
+      printf("\n");
+    }
+
+    // Perform iterations
+    // Negative iteration numbers are warm-ups; a negative cfg.general.numIterations
+    // means "run timed iterations until that many seconds have elapsed"
+    // NOTE(review): numIterations == 0 would never terminate this loop; presumably
+    //               rejected by ConfigOptionsHaveErrors - confirm
+    size_t numTimedIterations = 0;
+    double totalCpuTimeSec = 0.0;
+    for (int iteration = -cfg.general.numWarmups; ; iteration++) {
+      // Stop if number of iterations/seconds has reached limit
+      if (cfg.general.numIterations > 0 && iteration >= cfg.general.numIterations) break;
+      if (cfg.general.numIterations < 0 && totalCpuTimeSec > -cfg.general.numIterations) break;
+
+
+      // Start CPU timing for this iteration
+      auto cpuStart = std::chrono::high_resolution_clock::now();
+
+      // Execute all Transfers in parallel
+      std::vector<std::future<ErrResult>> asyncExecutors;
+      for (auto& exeInfoPair : executorMap) {
+        asyncExecutors.emplace_back(std::async(std::launch::async, RunExecutor,
+                                               iteration,
+                                               std::cref(cfg),
+                                               std::cref(exeInfoPair.first),
+                                               std::ref(exeInfoPair.second)));
+      }
+
+      // Wait for all threads to finish
+      for (auto& asyncExecutor : asyncExecutors) {
+        ERR_APPEND(asyncExecutor.get(), errResults);
+      }
+
+      // Stop CPU timing for this iteration
+      auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
+      double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
+
+      if (cfg.data.alwaysValidate) {
+        ERR_APPEND(ValidateAllTransfers(cfg, transfers, transferResources, dstReference, outputBuffer),
+                   errResults);
+      }
+
+      if (iteration >= 0) {
+        ++numTimedIterations;
+        totalCpuTimeSec += deltaSec;
+      }
+    }
+
+    // Pause for interactive mode
+    if (cfg.general.useInteractive) {
+      printf("Transfers complete. Hit <Enter> to continue: ");
+      if (scanf("%*c") != 0)  {
+        printf("[ERROR] Unexpected input\n");
+        exit(1);
+      }
+      printf("\n");
+    }
+
+    // Validate results (already done after every iteration when alwaysValidate is set)
+    if (!cfg.data.alwaysValidate) {
+      ERR_APPEND(ValidateAllTransfers(cfg, transfers, transferResources, dstReference, outputBuffer),
+                 errResults);
+    }
+
+    // Prepare results
+    results.exeResults.clear();
+    results.tfrResults.clear();
+    results.tfrResults.resize(transfers.size());
+    results.numTimedIterations = numTimedIterations;
+    results.totalBytesTransferred = 0;
+    results.avgTotalDurationMsec = (totalCpuTimeSec * 1000.0) / numTimedIterations;
+    results.overheadMsec = 0.0;
+    for (auto& exeInfoPair : executorMap) {
+      ExeDevice const& exeDevice = exeInfoPair.first;
+      ExeInfo&         exeInfo   = exeInfoPair.second;
+
+      // Copy over executor results
+      ExeResult& exeResult = results.exeResults[exeDevice];
+      exeResult.numBytes = exeInfo.totalBytes;
+      exeResult.avgDurationMsec = exeInfo.totalDurationMsec / numTimedIterations;
+      // bytes / 1e6 / msec == GB/s
+      exeResult.avgBandwidthGbPerSec = (exeResult.numBytes / 1.0e6) /  exeResult.avgDurationMsec;
+      exeResult.sumBandwidthGbPerSec = 0.0;
+      exeResult.transferIdx.clear();
+      results.totalBytesTransferred += exeInfo.totalBytes;
+      // Overhead is the largest gap between the overall time and any executor's own time
+      results.overheadMsec = std::max(results.overheadMsec, (results.avgTotalDurationMsec -
+                                                             exeResult.avgDurationMsec));
+
+      // Copy over transfer results
+      for (auto const& resources : exeInfo.resources) {
+        int const transferIdx = resources.transferIdx;
+        TransferResult& tfrResult = results.tfrResults[transferIdx];
+        exeResult.transferIdx.push_back(transferIdx);
+        tfrResult.numBytes = resources.numBytes;
+        tfrResult.avgDurationMsec = resources.totalDurationMsec / numTimedIterations;
+        tfrResult.avgBandwidthGbPerSec = (resources.numBytes / 1.0e6) / tfrResult.avgDurationMsec;
+        if (cfg.general.recordPerIteration) {
+          tfrResult.perIterMsec = resources.perIterMsec;
+          tfrResult.perIterCUs  = resources.perIterCUs;
+        }
+        exeResult.sumBandwidthGbPerSec += tfrResult.avgBandwidthGbPerSec;
+      }
+    }
+    results.avgTotalBandwidthGbPerSec = (results.totalBytesTransferred / 1.0e6) / results.avgTotalDurationMsec;
+
+    // Teardown executors
+    for (auto& exeInfoPair : executorMap) {
+      ExeDevice const& exeDevice = exeInfoPair.first;
+      ExeInfo&         exeInfo   = exeInfoPair.second;
+      ERR_APPEND(TeardownExecutor(cfg, exeDevice, transfers, exeInfo), errResults);
+    }
+
+    return true;
+  }
+
+  // Look up an integer attribute of the library.
+  // Returns the attribute's value, or -1 for an attribute that is not recognised.
+  int GetIntAttribute(IntAttribute attribute)
+  {
+    if (attribute == ATR_GFX_MAX_BLOCKSIZE) return MAX_BLOCKSIZE;
+    if (attribute == ATR_GFX_MAX_UNROLL)    return MAX_UNROLL;
+    return -1;
+  }
+
+  // Look up a string attribute of the library.
+  // Returns the attribute's text, or an empty string for an attribute that is not recognised.
+  std::string GetStrAttribute(StrAttribute attribute)
+  {
+    if (attribute == ATR_SRC_PREP_DESCRIPTION) {
+      // Human-readable description of the pattern used to fill source buffers
+      return "Element i = ((i * 517) modulo 383 + 31) * (srcBufferIdx + 1)";
+    }
+    return "";
+  }
+
+  // Parse one line of text describing a set of Transfers.
+  //
+  // Accepted layouts (round brackets and "->" are treated as whitespace):
+  //   simple   :  N  numSubExecs  (SRC EXE DST) x N
+  //   advanced : -N  (SRC EXE DST numSubExecs numBytes[K|M|G]) x N
+  //
+  //   line      - text to parse (taken by value because it is modified while parsing)
+  //   transfers - cleared, then filled with the parsed Transfers
+  //
+  // Returns ERR_NONE on success, and also when the line does not start with a
+  // number (such lines are ignored); returns ERR_FATAL on malformed input.
+  ErrResult ParseTransfers(std::string            line,
+                           std::vector<Transfer>& transfers)
+  {
+    // Replace any round brackets or '->' with spaces.
+    // Index 0 is skipped on purpose so that a leading '-' (which selects
+    // advanced mode) survives; bounding by size() keeps an empty line safe.
+    for (size_t i = 1; i < line.size(); i++) {
+      char& c = line[i];
+      if (c == '(' || c == ')' || c == '-' || c == '>') c = ' ';
+    }
+
+    transfers.clear();
+
+    // Read in number of transfers
+    int numTransfers = 0;
+    std::istringstream iss(line);
+    iss >> numTransfers;
+    if (iss.fail()) return ERR_NONE;
+
+    // If numTransfers < 0, read 5-tuple (srcMem, exeMem, dstMem, #CUs, #Bytes)
+    // otherwise read triples (srcMem, exeMem, dstMem)
+    bool const advancedMode = (numTransfers < 0);
+    numTransfers = abs(numTransfers);
+
+    int numSubExecs = 0;
+    std::string srcStr, exeStr, dstStr, numBytesToken;
+
+    // In simple mode a single subexecutor count applies to every Transfer
+    if (!advancedMode) {
+      iss >> numSubExecs;
+      if (numSubExecs < 0 || iss.fail()) {
+        return {ERR_FATAL,
+                "Parsing error: Number of blocks to use (%d) must be non-negative", numSubExecs};
+      }
+    }
+
+    for (int i = 0; i < numTransfers; i++) {
+      Transfer transfer;
+
+      if (!advancedMode) {
+        iss >> srcStr >> exeStr >> dstStr;
+        transfer.numSubExecs = numSubExecs;
+        if (iss.fail()) {
+          return {ERR_FATAL,
+                  "Parsing error: Unable to read valid Transfer %d (SRC EXE DST) triplet", i+1};
+        }
+      } else {
+        iss >> srcStr >> exeStr >> dstStr >> transfer.numSubExecs >> numBytesToken;
+        if (iss.fail()) {
+          return {ERR_FATAL,
+                  "Parsing error: Unable to read valid Transfer %d (SRC EXE DST #CU #Bytes) tuple", i+1};
+        }
+        if (sscanf(numBytesToken.c_str(), "%lu", &transfer.numBytes) != 1) {
+          return {ERR_FATAL,
+                  "Parsing error: Unable to read valid Transfer %d (SRC EXE DST #CU #Bytes) tuple", i+1};
+        }
+
+        // Apply an optional K/M/G suffix; each case deliberately falls through so
+        // that 'G' multiplies by 1024 three times, 'M' twice and 'K' once
+        char const units = numBytesToken.back();
+        switch (toupper(static_cast<unsigned char>(units))) {
+        case 'G': transfer.numBytes *= 1024; // fall through
+        case 'M': transfer.numBytes *= 1024; // fall through
+        case 'K': transfer.numBytes *= 1024; break;
+        default:  break;
+        }
+      }
+
+      ERR_CHECK(ParseMemType(srcStr, transfer.srcs));
+      ERR_CHECK(ParseMemType(dstStr, transfer.dsts));
+      ERR_CHECK(ParseExeType(exeStr, transfer.exeDevice, transfer.exeSubIndex));
+
+      transfers.push_back(transfer);
+    }
+    return ERR_NONE;
+  }
+
+  // Number of executors available for the given executor type:
+  //  - CPU            : number of configured NUMA nodes
+  //  - GPU (GFX / DMA): number of devices reported by HIP (0 if the query fails)
+  //  - anything else  : 0
+  int GetNumExecutors(ExeType exeType)
+  {
+    if (exeType == EXE_CPU)
+      return numa_num_configured_nodes();
+
+    if (exeType == EXE_GPU_GFX || exeType == EXE_GPU_DMA) {
+      int gpuCount = 0;
+      if (hipGetDeviceCount(&gpuCount) != hipSuccess)
+        gpuCount = 0;
+      return gpuCount;
+    }
+
+    return 0;
+  }
+
+  // Number of subexecutors available on the given executor:
+  //  - CPU     : number of configured CPUs that belong to NUMA node exeIndex
+  //  - GPU GFX : multiprocessor count of device exeIndex (0 if the index is
+  //              out of range or the query fails)
+  //  - GPU DMA : always 1
+  //  - other   : 0
+  int GetNumSubExecutors(ExeDevice exeDevice)
+  {
+    int const exeIndex = exeDevice.exeIndex;
+
+    if (exeDevice.exeType == EXE_CPU) {
+      int coresOnNode = 0;
+      int const cpuCount = numa_num_configured_cpus();
+      for (int cpu = 0; cpu < cpuCount; cpu++) {
+        if (numa_node_of_cpu(cpu) == exeIndex)
+          coresOnNode++;
+      }
+      return coresOnNode;
+    }
+
+    if (exeDevice.exeType == EXE_GPU_GFX) {
+      int const gpuCount = GetNumExecutors(EXE_GPU_GFX);
+      if (exeIndex < 0 || exeIndex >= gpuCount)
+        return 0;
+
+      int cuCount = 0;
+      if (hipDeviceGetAttribute(&cuCount, hipDeviceAttributeMultiprocessorCount, exeIndex) != hipSuccess)
+        cuCount = 0;
+      return cuCount;
+    }
+
+    if (exeDevice.exeType == EXE_GPU_DMA)
+      return 1;
+
+    return 0;
+  }
+
+  // Number of sub-indices that can be addressed on the given executor:
+  //  - CPU     : 0
+  //  - GPU GFX : number of XCCs reported by HSA (1 if that query fails)
+  //  - GPU DMA : total of the SDMA and XGMI SDMA engine counts reported by HSA
+  //  - other   : 0
+  // Returns 0 when the HSA agent for the executor cannot be obtained, and always
+  // returns 0 when compiled with NVCC.
+  int GetNumExecutorSubIndices(ExeDevice exeDevice)
+  {
+    // Executor subindices are not supported on NVIDIA hardware
+#if defined(__NVCC__)
+    return 0;
+#else
+    switch(exeDevice.exeType) {
+    case EXE_CPU: return 0;
+    case EXE_GPU_GFX:
+    {
+      hsa_agent_t agent;
+      ErrResult err = GetHsaAgent(exeDevice, agent);
+      if (err.errType != ERR_NONE) return 0;
+      int numXccs = 1;
+      if (hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_XCC, &numXccs) != HSA_STATUS_SUCCESS)
+        return 1;
+      return numXccs;
+    }
+    case EXE_GPU_DMA:
+    {
+      // Get HSA agent for this GPU
+      hsa_agent_t agent;
+      ErrResult err = GetHsaAgent(exeDevice, agent);
+      if (err.errType != ERR_NONE) return 0;
+
+      // Each engine class is queried separately; a failed query contributes nothing
+      int numTotalEngines = 0, numEnginesA = 0, numEnginesB = 0;
+      if (hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SDMA_ENG, &numEnginesA)
+          == HSA_STATUS_SUCCESS)
+        numTotalEngines += numEnginesA;
+      if (hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SDMA_XGMI_ENG, &numEnginesB)
+          == HSA_STATUS_SUCCESS)
+        numTotalEngines += numEnginesB;
+
+      return numTotalEngines;
+    }
+    default:
+      return 0;
+    }
+#endif
+  }
+
+  // Find the CPU executor index whose HSA agent is reported as nearest to the given GPU.
+  // Returns that index, or -1 if it cannot be determined (always -1 when compiled with NVCC).
+  int GetClosestCpuNumaToGpu(int gpuIndex)
+  {
+    // Closest NUMA is not supported on NVIDIA hardware at this time
+#if defined(__NVCC__)
+    return -1;
+#else
+    hsa_agent_t gpuAgent;
+    ErrResult err = GetHsaAgent({EXE_GPU_GFX, gpuIndex}, gpuAgent);
+    if (err.errType != ERR_NONE) return -1;
+
+    // Ask HSA which CPU agent is nearest to this GPU
+    hsa_agent_t nearestCpuAgent;
+    hsa_status_t const status =
+      hsa_agent_get_info(gpuAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NEAREST_CPU, &nearestCpuAgent);
+    if (status != HSA_STATUS_SUCCESS) return -1;
+
+    // Translate the agent handle back into a CPU executor index
+    int const cpuCount = GetNumExecutors(EXE_CPU);
+    for (int cpuIdx = 0; cpuIdx < cpuCount; cpuIdx++) {
+      hsa_agent_t candidate;
+      err = GetHsaAgent({EXE_CPU, cpuIdx}, candidate);
+      if (err.errType != ERR_NONE) return -1;
+      if (candidate.handle == nearestCpuAgent.handle) return cpuIdx;
+    }
+    return -1;
+#endif
+  }
+
+// Undefine CUDA compatibility macros (this section only applies when compiling with NVCC)
+#if defined(__NVCC__)
+
+// ROCm-specific names
+#undef wall_clock64
+#undef gcnArchName
+
+// Datatype names
+#undef hipDeviceProp_t
+#undef hipError_t
+#undef hipEvent_t
+#undef hipStream_t
+
+// Enumeration value names
+#undef hipDeviceAttributeClockRate
+#undef hipDeviceAttributeMaxSharedMemoryPerMultiprocessor
+#undef hipDeviceAttributeMultiprocessorCount
+#undef hipErrorPeerAccessAlreadyEnabled
+#undef hipFuncCachePreferShared
+#undef hipMemcpyDefault
+#undef hipMemcpyDeviceToHost
+#undef hipMemcpyHostToDevice
+#undef hipSuccess
+
+// Function names
+#undef hipDeviceCanAccessPeer
+#undef hipDeviceEnablePeerAccess
+#undef hipDeviceGetAttribute
+#undef hipDeviceGetPCIBusId
+#undef hipDeviceSetCacheConfig
+#undef hipDeviceSynchronize
+#undef hipEventCreate
+#undef hipEventDestroy
+#undef hipEventElapsedTime
+#undef hipEventRecord
+#undef hipFree
+#undef hipGetDeviceCount
+#undef hipGetDeviceProperties
+#undef hipGetErrorString
+#undef hipHostFree
+#undef hipHostMalloc
+#undef hipMalloc
+#undef hipMallocManaged
+#undef hipMemcpy
+#undef hipMemcpyAsync
+#undef hipMemset
+#undef hipMemsetAsync
+#undef hipSetDevice
+#undef hipStreamCreate
+#undef hipStreamDestroy
+#undef hipStreamSynchronize
+#endif
+
+// Kernel helper macros (undefined regardless of compiler)
+#undef GetHwId
+#undef GetXccId
+
+// Undefine error-handling helper macros
+#undef ERR_CHECK
+#undef ERR_APPEND
+}
--- a/src/include/Compatibility.hpp
+++ b/src/include/Compatibility.hpp
-/*
-Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#pragma once
-
-// Helper macro for catching HIP errors
-// Evaluates cmd exactly once.  If the result is not hipSuccess, the HIP error
-// string plus the source line / file of the call site are written to std::cerr
-// and the process terminates via exit(-1).
-// The do { ... } while (0) wrapper makes the expansion behave as a single statement.
-#define HIP_CALL(cmd)                                                                   \
-    do {                                                                                \
-        hipError_t error = (cmd);                                                       \
-        if (error != hipSuccess)                                                        \
-        {                                                                               \
-            std::cerr << "Encountered HIP error (" << hipGetErrorString(error)          \
-                      << ") at line " << __LINE__ << " in file " << __FILE__ << "\n";   \
-            exit(-1);                                                                   \
-        }                                                                               \
-    } while (0)
-
-#if defined(__NVCC__)
-
-#include <cuda_runtime.h>
-
-// ROCm specific
-#define wall_clock64                                       clock64
-#define gcnArchName                                        name
-
-// Datatypes
-#define hipDeviceProp_t                                    cudaDeviceProp
-#define hipError_t                                         cudaError_t
-#define hipEvent_t                                         cudaEvent_t
-#define hipStream_t                                        cudaStream_t
-
-// Enumerations
-#define hipDeviceAttributeClockRate                        cudaDevAttrClockRate
-#define hipDeviceAttributeMaxSharedMemoryPerMultiprocessor cudaDevAttrMaxSharedMemoryPerMultiprocessor
-#define hipDeviceAttributeMultiprocessorCount              cudaDevAttrMultiProcessorCount
-#define hipErrorPeerAccessAlreadyEnabled                   cudaErrorPeerAccessAlreadyEnabled
-#define hipFuncCachePreferShared                           cudaFuncCachePreferShared
-#define hipMemcpyDefault                                   cudaMemcpyDefault
-#define hipMemcpyDeviceToHost                              cudaMemcpyDeviceToHost
-#define hipMemcpyHostToDevice                              cudaMemcpyHostToDevice
-#define hipSuccess                                         cudaSuccess
-
-// Functions
-#define hipDeviceCanAccessPeer                             cudaDeviceCanAccessPeer
-#define hipDeviceEnablePeerAccess                          cudaDeviceEnablePeerAccess
-#define hipDeviceGetAttribute                              cudaDeviceGetAttribute
-#define hipDeviceGetPCIBusId                               cudaDeviceGetPCIBusId
-#define hipDeviceSetCacheConfig                            cudaDeviceSetCacheConfig
-#define hipDeviceSynchronize                               cudaDeviceSynchronize
-#define hipEventCreate                                     cudaEventCreate
-#define hipEventDestroy                                    cudaEventDestroy
-#define hipEventElapsedTime                                cudaEventElapsedTime
-#define hipEventRecord                                     cudaEventRecord
-#define hipFree                                            cudaFree
-#define hipGetDeviceCount                                  cudaGetDeviceCount
-#define hipGetDeviceProperties                             cudaGetDeviceProperties
-#define hipGetErrorString                                  cudaGetErrorString
-#define hipHostFree                                        cudaFreeHost
-#define hipHostMalloc                                      cudaMallocHost
-#define hipMalloc                                          cudaMalloc
-#define hipMallocManaged                                   cudaMallocManaged
-#define hipMemcpy                                          cudaMemcpy
-#define hipMemcpyAsync                                     cudaMemcpyAsync
-#define hipMemset                                          cudaMemset
-#define hipMemsetAsync                                     cudaMemsetAsync
-#define hipSetDevice                                       cudaSetDevice
-#define hipStreamCreate                                    cudaStreamCreate
-#define hipStreamDestroy                                   cudaStreamDestroy
-#define hipStreamSynchronize                               cudaStreamSynchronize
-
-// Component-wise in-place addition for float4, provided for the NVIDIA platform.
-// Adds each field of b onto the matching field of a and returns a.
-__device__ inline float4& operator +=(float4& a, const float4& b)
-{
-  a.x = a.x + b.x;
-  a.y = a.y + b.y;
-  a.z = a.z + b.z;
-  a.w = a.w + b.w;
-  return a;
-}
-
-#else
-
-#include <hip/hip_ext.h>
-#include <hip/hip_runtime.h>
-#include <hsa/hsa_ext_amd.h>
-
-#endif
--- a/src/include/EnvVars.hpp
+++ b/src/include/EnvVars.hpp
-/*
-Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#ifndef ENVVARS_HPP
-#define ENVVARS_HPP
-
-#include <algorithm>
-#include <random>
-#include <time.h>
-#include "Compatibility.hpp"
-#include "Kernels.hpp"
-
-#define TB_VERSION "1.52"
-
-extern char const MemTypeStr[];
-extern char const ExeTypeStr[];
-
-// Identifies which configuration mode is active (stored in EnvVars::configMode).
-// NOTE(review): the enumerator names suggest one value per preset
-// (config file, P2P, sweep, ...) - confirm against the code that sets configMode.
-enum ConfigModeEnum
-{
-  CFG_FILE   = 0,
-  CFG_P2P    = 1,
-  CFG_SWEEP  = 2,
-  CFG_SCALE  = 3,
-  CFG_A2A    = 4,
-  CFG_SCHMOO = 5,
-  CFG_RWRITE = 6
-};
-
-// Block ordering choices.  The numeric values match the ones accepted by the
-// BLOCK_ORDER environment variable (0=Sequential, 1=Interleaved, 2=Random).
-enum BlockOrderEnum
-{
-  ORDER_SEQUENTIAL  = 0,
-  ORDER_INTERLEAVED = 1,
-  ORDER_RANDOM      = 2
-};
-
-// This class manages the environment variables that affect TransferBench
-class EnvVars
-{
-public:
-  // Default configuration values
-  int const DEFAULT_NUM_WARMUPS       =  3;
-  int const DEFAULT_NUM_ITERATIONS    = 10;
-  int const DEFAULT_SAMPLING_FACTOR   =  1;
-
-  // Peer-to-peer Benchmark preset defaults
-  int const DEFAULT_P2P_NUM_CPU_SE    = 4;
-
-  // Sweep-preset defaults
-  std::string const DEFAULT_SWEEP_SRC = "CG";
-  std::string const DEFAULT_SWEEP_EXE = "CDG";
-  std::string const DEFAULT_SWEEP_DST = "CG";
-  int const DEFAULT_SWEEP_MIN         = 1;
-  int const DEFAULT_SWEEP_MAX         = 24;
-  int const DEFAULT_SWEEP_TEST_LIMIT  = 0;
-  int const DEFAULT_SWEEP_TIME_LIMIT  = 0;
-
-  // Environment variables
-  int alwaysValidate;    // Validate after each iteration instead of once after all iterations
-  int blockBytes;        // Each subexecutor, except the last, gets a multiple of this many bytes to copy
-  int blockOrder;        // How blocks are ordered in single-stream mode (0=Sequential, 1=Interleaved, 2=Random)
-  int byteOffset;        // Byte-offset for memory allocations
-  int continueOnError;   // Continue tests even after mismatch detected
-  int gfxBlockSize;      // Size of each threadblock (must be multiple of 64)
-  int gfxSingleTeam;     // Team all subExecutors across the data array
-  int gfxUnroll;         // GFX-kernel unroll factor
-  int gfxWaveOrder;      // GFX-kernel wavefront ordering
-  int hideEnv;           // Skip printing environment variables
-  int minNumVarSubExec;  // Minimum # of subexecutors to use for variable subExec Transfers
-  int maxNumVarSubExec;  // Maximum # of subexecutors to use for variable subExec Transfers (0 to use device limit)
-  int numCpuDevices;     // Number of CPU devices to use (defaults to # NUMA nodes detected)
-  int numGpuDevices;     // Number of GPU devices to use (defaults to # HIP devices detected)
-  int numIterations;     // Number of timed iterations to perform.  If negative, run for -numIterations seconds instead
-  int numSubIterations;  // Number of subiterations to perform
-  int numWarmups;        // Number of un-timed warmup iterations to perform
-  int outputToCsv;       // Output in CSV format
-  int samplingFactor;    // Affects how many different values of N are generated (when N set to 0)
-  int sharedMemBytes;    // Amount of shared memory to use per threadblock
-  int showIterations;    // Show per-iteration timing info
-  int useHsaDma;         // Use hsa_amd_async_copy instead of hipMemcpy for non-targeted DMA executions
-  int useInteractive;    // Pause for user-input before starting transfer loop
-  int usePcieIndexing;   // Base GPU indexing on PCIe address instead of HIP device
-  int usePrepSrcKernel;  // Use GPU kernel to prepare source data instead of copy (can't be used with fillPattern)
-  int useSingleStream;   // Use a single stream per GPU GFX executor instead of stream per Transfer
-  int useXccFilter;      // Use XCC filtering (experimental)
-  int validateDirect;    // Validate GPU destination memory directly instead of staging GPU memory on host
-
-  std::vector<float> fillPattern; // Pattern of floats used to fill source data
-  std::vector<uint32_t> cuMask;   // Bit-vector representing the CU mask
-  // Preferred XCC per [src GPU][dst GPU] pair, parsed from XCC_PREF_TABLE (-1 when not specified)
-  std::vector<std::vector<int>> prefXccTable;
-
-  // Environment variables only for P2P preset
-  int numCpuSubExecs;    // Number of CPU subexecutors to use
-  int numGpuSubExecs;    // Number of GPU subexecutors to use
-  int p2pMode;           // Both = 0, Unidirectional = 1, Bidirectional = 2
-  int useDmaCopy;        // Use DMA copy instead of GPU copy
-  int useRemoteRead;     // Use destination memory type as executor instead of source memory type
-  int useFineGrain;      // Use fine-grained memory
-
-  // Environment variables only for Sweep-preset
-  int sweepMin;          // Min number of simultaneous Transfers to be executed per test
-  int sweepMax;          // Max number of simultaneous Transfers to be executed per test
-  int sweepTestLimit;    // Max number of tests to run during sweep (0 = no limit)
-  int sweepTimeLimit;    // Max number of seconds to run sweep for  (0 = no limit)
-  int sweepXgmiMin;      // Min number of XGMI hops for Transfers
-  int sweepXgmiMax;      // Max number of XGMI hops for Transfers (-1 = no limit)
-  int sweepSeed;         // Random seed to use
-  int sweepRandBytes;    // Whether or not to use random number of bytes per Transfer
-  std::string sweepSrc;  // Set of src memory types to be swept
-  std::string sweepExe;  // Set of executors to be swept
-  std::string sweepDst;  // Set of dst memory types to be swept
-
-  // Environment variables only for A2A preset
-  int a2aDirect;         // Only execute on links that are directly connected
-  int a2aMode;           // Perform 0=copy, 1=read only, 2 = write only
-
-  // Developer features
-  int enableDebug;       // Enable debug output
-  int gpuMaxHwQueues;    // Tracks GPU_MAX_HW_QUEUES environment variable
-
-  // Used to track current configuration mode
-  ConfigModeEnum configMode;
-
-  // Random generator (allocated by the constructor and seeded with sweepSeed)
-  std::default_random_engine *generator;
-
-  // Track how many CPUs are available per NUMA node
-  std::vector<int> numCpusPerNuma;
-
-  // Wall clock rate per detected GPU (filled in by the constructor based on the device architecture)
-  std::vector<int> wallClockPerDeviceMhz;
-
-  // Set of XCC IDs reported by CollectXccIdsKernel for each GPU in use
-  std::vector<std::set<int>> xccIdsPerDevice;
-
-  // Constructor that collects values
-  // - Queries HIP device 0 and the NUMA topology to derive defaults
-  // - Reads every supported environment variable
-  // - Parses the structured variables (FILL_PATTERN, CU_MASK, XCC_PREF_TABLE)
-  // - Validates the resulting configuration
-  // An invalid setting prints an [ERROR] line and terminates via exit(1);
-  // a failing HIP call terminates the process through HIP_CALL.
-  EnvVars()
-  {
-    int maxSharedMemBytes = 0;
-    HIP_CALL(hipDeviceGetAttribute(&maxSharedMemBytes,
-                                   hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, 0));
-#if !defined(__NVCC__)
-    int defaultSharedMemBytes = maxSharedMemBytes / 2 + 1;
-#else
-    int defaultSharedMemBytes = 0;
-#endif
-
-    int numDeviceCUs = 0;
-    HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, 0));
-
-    int numDetectedCpus = numa_num_configured_nodes();
-    int numDetectedGpus;
-    HIP_CALL(hipGetDeviceCount(&numDetectedGpus));
-
-    hipDeviceProp_t prop;
-    HIP_CALL(hipGetDeviceProperties(&prop, 0));
-    std::string fullName = prop.gcnArchName;
-    std::string archName = fullName.substr(0, fullName.find(':'));
-
-    // Different hardware pick different GPU kernels
-    // This performance difference is generally only noticeable when executing fewer CUs
-    int defaultGfxUnroll = 4;
-    if      (archName == "gfx906") defaultGfxUnroll = 8;
-    else if (archName == "gfx90a") defaultGfxUnroll = 8;
-    else if (archName == "gfx940") defaultGfxUnroll = 6;
-    else if (archName == "gfx941") defaultGfxUnroll = 6;
-    else if (archName == "gfx942") defaultGfxUnroll = 4;
-
-    alwaysValidate    = GetEnvVar("ALWAYS_VALIDATE"     , 0);
-    blockBytes        = GetEnvVar("BLOCK_BYTES"         , 256);
-    blockOrder        = GetEnvVar("BLOCK_ORDER"         , 0);
-    byteOffset        = GetEnvVar("BYTE_OFFSET"         , 0);
-    continueOnError   = GetEnvVar("CONTINUE_ON_ERROR"   , 0);
-    gfxBlockSize      = GetEnvVar("GFX_BLOCK_SIZE"      , 256);
-    gfxSingleTeam     = GetEnvVar("GFX_SINGLE_TEAM"     , 1);
-    gfxUnroll         = GetEnvVar("GFX_UNROLL"          , defaultGfxUnroll);
-    gfxWaveOrder      = GetEnvVar("GFX_WAVE_ORDER"      , 0);
-    hideEnv           = GetEnvVar("HIDE_ENV"            , 0);
-    minNumVarSubExec  = GetEnvVar("MIN_VAR_SUBEXEC"     , 1);
-    maxNumVarSubExec  = GetEnvVar("MAX_VAR_SUBEXEC"     , 0);
-    numCpuDevices     = GetEnvVar("NUM_CPU_DEVICES"     , numDetectedCpus);
-    numGpuDevices     = GetEnvVar("NUM_GPU_DEVICES"     , numDetectedGpus);
-    numIterations     = GetEnvVar("NUM_ITERATIONS"      , DEFAULT_NUM_ITERATIONS);
-    numSubIterations  = GetEnvVar("NUM_SUBITERATIONS"   , 1);
-    numWarmups        = GetEnvVar("NUM_WARMUPS"         , DEFAULT_NUM_WARMUPS);
-    outputToCsv       = GetEnvVar("OUTPUT_TO_CSV"       , 0);
-    samplingFactor    = GetEnvVar("SAMPLING_FACTOR"     , DEFAULT_SAMPLING_FACTOR);
-    sharedMemBytes    = GetEnvVar("SHARED_MEM_BYTES"    , defaultSharedMemBytes);
-    showIterations    = GetEnvVar("SHOW_ITERATIONS"     , 0);
-    useHsaDma         = GetEnvVar("USE_HSA_DMA"         , 0);
-    useInteractive    = GetEnvVar("USE_INTERACTIVE"     , 0);
-    usePcieIndexing   = GetEnvVar("USE_PCIE_INDEX"      , 0);
-    usePrepSrcKernel  = GetEnvVar("USE_PREP_KERNEL"     , 0);
-    useSingleStream   = GetEnvVar("USE_SINGLE_STREAM"   , 1);
-    useXccFilter      = GetEnvVar("USE_XCC_FILTER"      , 0);
-    validateDirect    = GetEnvVar("VALIDATE_DIRECT"     , 0);
-    enableDebug       = GetEnvVar("DEBUG"               , 0);
-    gpuMaxHwQueues    = GetEnvVar("GPU_MAX_HW_QUEUES"   , 4);
-
-    // P2P Benchmark related
-    useDmaCopy        = GetEnvVar("USE_GPU_DMA"         , 0); // Needed for numGpuSubExec
-
-    numCpuSubExecs    = GetEnvVar("NUM_CPU_SE"          , DEFAULT_P2P_NUM_CPU_SE);
-    numGpuSubExecs    = GetEnvVar("NUM_GPU_SE"          , useDmaCopy ? 1 : numDeviceCUs);
-    p2pMode           = GetEnvVar("P2P_MODE"            , 0);
-    useRemoteRead     = GetEnvVar("USE_REMOTE_READ"     , 0);
-    useFineGrain      = GetEnvVar("USE_FINE_GRAIN"      , 0);
-
-    // Sweep related
-    sweepMin          = GetEnvVar("SWEEP_MIN"           , DEFAULT_SWEEP_MIN);
-    sweepMax          = GetEnvVar("SWEEP_MAX"           , DEFAULT_SWEEP_MAX);
-    sweepSrc          = GetEnvVar("SWEEP_SRC"           , DEFAULT_SWEEP_SRC);
-    sweepExe          = GetEnvVar("SWEEP_EXE"           , DEFAULT_SWEEP_EXE);
-    sweepDst          = GetEnvVar("SWEEP_DST"           , DEFAULT_SWEEP_DST);
-    sweepTestLimit    = GetEnvVar("SWEEP_TEST_LIMIT"    , DEFAULT_SWEEP_TEST_LIMIT);
-    sweepTimeLimit    = GetEnvVar("SWEEP_TIME_LIMIT"    , DEFAULT_SWEEP_TIME_LIMIT);
-    sweepXgmiMin      = GetEnvVar("SWEEP_XGMI_MIN"      , 0);
-    sweepXgmiMax      = GetEnvVar("SWEEP_XGMI_MAX"      , -1);
-    sweepRandBytes    = GetEnvVar("SWEEP_RAND_BYTES"    , 0);
-
-    // A2A Benchmark related
-    a2aDirect         = GetEnvVar("A2A_DIRECT"          , 1);
-    a2aMode           = GetEnvVar("A2A_MODE"            , 0);
-
-    // Determine random seed
-    char *sweepSeedStr = getenv("SWEEP_SEED");
-    sweepSeed = (sweepSeedStr != NULL ? atoi(sweepSeedStr) : time(NULL));
-    generator = new std::default_random_engine(sweepSeed);
-
-    // Check for fill pattern
-    char* pattern = getenv("FILL_PATTERN");
-    if (pattern != NULL)
-    {
-      if (usePrepSrcKernel)
-      {
-        printf("[ERROR] Unable to use FILL_PATTERN and USE_PREP_KERNEL together\n");
-        exit(1);
-      }
-
-      int patternLen = strlen(pattern);
-      if (patternLen % 2)
-      {
-        printf("[ERROR] FILL_PATTERN must contain an even-number of hex digits\n");
-        exit(1);
-      }
-
-      // Read in bytes (two hex digits per byte, high nibble first)
-      std::vector<unsigned char> bytes;
-      unsigned char val = 0;
-      for (int i = 0; i < patternLen; i++)
-      {
-        if ('0' <= pattern[i] && pattern[i] <= '9')
-          val += (pattern[i] - '0');
-        else if ('A' <= pattern[i] && pattern[i] <= 'F')
-          val += (pattern[i] - 'A' + 10);
-        else if ('a' <= pattern[i] && pattern[i] <= 'f')
-          val += (pattern[i] - 'a' + 10);
-        else
-        {
-          printf("[ERROR] FILL_PATTERN must contain an even-number of hex digits (0-9'/a-f/A-F).  (not %c)\n", pattern[i]);
-          exit(1);
-        }
-
-        if (i % 2 == 0)
-          val <<= 4;
-        else
-        {
-          bytes.push_back(val);
-          val = 0;
-        }
-      }
-
-      // Reverse bytes (input is assumed to be given in big-endian)
-      std::reverse(bytes.begin(), bytes.end());
-
-      // Figure out how many copies of the pattern are necessary to fill a 4-byte float properly
-      int copies;
-      switch (patternLen % 8)
-      {
-      case 0:  copies = 1; break;
-      case 4:  copies = 2; break;
-      default: copies = 4; break;
-      }
-
-      // Fill floats
-      int numFloats = copies * patternLen / 8;
-      fillPattern.resize(numFloats);
-      unsigned char* rawData = (unsigned char*) fillPattern.data();
-      for (int i = 0; i < numFloats * 4; i++)
-        rawData[i] = bytes[i % bytes.size()];
-    }
-    else fillPattern.clear();
-
-    // Validate device counts before they are used to size tables and select HIP devices
-    if (numGpuDevices < 0)
-    {
-      printf("[ERROR] Number of GPUs to use (%d) must be non-negative\n", numGpuDevices);
-      exit(1);
-    }
-    if (numCpuDevices > numDetectedCpus)
-    {
-      printf("[ERROR] Number of CPUs to use (%d) cannot exceed number of detected CPUs (%d)\n", numCpuDevices, numDetectedCpus);
-      exit(1);
-    }
-    if (numGpuDevices > numDetectedGpus)
-    {
-      printf("[ERROR] Number of GPUs to use (%d) cannot exceed number of detected GPUs (%d)\n", numGpuDevices, numDetectedGpus);
-      exit(1);
-    }
-
-    // Figure out number of xccs per device
-    int maxNumXccs = 64;
-    xccIdsPerDevice.resize(numGpuDevices);
-    for (int i = 0; i < numGpuDevices; i++)
-    {
-      int* data;
-      HIP_CALL(hipSetDevice(i));
-      HIP_CALL(hipHostMalloc((void**)&data, maxNumXccs * sizeof(int)));
-      CollectXccIdsKernel<<<maxNumXccs, 1>>>(data);
-      HIP_CALL(hipDeviceSynchronize());
-
-      xccIdsPerDevice[i].clear();
-      for (int j = 0; j < maxNumXccs; j++)
-        xccIdsPerDevice[i].insert(data[j]);
-
-      HIP_CALL(hipHostFree(data));
-    }
-
-    // Check for CU mask
-    cuMask.clear();
-    char const* cuMaskEnv = getenv("CU_MASK");
-    if (cuMaskEnv != NULL)
-    {
-#if defined(__NVCC__)
-      printf("[WARN] CU_MASK is not supported in CUDA\n");
-#else
-      // Tokenize a private copy: strtok overwrites delimiters in its input, and the
-      // environment string must stay intact for later getenv("CU_MASK") calls
-      std::string cuMaskStr(cuMaskEnv);
-      std::vector<std::pair<int, int>> ranges;
-      int numXccs = (xccIdsPerDevice.size() > 0 ? xccIdsPerDevice[0].size() : 1);
-      int maxCU = 0;
-      char* token = strtok(&cuMaskStr[0], ",");
-      while (token)
-      {
-        int start, end;
-        if (sscanf(token, "%d-%d", &start, &end) == 2)
-        {
-          ranges.push_back(std::make_pair(std::min(start, end), std::max(start, end)));
-          maxCU = std::max(maxCU, std::max(start, end));
-        }
-        else if (sscanf(token, "%d", &start) == 1)
-        {
-          ranges.push_back(std::make_pair(start, start));
-          maxCU = std::max(maxCU, start);
-        }
-        else
-        {
-          printf("[ERROR] Unrecognized token [%s]\n", token);
-          exit(1);
-        }
-
-        // A negative CU index would address memory in front of the mask
-        if (ranges.back().first < 0)
-        {
-          printf("[ERROR] CU_MASK indices must be non-negative (token [%s])\n", token);
-          exit(1);
-        }
-        token = strtok(NULL, ",");
-      }
-
-      // Each CU owns numXccs consecutive bits.  Keep the minimum of two 32-bit words per XCC,
-      // but grow the mask when the largest requested CU index would not fit
-      size_t const bitsNeeded  = ((size_t)maxCU + 1) * numXccs;
-      size_t const wordsNeeded = (bitsNeeded + 31) / 32;
-      cuMask.resize(std::max<size_t>((size_t)2 * numXccs, wordsNeeded), 0);
-
-      for (auto range : ranges)
-      {
-        for (int i = range.first; i <= range.second; i++)
-        {
-          for (int x = 0; x < numXccs; x++)
-          {
-            size_t const targetBit = (size_t)i * numXccs + x;
-            cuMask[targetBit/32] |= (1u<<(targetBit%32));
-          }
-        }
-      }
-#endif
-    }
-
-    // Parse preferred XCC table (if provided)
-    prefXccTable.resize(numGpuDevices);
-    for (int i = 0; i < numGpuDevices; i++)
-    {
-      prefXccTable[i].resize(numGpuDevices, -1);
-    }
-
-    // Skipped when no GPUs are in use: there is no table to fill, and the
-    // src/dst computation below divides by numGpuDevices
-    char const* prefXccEnv = getenv("XCC_PREF_TABLE");
-    if (prefXccEnv && numGpuDevices > 0)
-    {
-      // Tokenize a private copy so the environment string is left untouched
-      std::string prefXccStr(prefXccEnv);
-      char* token = strtok(&prefXccStr[0], ",");
-      int tokenCount = 0;
-      while (token)
-      {
-        int xccId;
-        if (sscanf(token, "%d", &xccId) == 1)
-        {
-          // Tokens are consumed in row-major order: src GPU major, dst GPU minor
-          int src = tokenCount / numGpuDevices;
-          int dst = tokenCount % numGpuDevices;
-          if (xccIdsPerDevice[src].count(xccId) == 0)
-          {
-            printf("[ERROR] GPU %d does not contain XCC %d\n", src, xccId);
-            exit(1);
-          }
-          prefXccTable[src][dst] = xccId;
-
-          tokenCount++;
-          if (tokenCount == (numGpuDevices * numGpuDevices)) break;
-        }
-        else
-        {
-          printf("[ERROR] Unrecognized token [%s]\n", token);
-          exit(1);
-        }
-        token = strtok(NULL, ",");
-      }
-    }
-
-    // Perform some basic validation
-    if (gfxBlockSize % 64)
-    {
-      printf("[ERROR] GFX_BLOCK_SIZE (%d) must be a multiple of 64\n", gfxBlockSize);
-      exit(1);
-    }
-    if (gfxBlockSize > MAX_BLOCKSIZE)
-    {
-      printf("[ERROR] GFX_BLOCK_SIZE (%d) must be less than or equal to %d\n", gfxBlockSize, MAX_BLOCKSIZE);
-      exit(1);
-    }
-    if (byteOffset % sizeof(float))
-    {
-      printf("[ERROR] BYTE_OFFSET must be set to multiple of %lu\n", sizeof(float));
-      exit(1);
-    }
-    if (blockOrder < 0 || blockOrder > 2)
-    {
-      printf("[ERROR] BLOCK_ORDER must be 0 (Sequential), 1 (Interleaved), or 2 (Random)\n");
-      exit(1);
-    }
-    if (minNumVarSubExec  < 1)
-    {
-      printf("[ERROR] Minimum number of subexecutors for variable subexector transfers must be at least 1\n");
-      exit(1);
-    }
-    if (numWarmups < 0)
-    {
-      printf("[ERROR] NUM_WARMUPS must be set to a non-negative number\n");
-      exit(1);
-    }
-    if (samplingFactor < 1)
-    {
-      printf("[ERROR] SAMPLING_FACTOR must be greater or equal to 1\n");
-      exit(1);
-    }
-    if (sharedMemBytes < 0 || sharedMemBytes > maxSharedMemBytes)
-    {
-      printf("[ERROR] SHARED_MEM_BYTES must be between 0 and %d\n", maxSharedMemBytes);
-      exit(1);
-    }
-    if (blockBytes <= 0 || blockBytes % 4)
-    {
-      printf("[ERROR] BLOCK_BYTES must be a positive multiple of 4\n");
-      exit(1);
-    }
-    if (numGpuSubExecs <= 0)
-    {
-      printf("[ERROR] NUM_GPU_SE must be greater than 0\n");
-      exit(1);
-    }
-
-    if (numCpuSubExecs <= 0)
-    {
-      printf("[ERROR] NUM_CPU_SE must be greater than 0\n");
-      exit(1);
-    }
-
-    for (auto ch : sweepSrc)
-    {
-      if (!strchr(MemTypeStr, ch))
-      {
-        printf("[ERROR] Unrecognized memory type '%c' specified for sweep source\n", ch);
-        exit(1);
-      }
-      if (strchr(sweepSrc.c_str(), ch) != strrchr(sweepSrc.c_str(), ch))
-      {
-        printf("[ERROR] Duplicate memory type '%c' specified for sweep source\n", ch);
-        exit(1);
-      }
-    }
-
-    for (auto ch : sweepDst)
-    {
-      if (!strchr(MemTypeStr, ch))
-      {
-        printf("[ERROR] Unrecognized memory type '%c' specified for sweep destination\n", ch);
-        exit(1);
-      }
-      if (strchr(sweepDst.c_str(), ch) != strrchr(sweepDst.c_str(), ch))
-      {
-        printf("[ERROR] Duplicate memory type '%c' specified for sweep destination\n", ch);
-        exit(1);
-      }
-    }
-
-    for (auto ch : sweepExe)
-    {
-      if (!strchr(ExeTypeStr, ch))
-      {
-        printf("[ERROR] Unrecognized executor type '%c' specified for sweep executor\n", ch);
-        exit(1);
-      }
-      if (strchr(sweepExe.c_str(), ch) != strrchr(sweepExe.c_str(), ch))
-      {
-        printf("[ERROR] Duplicate executor type '%c' specified for sweep executor\n", ch);
-        exit(1);
-      }
-    }
-
-    if (a2aMode < 0 || a2aMode > 2)
-    {
-      printf("[ERROR] a2aMode must be between 0 and 2\n");
-      exit(1);
-    }
-
-    if (gfxUnroll < 1 || gfxUnroll > MAX_UNROLL)
-    {
-      printf("[ERROR] GFX kernel unroll factor must be between 1 and %d (Not %d)\n", MAX_UNROLL, gfxUnroll);
-      exit(1);
-    }
-
-    if (gfxWaveOrder < 0 || gfxWaveOrder >= 6)
-    {
-      printf("[ERROR] GFX wave order must be between 0 and 5\n");
-      exit(1);
-    }
-
-    // Determine how many CPUs exist per NUMA node (to avoid executing on NUMA without CPUs)
-    numCpusPerNuma.resize(numDetectedCpus);
-    int const totalCpus = numa_num_configured_cpus();
-    for (int i = 0; i < totalCpus; i++) {
-      int node = numa_node_of_cpu(i);
-      // Ignore CPUs whose node ID falls outside the table instead of writing past its end
-      if (node >= 0 && node < numDetectedCpus) numCpusPerNuma[node]++;
-    }
-
-    // Build array of wall clock rates per GPU device
-    wallClockPerDeviceMhz.resize(numDetectedGpus);
-    for (int i = 0; i < numDetectedGpus; i++)
-    {
-#if defined(__NVCC__)
-      wallClockPerDeviceMhz[i] = 1000000;
-#else
-      hipDeviceProp_t prop;
-      HIP_CALL(hipGetDeviceProperties(&prop, i));
-      std::string fullName = prop.gcnArchName;
-      std::string archName = fullName.substr(0, fullName.find(':'));
-      if (archName == "gfx940" || archName == "gfx941" || archName == "gfx942")
-        wallClockPerDeviceMhz[i] = 100000;
-      else
-        wallClockPerDeviceMhz[i] = 25000;
-#endif
-    }
-
-    // Check for deprecated env vars
-    if (getenv("USE_HIP_CALL"))
-    {
-      printf("[WARN] USE_HIP_CALL has been deprecated.  Please use DMA executor 'D' or set USE_GPU_DMA for P2P-Benchmark preset\n");
-      exit(1);
-    }
-
-    if (getenv("GPU_KERNEL"))
-    {
-      printf("[WARN] GPU_KERNEL has been deprecated and replaced by GFX_KERNEL and GFX_UNROLL\n");
-      exit(1);
-    }
-
-    char* enableSdma = getenv("HSA_ENABLE_SDMA");
-    if (enableSdma && !strcmp(enableSdma, "0"))
-    {
-      printf("[WARN] DMA functionality disabled due to environment variable HSA_ENABLE_SDMA=0.  Copies will fallback to blit kernels\n");
-    }
-  }
-
-  // Display info on the env vars that can be used
-  // Prints one help line per supported environment variable to stdout.
-  // Names and constraints are kept in sync with what the constructor parses:
-  // GFX_BLOCK_SIZE is the variable actually read, CU_MASK is a comma-separated
-  // list of CU indices / ranges, and GFX_UNROLL must lie in [1, MAX_UNROLL].
-  static void DisplayUsage()
-  {
-    printf("Environment variables:\n");
-    printf("======================\n");
-    printf(" ALWAYS_VALIDATE        - Validate after each iteration instead of once after all iterations\n");
-    printf(" BLOCK_BYTES            - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
-    printf(" BLOCK_ORDER            - Threadblock ordering in single-stream mode (0=Serial, 1=Interleaved, 2=Random)\n");
-    printf(" BYTE_OFFSET            - Initial byte-offset for memory allocations.  Must be multiple of 4. Defaults to 0\n");
-    printf(" CONTINUE_ON_ERROR      - Continue tests even after mismatch detected\n");
-    printf(" CU_MASK                - CU mask for streams specified as comma-separated CU indices or ranges (e.g. 0-3,8)\n");
-    printf(" FILL_PATTERN=STR       - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F).  Must be even number of digits, (byte-level big-endian)\n");
-    printf(" GFX_BLOCK_SIZE         - # of threads per threadblock (Must be multiple of 64). Defaults to 256\n");
-    printf(" GFX_UNROLL             - Unroll factor for GFX kernel, must be between 1 and %d\n", MAX_UNROLL);
-    printf(" GFX_SINGLE_TEAM        - Have subexecutors work together on full array instead of working on individual disjoint subarrays\n");
-    printf(" GFX_WAVE_ORDER         - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n");
-    printf(" HIDE_ENV               - Hide environment variable value listing\n");
-    printf(" MIN_VAR_SUBEXEC        - Minimum # of subexecutors to use for variable subExec Transfers\n");
-    printf(" MAX_VAR_SUBEXEC        - Maximum # of subexecutors to use for variable subExec Transfers (0 for device limits)\n");
-    printf(" NUM_CPU_DEVICES=X      - Restrict number of CPUs to X.  May not be greater than # detected NUMA nodes\n");
-    printf(" NUM_GPU_DEVICES=X      - Restrict number of GPUs to X.  May not be greater than # detected HIP devices\n");
-    printf(" NUM_ITERATIONS=I       - Perform I timed iteration(s) per test\n");
-    printf(" NUM_SUBITERATIONS=S    - Perform S sub-iteration(s) per iteration. Must be non-negative\n");
-    printf(" NUM_WARMUPS=W          - Perform W untimed warmup iteration(s) per test\n");
-    printf(" OUTPUT_TO_CSV          - Outputs to CSV format if set\n");
-    printf(" SAMPLING_FACTOR=F      - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
-    printf(" SHARED_MEM_BYTES=X     - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
-    printf(" SHOW_ITERATIONS        - Show per-iteration timing info\n");
-    printf(" USE_HSA_DMA            - Use hsa_amd_async_copy instead of hipMemcpy for non-targeted DMA execution\n");
-    printf(" USE_INTERACTIVE        - Pause for user-input before starting transfer loop\n");
-    printf(" USE_PCIE_INDEX         - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
-    printf(" USE_PREP_KERNEL        - Use GPU kernel to initialize source data array pattern\n");
-    printf(" USE_SINGLE_STREAM      - Use a single stream per GPU GFX executor instead of stream per Transfer\n");
-    printf(" USE_XCC_FILTER         - Use XCC filtering (experimental)\n");
-    printf(" VALIDATE_DIRECT        - Validate GPU destination memory directly instead of staging GPU memory on host\n");
-  }
-
-  // Helper macro to switch between CSV and terminal output
-  // Both macros print one line: NAME (left-justified, 20 wide), VALUE (12 wide) and DESCRIPTION.
-  // Fields are separated by "," when outputToCsv is set, otherwise by " = " and " : ".
-  // They read outputToCsv from the enclosing scope, and DESCRIPTION must provide .c_str().
-  // PRINT_EV formats VALUE with %d (integer value)
-#define PRINT_EV(NAME, VALUE, DESCRIPTION)                              \
-  printf("%-20s%s%12d%s%s\n", NAME, outputToCsv ? "," : " = ", VALUE, outputToCsv ? "," : " : ",  (DESCRIPTION).c_str())
-
-  // PRINT_ES formats VALUE with %s (C-string value)
-#define PRINT_ES(NAME, VALUE, DESCRIPTION)                           \
-  printf("%-20s%s%12s%s%s\n", NAME, outputToCsv ? "," : " = ", VALUE, outputToCsv ? "," : " : ",  (DESCRIPTION).c_str())
-
-  // Display env var settings
-  void DisplayEnvVars() const
-  {
-    if (!outputToCsv)
-    {
-      printf("TransferBench v%s\n", TB_VERSION);
-      printf("===============================================================\n");
-      if (!hideEnv) printf("[Common]                              (Suppress by setting HIDE_ENV=1)\n");
-    }
-    else if (!hideEnv)
-      printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION);
-    if (hideEnv) return;
-
-    PRINT_EV("ALWAYS_VALIDATE", alwaysValidate,
-             std::string("Validating after ") + (alwaysValidate ? "each iteration" : "all iterations"));
-    PRINT_EV("BLOCK_BYTES", blockBytes,
-             std::string("Each CU gets a multiple of " + std::to_string(blockBytes) + " bytes to copy"));
-    PRINT_EV("BLOCK_ORDER", blockOrder,
-             std::string("Transfer blocks order: " + std::string((blockOrder == 0 ? "Sequential"  :
-                                                                  blockOrder == 1 ? "Interleaved" :
-                                                                                    "Random"))));
-    PRINT_EV("BYTE_OFFSET", byteOffset,
-             std::string("Using byte offset of " + std::to_string(byteOffset)));
-    PRINT_EV("CONTINUE_ON_ERROR", continueOnError,
-             std::string(continueOnError ? "Continue on mismatch error" : "Stop after first error"));
-    PRINT_EV("CU_MASK", getenv("CU_MASK") ? 1 : 0,
-             (cuMask.size() ? GetCuMaskDesc() : "All"));
-    PRINT_EV("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
-             (fillPattern.size() ? std::string(getenv("FILL_PATTERN")) : PrepSrcValueString()));
-    PRINT_EV("GFX_BLOCK_SIZE", gfxBlockSize,
-             std::string("Threadblock size of " + std::to_string(gfxBlockSize)));
-    PRINT_EV("GFX_SINGLE_TEAM", gfxSingleTeam,
-             (gfxSingleTeam ? std::string("Combining CUs to work across entire data array") :
-                              std::string("Each CUs operates on its own disjoint subarray")));
-    PRINT_EV("GFX_UNROLL", gfxUnroll,
-             std::string("Using GFX unroll factor of ") + std::to_string(gfxUnroll));
-    PRINT_EV("GFX_WAVE_ORDER", gfxWaveOrder,
-             std::string("Using GFX wave ordering of ") + std::string((gfxWaveOrder == 0 ? "Unroll,Wavefront,CU" :
-                                                                       gfxWaveOrder == 1 ? "Unroll,CU,Wavefront" :
-                                                                       gfxWaveOrder == 2 ? "Wavefront,Unroll,CU" :
-                                                                       gfxWaveOrder == 3 ? "Wavefront,CU,Unroll" :
-                                                                       gfxWaveOrder == 4 ? "CU,Unroll,Wavefront" :
-                                                                                           "CU,Wavefront,Unroll")));
-    PRINT_EV("MIN_VAR_SUBEXEC", minNumVarSubExec,
-             std::string("Using at least ") + std::to_string(minNumVarSubExec) + " subexecutor(s) for variable subExec tranfers");
-    PRINT_EV("MAX_VAR_SUBEXEC", maxNumVarSubExec,
-             maxNumVarSubExec ?
-             std::string("Using at most ") + std::to_string(maxNumVarSubExec) + " subexecutor(s) for variable subExec tranfers" :
-             "Using up to maximum device subexecutors for variable subExec tranfers");
-    PRINT_EV("NUM_CPU_DEVICES", numCpuDevices,
-             std::string("Using ") + std::to_string(numCpuDevices) + " CPU devices");
-    PRINT_EV("NUM_GPU_DEVICES", numGpuDevices,
-             std::string("Using ") + std::to_string(numGpuDevices) + " GPU devices");
-    PRINT_EV("NUM_ITERATIONS", numIterations,
-             std::string("Running ") + std::to_string(numIterations > 0 ? numIterations : -numIterations) + " "
-             + (numIterations > 0 ? " timed iteration(s)" : "seconds(s) per Test"));
-    PRINT_EV("NUM_SUBITERATIONS", numSubIterations,
-             std::string("Running ") + (numSubIterations == 0 ? "infinite" : std::to_string(numSubIterations)) + " subiterations");
-    PRINT_EV("NUM_WARMUPS", numWarmups,
-             std::string("Running " + std::to_string(numWarmups) + " warmup iteration(s) per Test"));
-    PRINT_EV("SHARED_MEM_BYTES", sharedMemBytes,
-             std::string("Using " + std::to_string(sharedMemBytes) + " shared mem per threadblock"));
-    PRINT_EV("SHOW_ITERATIONS", showIterations,
-             std::string(showIterations ? "Showing" : "Hiding") + " per-iteration timing");
-    PRINT_EV("USE_HSA_DMA", useHsaDma,
-             std::string("Using ") + (useHsaDma ? "hsa_amd_async_copy" : "hipMemcpyAsync") + " for DMA execution");
-    PRINT_EV("USE_INTERACTIVE", useInteractive,
-             std::string("Running in ") + (useInteractive ? "interactive" : "non-interactive") + " mode");
-    PRINT_EV("USE_PCIE_INDEX", usePcieIndexing,
-             std::string("Use ") + (usePcieIndexing ? "PCIe" : "HIP") + " GPU device indexing");
-    PRINT_EV("USE_PREP_KERNEL", usePrepSrcKernel,
-             std::string("Using ") + (usePrepSrcKernel ? "GPU kernels" : "hipMemcpy") + " to initialize source data");
-    PRINT_EV("USE_SINGLE_STREAM", useSingleStream,
-             std::string("Using single stream per ") + (useSingleStream ? "device" : "Transfer"));
-    PRINT_EV("USE_XCC_FILTER", useXccFilter,
-             std::string("XCC filtering ") + (useXccFilter ? "enabled" : "disabled"));
-    if (useXccFilter)
-    {
-      printf("%36s: Preferred XCC Table (XCC_PREF_TABLE)\n", "");
-      printf("%36s:         ", "");
-      for (int i = 0; i < numGpuDevices; i++) printf(" %3d", i); printf(" (#XCCs)\n");
-      for (int i = 0; i < numGpuDevices; i++)
-      {
-        printf("%36s: GPU %3d ", "", i);
-        for (int j = 0; j < numGpuDevices; j++)
-          printf(" %3d", prefXccTable[i][j]);
-        printf(" %3lu\n", xccIdsPerDevice[i].size());
-      }
-    }
-    PRINT_EV("VALIDATE_DIRECT", validateDirect,
-             std::string("Validate GPU destination memory ") + (validateDirect ? "directly" : "via CPU staging buffer"));
-    printf("\n");
-
-    if (blockOrder != ORDER_SEQUENTIAL && !useSingleStream)
-      printf("[WARN] BLOCK_ORDER is ignored if USE_SINGLE_STREAM is not enabled\n");
-  };
-
-  // Display env var for P2P Benchmark preset
-  void DisplayP2PBenchmarkEnvVars() const
-  {
-    DisplayEnvVars();
-
-    if (hideEnv) return;
-
-    if (!outputToCsv)
-      printf("[P2P Related]\n");
-
-    PRINT_EV("NUM_CPU_SE", numCpuSubExecs,
-             std::string("Using ") + std::to_string(numCpuSubExecs) + " CPU subexecutors");
-    PRINT_EV("NUM_GPU_SE", numGpuSubExecs,
-             std::string("Using ") + std::to_string(numGpuSubExecs) + " GPU subexecutors");
-    PRINT_EV("P2P_MODE", p2pMode,
-             std::string("Running ") + (p2pMode == 1 ? "Unidirectional" :
-                                        p2pMode == 2 ? "Bidirectional"  :
-                                                       "Unidirectional + Bidirectional"));
-    PRINT_EV("USE_FINE_GRAIN", useFineGrain,
-             std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
-
-    PRINT_EV("USE_GPU_DMA", useDmaCopy,
-             std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor");
-    PRINT_EV("USE_REMOTE_READ", useRemoteRead,
-             std::string("Using ") + (useRemoteRead ? "DST" : "SRC") + " as executor");
-    printf("\n");
-  }
-
-  // Display env var settings
-  void DisplaySweepEnvVars() const
-  {
-    DisplayEnvVars();
-    if (hideEnv) return;
-
-    if (!outputToCsv)
-      printf("[Sweep Related]\n");
-    PRINT_ES("SWEEP_DST", sweepDst.c_str(),
-             std::string("Destination Memory Types to sweep"));
-    PRINT_ES("SWEEP_EXE", sweepExe.c_str(),
-             std::string("Executor Types to sweep"));
-    PRINT_EV("SWEEP_MAX", sweepMax,
-             std::string("Max simultaneous transfers (0 = no limit)"));
-    PRINT_EV("SWEEP_MIN", sweepMin,
-             std::string("Min simultaenous transfers"));
-    PRINT_EV("SWEEP_RAND_BYTES", sweepRandBytes,
-             std::string("Using ") + (sweepRandBytes ? "random" : "constant") + " number of bytes per Transfer");
-    PRINT_EV("SWEEP_SEED", sweepSeed,
-             std::string("Random seed set to ") + std::to_string(sweepSeed));
-    PRINT_ES("SWEEP_SRC", sweepSrc.c_str(),
-             std::string("Source Memory Types to sweep"));
-    PRINT_EV("SWEEP_TEST_LIMIT", sweepTestLimit,
-             std::string("Max number of tests to run during sweep (0 = no limit)"));
-    PRINT_EV("SWEEP_TIME_LIMIT", sweepTimeLimit,
-             std::string("Max number of seconds to run sweep for  (0 = no limit)"));
-    PRINT_EV("SWEEP_XGMI_MAX", sweepXgmiMax,
-             std::string("Max number of XGMI hops for Transfers (-1 = no limit)"));
-    PRINT_EV("SWEEP_XGMI_MIN", sweepXgmiMin,
-             std::string("Min number of XGMI hops for Transfers"));
-    printf("\n");
-  }
-
-  void DisplayA2AEnvVars() const
-  {
-    DisplayEnvVars();
-    if (hideEnv) return;
-    if (!outputToCsv)
-      printf("[AllToAll Related]\n");
-    PRINT_EV("A2A_DIRECT", a2aDirect,
-             std::string(a2aDirect ? "Only using direct links" : "Full all-to-all"));
-    PRINT_EV("A2A_MODE", a2aMode,
-             std::string(a2aMode == 0 ? "Perform copy" :
-                         a2aMode == 1 ? "Perform read-only" :
-                                        "Perform write-only"));
-    PRINT_EV("USE_FINE_GRAIN", useFineGrain,
-             std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
-    PRINT_EV("USE_GPU_DMA", useDmaCopy,
-             std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor");
-    PRINT_EV("USE_REMOTE_READ", useRemoteRead,
-             std::string("Using ") + (useRemoteRead ? "DST" : "SRC") + " as executor");
-
-    printf("\n");
-  }
-
-  void DisplaySchmooEnvVars() const
-  {
-    DisplayEnvVars();
-    if (hideEnv) return;
-    if (!outputToCsv)
-      printf("[Schmoo Related]\n");
-    PRINT_EV("USE_FINE_GRAIN", useFineGrain,
-             std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
-  }
-
-  void DisplayRemoteWriteEnvVars() const
-  {
-    DisplayEnvVars();
-    if (hideEnv) return;
-    if (!outputToCsv)
-      printf("[Remote-Write Related]\n");
-    PRINT_EV("USE_FINE_GRAIN", useFineGrain,
-             std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
-    PRINT_EV("USE_REMOTE_READ", useRemoteRead,
-             std::string("Performing remote ") + (useRemoteRead ? "reads" : "writes"));
-    printf("\n");
-  }
-
-  void DisplayParallelCopyEnvVars() const
-  {
-    DisplayEnvVars();
-    if (hideEnv) return;
-    if (!outputToCsv)
-      printf("[Parallel-copy Related]\n");
-    PRINT_EV("USE_FINE_GRAIN", useFineGrain,
-             std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
-    PRINT_EV("USE_GPU_DMA", useDmaCopy,
-             std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor");
-    printf("\n");
-  }
-
-  // Helper function that gets parses environment variable or sets to default value
-  static int GetEnvVar(std::string const& varname, int defaultValue)
-  {
-    if (getenv(varname.c_str()))
-      return atoi(getenv(varname.c_str()));
-    return defaultValue;
-  }
-
-  static std::string GetEnvVar(std::string const& varname, std::string const& defaultValue)
-  {
-    if (getenv(varname.c_str()))
-      return getenv(varname.c_str());
-    return defaultValue;
-  }
-
-  std::string GetCuMaskDesc() const
-  {
-    std::vector<std::pair<int, int>> runs;
-    int numXccs = (xccIdsPerDevice.size() > 0 ? xccIdsPerDevice[0].size() : 1);
-    bool inRun = false;
-    std::pair<int, int> curr;
-    int used = 0;
-    for (int targetBit = 0; targetBit < cuMask.size() * 32; targetBit += numXccs) {
-      if (cuMask[targetBit/32] & (1 << (targetBit%32))) {
-        used++;
-        if (!inRun) {
-          inRun = true;
-          curr.first = targetBit / numXccs;
-        }
-      } else {
-        if (inRun) {
-          inRun = false;
-          curr.second = targetBit / numXccs - 1;
-          runs.push_back(curr);
-        }
-      }
-    }
-    if (inRun)
-      curr.second = (cuMask.size() * 32) / numXccs - 1;
-
-    std::string result = "CUs used: (" + std::to_string(used) + ") ";
-    for (int i = 0; i < runs.size(); i++)
-    {
-      if (i) result += ",";
-      if (runs[i].first == runs[i].second) result += std::to_string(runs[i].first);
-      else result += std::to_string(runs[i].first) + "-" + std::to_string(runs[i].second);
-    }
-    return result;
-  }
-};
-
-#endif
--- a/src/include/GetClosestNumaNode.hpp
+++ b/src/include/GetClosestNumaNode.hpp
-/*
-Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#pragma once
-
-// Helper macro for checking HSA calls
-#define HSA_CHECK(cmd)                                                  \
-  do {                                                                  \
-    hsa_status_t error = (cmd);                                         \
-    if (error != HSA_STATUS_SUCCESS) {                                  \
-      const char* errString = NULL;                                     \
-      hsa_status_string(error, &errString);                             \
-      std::cerr << "Encountered HSA error (" << errString << ") at line " \
-                << __LINE__ << " in file " << __FILE__ << "\n";         \
-      exit(-1);                                                         \
-    }                                                                   \
-  } while (0)
-
-// Structure to hold HSA agent information
-#if !defined(__NVCC__)
-struct AgentData
-{
-  bool isInitialized;
-  std::vector<hsa_agent_t> cpuAgents;
-  std::vector<hsa_agent_t> gpuAgents;
-  std::vector<int> closestNumaNode;
-};
-
-// Simple callback function to return any memory pool for an agent
-hsa_status_t MemPoolInfoCallback(hsa_amd_memory_pool_t pool, void *data)
-{
-  hsa_amd_memory_pool_t* poolData = reinterpret_cast<hsa_amd_memory_pool_t*>(data);
-
-  // Check memory pool flags
-  uint32_t poolFlags;
-  HSA_CHECK(hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &poolFlags));
-
-  // Only consider coarse-grained pools
-  if (!(poolFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)) return HSA_STATUS_SUCCESS;
-
-  *poolData = pool;
-  return HSA_STATUS_SUCCESS;
-}
-
-// Callback function to gather HSA agent information
-hsa_status_t AgentInfoCallback(hsa_agent_t agent, void* data)
-{
-  AgentData* agentData = reinterpret_cast<AgentData*>(data);
-
-  // Get the device type
-  hsa_device_type_t deviceType;
-  HSA_CHECK(hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &deviceType));
-  if (deviceType == HSA_DEVICE_TYPE_CPU)
-    agentData->cpuAgents.push_back(agent);
-  if (deviceType == HSA_DEVICE_TYPE_GPU)
-  {
-    agentData->gpuAgents.push_back(agent);
-    agentData->closestNumaNode.push_back(0);
-  }
-
-  return HSA_STATUS_SUCCESS;
-}
-
-AgentData& GetAgentData()
-{
-  static AgentData agentData = {};
-
-  if (!agentData.isInitialized) {
-    agentData.isInitialized = true;
-
-    // Add all detected agents to the list
-    HSA_CHECK(hsa_iterate_agents(AgentInfoCallback, &agentData));
-
-    // Loop over each GPU
-    for (uint32_t i = 0; i < agentData.gpuAgents.size(); i++) {
-      // Collect memory pool
-      hsa_amd_memory_pool_t pool;
-      HSA_CHECK(hsa_amd_agent_iterate_memory_pools(agentData.gpuAgents[i], MemPoolInfoCallback, &pool));
-
-      // Loop over each CPU agent and check distance
-      agentData.closestNumaNode[i] = 0;
-      int bestDistance = -1;
-      for (uint32_t j = 0; j < agentData.cpuAgents.size(); j++) {
-        // Determine number of hops from GPU memory pool to CPU agent
-        uint32_t hops = 0;
-        HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j],
-                                                     pool,
-                                                     HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS,
-                                                     &hops));
-        // Gather link info
-        if (hops) {
-          hsa_amd_memory_pool_link_info_t* link_info =
-            (hsa_amd_memory_pool_link_info_t *)malloc(hops * sizeof(hsa_amd_memory_pool_link_info_t));
-          HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j],
-                                                       pool,
-                                                       HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO,
-                                                       link_info));
-          int numaDist = 0;
-          for (int k = 0; k < hops; k++)
-            numaDist += link_info[k].numa_distance;
-
-          if (bestDistance == -1 || numaDist < bestDistance) {
-            agentData.closestNumaNode[i] = j;
-            bestDistance = numaDist;
-          }
-          free(link_info);
-        }
-      }
-    }
-  }
-  return agentData;
-}
-#endif
-
-// Returns closest CPU NUMA node to provided GPU
-// NOTE: This assumes HSA GPU indexing is similar to HIP GPU indexing
-int GetClosestNumaNode(int gpuIdx)
-{
-#if defined(__NVCC__)
-  return -1;
-#else
-  AgentData& agentData = GetAgentData();
-  if (gpuIdx < 0 || gpuIdx >= agentData.closestNumaNode.size())
-  {
-    printf("[ERROR] GPU index out is out of bounds\n");
-    exit(1);
-  }
-  return agentData.closestNumaNode[gpuIdx];
-#endif
-}
--- a/src/include/Kernels.hpp
+++ b/src/include/Kernels.hpp
-/*
-Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#pragma once
-
-#define PackedFloat_t   float4
-#define MAX_BLOCKSIZE   512
-#define FLOATS_PER_PACK (sizeof(PackedFloat_t) / sizeof(float))
-#define MEMSET_CHAR     75
-#define MEMSET_VAL      13323083.0f
-
-
-#if defined(__NVCC__)
-#define warpSize 32
-#endif
-
-#define MAX_WAVEGROUPS  MAX_BLOCKSIZE / warpSize
-#define MAX_UNROLL      8
-#define NUM_WAVEORDERS  6
-
-// Each subExecutor is provided with subarrays to work on
-#define MAX_SRCS 16
-#define MAX_DSTS 16
-struct SubExecParam
-{
-  // Inputs
-  size_t    N;                                  // Number of floats this subExecutor works on
-  int       numSrcs;                            // Number of source arrays
-  int       numDsts;                            // Number of destination arrays
-  float*    src[MAX_SRCS];                      // Source array pointers
-  float*    dst[MAX_DSTS];                      // Destination array pointers
-  int32_t   preferredXccId;                     // XCC ID to execute on
-
-  // Prepared
-  int       teamSize;                           // Index of this sub executor amongst team
-  int       teamIdx;                            // Size of team this sub executor is part of
-
-  // Outputs
-  long long startCycle;                         // Start timestamp for in-kernel timing (GPU-GFX executor)
-  long long stopCycle;                          // Stop  timestamp for in-kernel timing (GPU-GFX executor)
-  uint32_t  hwId;                               // Hardware ID
-  uint32_t  xccId;                              // XCC ID
-};
-
-// Macro for collecting HW_REG_HW_ID
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__)
-#define GetHwId(hwId) \
-  hwId = 0
-#elif defined(__NVCC__)
-#define GetHwId(hwId) \
-  asm("mov.u32 %0, %smid;" : "=r"(hwId) )
-#else
-#define GetHwId(hwId) \
-  asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (hwId));
-#endif
-
-// Macro for collecting HW_REG_XCC_ID
-#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
-#define GetXccId(val) \
-  asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val));
-#else
-#define GetXccId(val) \
-  val = 0
-#endif
-
-void CpuReduceKernel(SubExecParam const& p)
-{
-  int const& numSrcs = p.numSrcs;
-  int const& numDsts = p.numDsts;
-
-  if (numSrcs == 0)
-  {
-    for (int i = 0; i < numDsts; ++i)
-      memset(p.dst[i], MEMSET_CHAR, p.N * sizeof(float));
-  }
-  else if (numSrcs == 1)
-  {
-    float const* __restrict__ src = p.src[0];
-    if (numDsts == 0)
-    {
-      float sum = 0.0;
-      for (int j = 0; j < p.N; j++)
-        sum += p.src[0][j];
-
-      // Add a dummy check to ensure the read is not optimized out
-      if (sum != sum)
-      {
-        printf("[ERROR] Nan detected\n");
-      }
-    }
-    else
-    {
-      for (int i = 0; i < numDsts; ++i)
-      {
-        memcpy(p.dst[i], src, p.N * sizeof(float));
-      }
-    }
-  }
-  else
-  {
-    float sum = 0.0f;
-    for (int j = 0; j < p.N; j++)
-    {
-      sum = p.src[0][j];
-      for (int i = 1; i < numSrcs; i++) sum += p.src[i][j];
-      for (int i = 0; i < numDsts; i++) p.dst[i][j] = sum;
-    }
-  }
-}
-
-std::string PrepSrcValueString()
-{
-  return "Element i = ((i * 517) modulo 383 + 31) * (srcBufferIdx + 1)";
-}
-
-__host__ __device__ float PrepSrcValue(int srcBufferIdx, size_t idx)
-{
-  return (((idx % 383) * 517) % 383 + 31) * (srcBufferIdx + 1);
-}
-
-__global__ void CollectXccIdsKernel(int* xccIds)
-{
-  int xccId;
-  GetXccId(xccId);
-  xccIds[blockIdx.x] = xccId;
-}
-
-// GPU kernel to prepare src buffer data
-__global__ void
-PrepSrcDataKernel(float* ptr, size_t N, int srcBufferIdx)
-{
-  for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
-       idx < N;
-       idx += blockDim.x * gridDim.x)
-  {
-    ptr[idx] = PrepSrcValue(srcBufferIdx, idx);
-  }
-}
-
-__device__ int64_t GetTimestamp()
-{
-#if defined(__NVCC__)
-  int64_t result;
-  asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(result));
-  return result;
-#else
-  return wall_clock64();
-#endif
-}
-
-// Helper function for memset
-template <typename T> __device__ __forceinline__ T      MemsetVal();
-template <>           __device__ __forceinline__ float  MemsetVal(){ return MEMSET_VAL; };
-template <>           __device__ __forceinline__ float4 MemsetVal(){ return make_float4(MEMSET_VAL, MEMSET_VAL, MEMSET_VAL, MEMSET_VAL); }
-
-template <int BLOCKSIZE, int UNROLL>
-__global__ void __launch_bounds__(BLOCKSIZE)
-  GpuReduceKernel(SubExecParam* params, int waveOrder, int numSubIterations)
-{
-  int64_t startCycle;
-  if (threadIdx.x == 0) startCycle = GetTimestamp();
-
-  SubExecParam& p = params[blockIdx.y];
-
-  // (Experimental) Filter by XCC if desired
-#if !defined(__NVCC__)
-  int32_t xccId;
-  GetXccId(xccId);
-  if (p.preferredXccId != -1 && xccId != p.preferredXccId) return;
-#endif
-
-  // Collect data information
-  int32_t const  numSrcs  = p.numSrcs;
-  int32_t const  numDsts  = p.numDsts;
-  float4  const* __restrict__ srcFloat4[MAX_SRCS];
-  float4*        __restrict__ dstFloat4[MAX_DSTS];
-  for (int i = 0; i < numSrcs; i++) srcFloat4[i] = (float4*)p.src[i];
-  for (int i = 0; i < numDsts; i++) dstFloat4[i] = (float4*)p.dst[i];
-
-  // Operate on wavefront granularity
-  int32_t const nTeams   = p.teamSize;             // Number of threadblocks working together on this subarray
-  int32_t const teamIdx  = p.teamIdx;              // Index of this threadblock within the team
-  int32_t const nWaves   = BLOCKSIZE   / warpSize; // Number of wavefronts within this threadblock
-  int32_t const waveIdx  = threadIdx.x / warpSize; // Index of this wavefront within the threadblock
-  int32_t const tIdx     = threadIdx.x % warpSize; // Thread index within wavefront
-
-  size_t  const numFloat4 = p.N / 4;
-
-  int32_t teamStride, waveStride, unrlStride, teamStride2, waveStride2;
-  switch (waveOrder)
-  {
-  case 0: /* U,W,C */ unrlStride = 1; waveStride = UNROLL; teamStride = UNROLL * nWaves;  teamStride2 = nWaves; waveStride2 = 1     ; break;
-  case 1: /* U,C,W */ unrlStride = 1; teamStride = UNROLL; waveStride = UNROLL * nTeams;  teamStride2 = 1;      waveStride2 = nTeams; break;
-  case 2: /* W,U,C */ waveStride = 1; unrlStride = nWaves; teamStride = nWaves * UNROLL;  teamStride2 = nWaves; waveStride2 = 1     ; break;
-  case 3: /* W,C,U */ waveStride = 1; teamStride = nWaves; unrlStride = nWaves * nTeams;  teamStride2 = nWaves; waveStride2 = 1     ; break;
-  case 4: /* C,U,W */ teamStride = 1; unrlStride = nTeams; waveStride = nTeams * UNROLL;  teamStride2 = 1;      waveStride2 = nTeams; break;
-  case 5: /* C,W,U */ teamStride = 1; waveStride = nTeams; unrlStride = nTeams * nWaves;  teamStride2 = 1;      waveStride2 = nTeams; break;
-  }
-
-  int subIterations = 0;
-  while (1) {
-    // First loop: Each wavefront in the team works on UNROLL float4s per thread
-    size_t const loop1Stride = nTeams * nWaves * UNROLL * warpSize;
-    size_t const loop1Limit  = numFloat4 / loop1Stride * loop1Stride;
-    {
-      float4 val[UNROLL];
-      if (numSrcs == 0) {
-        #pragma unroll
-        for (int u = 0; u < UNROLL; u++)
-          val[u] = MemsetVal<float4>();
-      }
-
-      for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx; idx < loop1Limit; idx += loop1Stride)
-      {
-        // Read sources into memory and accumulate in registers
-        if (numSrcs)
-        {
-          for (int u = 0; u < UNROLL; u++)
-            val[u] = srcFloat4[0][idx + u * unrlStride * warpSize];
-          for (int s = 1; s < numSrcs; s++)
-            for (int u = 0; u < UNROLL; u++)
-              val[u] += srcFloat4[s][idx + u * unrlStride * warpSize];
-        }
-
-        // Write accumulation to all outputs
-        for (int d = 0; d < numDsts; d++)
-        {
-          #pragma unroll
-          for (int u = 0; u < UNROLL; u++)
-            dstFloat4[d][idx + u * unrlStride * warpSize] = val[u];
-        }
-      }
-    }
-
-    // Second loop: Deal with remaining float4s
-    {
-      if (loop1Limit < numFloat4)
-      {
-        float4 val;
-        if (numSrcs == 0) val = MemsetVal<float4>();
-
-        size_t const loop2Stride = nTeams * nWaves * warpSize;
-        for (size_t idx = loop1Limit + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < numFloat4; idx += loop2Stride)
-        {
-          if (numSrcs)
-          {
-            val = srcFloat4[0][idx];
-            for (int s = 1; s < numSrcs; s++)
-              val += srcFloat4[s][idx];
-          }
-
-          for (int d = 0; d < numDsts; d++)
-            dstFloat4[d][idx] = val;
-        }
-      }
-    }
-
-    // Third loop; Deal with remaining floats
-    {
-      if (numFloat4 * 4 < p.N)
-      {
-        float val;
-        if (numSrcs == 0) val = MemsetVal<float>();
-
-        size_t const loop3Stride = nTeams * nWaves * warpSize;
-        for( size_t idx = numFloat4 * 4 + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride)
-        {
-          if (numSrcs)
-          {
-            val = p.src[0][idx];
-            for (int s = 1; s < numSrcs; s++)
-              val += p.src[s][idx];
-          }
-
-          for (int d = 0; d < numDsts; d++)
-            p.dst[d][idx] = val;
-        }
-      }
-    }
-
-    if (++subIterations == numSubIterations) break;
-  }
-
-  // Wait for all threads to finish
-  __syncthreads();
-  if (threadIdx.x == 0)
-  {
-    __threadfence_system();
-    p.stopCycle  = GetTimestamp();
-    p.startCycle = startCycle;
-    GetHwId(p.hwId);
-    GetXccId(p.xccId);
-  }
-}
-
-typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int);
-
-#define GPU_KERNEL_UNROLL_DECL(BLOCKSIZE) \
-  {GpuReduceKernel<BLOCKSIZE, 1>,  \
-   GpuReduceKernel<BLOCKSIZE, 2>,  \
-   GpuReduceKernel<BLOCKSIZE, 3>,  \
-   GpuReduceKernel<BLOCKSIZE, 4>,  \
-   GpuReduceKernel<BLOCKSIZE, 5>,  \
-   GpuReduceKernel<BLOCKSIZE, 6>,  \
-   GpuReduceKernel<BLOCKSIZE, 7>,  \
-   GpuReduceKernel<BLOCKSIZE, 8>}
-
-GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL] =
-{
-  GPU_KERNEL_UNROLL_DECL(64),
-  GPU_KERNEL_UNROLL_DECL(128),
-  GPU_KERNEL_UNROLL_DECL(192),
-  GPU_KERNEL_UNROLL_DECL(256),
-  GPU_KERNEL_UNROLL_DECL(320),
-  GPU_KERNEL_UNROLL_DECL(384),
-  GPU_KERNEL_UNROLL_DECL(448),
-  GPU_KERNEL_UNROLL_DECL(512)
-};
--- a/src/include/TransferBench.hpp
+++ b/src/include/TransferBench.hpp
-/*
-Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-#pragma once
-
-#include <vector>
-#include <sstream>
-#include <chrono>
-#include <cstdio>
-#include <cstdlib>
-#include <cstdint>
-#include <set>
-#include <unistd.h>
-#include <map>
-#include <iostream>
-#include <sstream>
-#include "Compatibility.hpp"
-#include "EnvVars.hpp"
-
-// Simple configuration parameters
-size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<26);  // Amount of data transferred per Transfer (1<<26 bytes = 64 MiB)
-
-// NOTE(review): presumably the maximum length (in characters) of one line of Transfer input - confirm at use sites
-#define MAX_LINE_LEN 32768
-
-// Different src/dst memory types supported
-// The numeric values double as indices into MemTypeStr (see CharToMemType), so the order must stay in sync
-typedef enum
-{
-  MEM_CPU          = 0, // Coarse-grained pinned CPU memory
-  MEM_GPU          = 1, // Coarse-grained global GPU memory
-  MEM_CPU_FINE     = 2, // Fine-grained pinned CPU memory
-  MEM_GPU_FINE     = 3, // Fine-grained global GPU memory
-  MEM_CPU_UNPINNED = 4, // Unpinned CPU memory
-  MEM_NULL         = 5, // NULL memory - used for empty
-  MEM_MANAGED      = 6  // Managed memory (classified as a GPU memory type by IsGpuType) - NOTE(review): confirm how it is allocated
-} MemType;
-
-// Different executor types supported
-// The numeric values double as indices into ExeTypeStr (see CharToExeType) and, presumably, ExeTypeName
-typedef enum
-{
-  EXE_CPU          = 0, // CPU executor              (subExecutor = CPU thread)
-  EXE_GPU_GFX      = 1, // GPU kernel-based executor (subExecutor = threadblock/CU)
-  EXE_GPU_DMA      = 2, // GPU SDMA-based executor   (subExecutor = streams)
-} ExeType;
-
-// Memory-type / executor-type classification helpers.
-// They are defined in this header, so they are declared inline: a non-inline definition would be emitted by
-// every translation unit that includes the header and break the link with multiple-definition errors.
-
-// True for memory types classified as GPU memory (coarse-grained, fine-grained or managed)
-inline bool IsGpuType(MemType m) { return (m == MEM_GPU || m == MEM_GPU_FINE || m == MEM_MANAGED); }
-// True for memory types classified as CPU memory (pinned coarse/fine-grained or unpinned); MEM_NULL is neither CPU nor GPU
-inline bool IsCpuType(MemType m) { return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED); }
-// True for executors that run on a GPU (kernel-based GFX or SDMA-based DMA)
-inline bool IsGpuType(ExeType e) { return (e == EXE_GPU_GFX || e == EXE_GPU_DMA); }
-// True for the CPU executor
-inline bool IsCpuType(ExeType e) { return (e == EXE_CPU); }
-
-// One-letter code for each MemType, ordered by enum value:
-//   C = MEM_CPU, G = MEM_GPU, B = MEM_CPU_FINE, F = MEM_GPU_FINE, U = MEM_CPU_UNPINNED, N = MEM_NULL, M = MEM_MANAGED
-char const MemTypeStr[8] = "CGBFUNM";
-// One-letter code for each ExeType, ordered by enum value: C = EXE_CPU, G = EXE_GPU_GFX, D = EXE_GPU_DMA
-char const ExeTypeStr[4] = "CGD";
-// Three-letter executor names, listed in the same order as the ExeType enum values
-char const ExeTypeName[3][4] = {"CPU", "GPU", "DMA"};
-
-// Converts a one-letter memory type code (case-insensitive) into its MemType value.
-// The position of the letter within MemTypeStr is the enum value.
-// Prints an error message and terminates the process with exit code 1 if the character is not a known code.
-MemType inline CharToMemType(char const c)
-{
-  // strchr() treats the terminating NUL as part of the string, so '\0' has to be rejected explicitly;
-  // otherwise it would be converted into an out-of-range MemType instead of being reported.
-  // toupper() is only defined for values representable as unsigned char, hence the cast.
-  char const* val = (c == '\0') ? nullptr : strchr(MemTypeStr, toupper(static_cast<unsigned char>(c)));
-  if (val) return (MemType)(val - MemTypeStr);
-  printf("[ERROR] Unexpected memory type (%c)\n", c);
-  exit(1);
-}
-
-// Converts a one-letter executor type code (case-insensitive) into its ExeType value.
-// The position of the letter within ExeTypeStr is the enum value.
-// Prints an error message and terminates the process with exit code 1 if the character is not a known code.
-ExeType inline CharToExeType(char const c)
-{
-  // strchr() treats the terminating NUL as part of the string, so '\0' has to be rejected explicitly;
-  // otherwise it would be converted into an out-of-range ExeType instead of being reported.
-  // toupper() is only defined for values representable as unsigned char, hence the cast.
-  char const* val = (c == '\0') ? nullptr : strchr(ExeTypeStr, toupper(static_cast<unsigned char>(c)));
-  if (val) return (ExeType)(val - ExeTypeStr);
-  printf("[ERROR] Unexpected executor type (%c)\n", c);
-  exit(1);
-}
-
-// Each Transfer performs reads from source memory location(s), sums them (if multiple sources are specified)
-// then writes the summation to each of the specified destination memory location(s)
-// The members are grouped into user-provided inputs, measured outputs, and internal bookkeeping
-struct Transfer
-{
-  // Inputs
-  ExeType                    exeType;            // Transfer executor type
-  int                        exeIndex;           // Executor index (NUMA node for CPU / device ID for GPU)
-  int                        exeSubIndex;        // Executor subindex
-  int                        numSubExecs;        // Number of subExecutors to use for this Transfer
-  size_t                     numBytes;           // # of bytes requested to Transfer (may be 0 to fallback to default)
-  int                        numSrcs;            // Number of sources
-  std::vector<MemType>       srcType;            // Source memory types
-  std::vector<int>           srcIndex;           // Source device indices
-  int                        numDsts;            // Number of destinations
-  std::vector<MemType>       dstType;            // Destination memory types
-  std::vector<int>           dstIndex;           // Destination device indices
-
-  // Outputs
-  size_t                     numBytesActual;     // Actual number of bytes to copy
-  double                     transferTime;       // Time taken in milliseconds for this transfer
-  double                     transferBandwidth;  // Transfer bandwidth (GB/s)
-  double                     executorBandwidth;  // Executor bandwidth (GB/s)
-  std::vector<double>        perIterationTime;   // Per-iteration timing
-  std::vector<std::set<std::pair<int,int>>> perIterationCUs; // Per-iteration CU usage
-
-  // Internal
-  int                        transferIndex;      // Transfer identifier (within a Test)
-  std::vector<float*>        srcMem;             // Source memory
-  std::vector<float*>        dstMem;             // Destination memory
-  std::vector<SubExecParam>  subExecParam;       // Defines subarrays assigned to each threadblock
-  SubExecParam*              subExecParamGpuPtr; // Pointer to GPU copy of subExecParam
-  std::vector<int>           subExecIdx;         // Indices into subExecParamGpu
-
-  // The HSA handles below are only declared when not compiling with NVCC
-#if !defined(__NVCC__)
-  // For targeted-SDMA
-  hsa_agent_t                dstAgent;           // DMA destination memory agent
-  hsa_agent_t                srcAgent;           // DMA source memory agent
-  hsa_signal_t               signal;             // HSA signal for completion
-  hsa_amd_sdma_engine_id_t   sdmaEngineId;       // DMA engine ID
-#endif
-
-  // Prepares src/dst subarray pointers for each SubExecutor
-  void PrepareSubExecParams(EnvVars const& ev);
-
-  // Prepare source arrays with input data
-  // NOTE(review): meaning of the bool return value is not visible in this header - confirm at the definition
-  bool PrepareSrc(EnvVars const& ev);
-
-  // Validate that destination data contains expected results
-  void ValidateDst(EnvVars const& ev);
-
-  // Prepare reference buffers
-  void PrepareReference(EnvVars const& ev, std::vector<float>& buffer, int bufferIdx);
-
-  // String representation functions
-  std::string SrcToStr() const;
-  std::string DstToStr() const;
-};
-
-// Per-executor bookkeeping: this is the mapped value of TransferMap, i.e. one instance per Executor
-struct ExecutorInfo
-{
-  std::vector<Transfer*>   transfers;        // Transfers to execute
-  size_t                   totalBytes;       // Total bytes this executor transfers
-  int                      totalSubExecs;    // Total number of subExecutors to use
-
-  // For GPU-Executors
-  SubExecParam*            subExecParamGpu;  // GPU copy of subExecutor parameters
-  std::vector<hipStream_t> streams;          // HIP streams - NOTE(review): how many are created is not visible here; confirm
-  std::vector<hipEvent_t>  startEvents;      // HIP events - NOTE(review): presumably mark the start of timed work; confirm
-  std::vector<hipEvent_t>  stopEvents;       // HIP events - NOTE(review): presumably mark the end of timed work; confirm
-
-  // Results
-  double totalTime;                          // NOTE(review): units not stated here (Transfer::transferTime is in milliseconds) - confirm
-};
-
-// Result record for one executor; stored per executor in TestResults::exeResults
-struct ExeResult
-{
-  double bandwidthGbs;            // Bandwidth, GB/s going by the name - NOTE(review): confirm how it differs from sumBandwidthGbs
-  double durationMsec;            // Duration, milliseconds going by the name
-  double sumBandwidthGbs;         // NOTE(review): presumably the sum of the individual Transfer bandwidths - confirm
-  size_t totalBytes;              // NOTE(review): presumably total bytes moved by this executor - confirm
-  std::vector<int> transferIdx;   // NOTE(review): presumably indices of the Transfers run by this executor - confirm
-};
-
-// Results of one test run; returned by ExecuteTransfersImpl and consumed by ReportResults
-struct TestResults
-{
-  size_t numTimedIterations;      // Number of timed iterations (per the name)
-  size_t totalBytesTransferred;   // Total number of bytes transferred (per the name)
-  double totalBandwidthCpu;       // NOTE(review): presumably aggregate bandwidth based on CPU-side timing - confirm
-  double totalDurationMsec;       // Total duration, milliseconds going by the name
-  double overheadMsec;            // NOTE(review): definition of "overhead" is not visible here - confirm
-  std::map<std::pair<ExeType, int>, ExeResult> exeResults;  // Per-executor results, keyed by (executor type, int) pair
-};
-
-// An executor is identified by its type plus an integer (presumably the executor index)
-typedef std::pair<ExeType, int> Executor;
-// Associates each Executor with its ExecutorInfo; std::map keeps the entries ordered by Executor key
-typedef std::map<Executor, ExecutorInfo> TransferMap;
-
-// Display usage instructions
-void DisplayUsage(char const* cmdName);
-
-// Display detected GPU topology / CPU numa nodes
-void DisplayTopology(bool const outputToCsv);
-
-// Build array of test sizes based on sampling factor
-void PopulateTestSizes(size_t const numBytesPerTransfer, int const samplingFactor,
-                       std::vector<size_t>& valuesofN);
-
-// Parse a memory / executor token
-// NOTE(review): presumably the parsed values are returned through the non-const reference parameters - confirm
-void ParseMemType(EnvVars const& ev, std::string const& token, std::vector<MemType>& memType, std::vector<int>& memIndex);
-void ParseExeType(EnvVars const& ev, std::string const& token, ExeType& exeType, int& exeIndex, int& exeSubIndex);
-
-// Parse a line of text into Transfers
-// NOTE(review): 'line' is a non-const char*, so the implementation may modify the buffer - confirm
-void ParseTransfers(EnvVars const& ev, char* line, std::vector<Transfer>& transfers);
-
-// Execute a set of Transfers; 'verbose' defaults to true and 'totalBandwidthCpu' is an optional pointer (nullptr by default)
-void ExecuteTransfers(EnvVars const& ev, int const testNum, size_t const N,
-                      std::vector<Transfer>& transfers, bool verbose = true,
-                      double* totalBandwidthCpu = nullptr);
-TestResults ExecuteTransfersImpl(EnvVars const& ev, std::vector<Transfer>& transfers);
-// NOTE(review): 'results' is taken by value, so every call copies the TestResults (including its map)
-void ReportResults(EnvVars const& ev, std::vector<Transfer> const& transfers, TestResults const results);
-void EnablePeerAccess(int const deviceId, int const peerDeviceId);
-// Memory helpers; 'memPtr' in AllocateMemory is an output parameter (pointer-to-pointer)
-void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr);
-void DeallocateMemory(MemType memType, void* memPtr, size_t const size = 0);
-void CheckPages(char* byteArray, size_t numBytes, int targetId);
-void RunTransfer(EnvVars const& ev, int const iteration, ExecutorInfo& exeInfo, int const transferIdx);
-// Preset benchmark entry points (defined outside this header)
-void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N);
-void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int const maxSubExecs);
-void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExec, int const numCpuSubExec, bool const isRandom);
-void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const numSubExecs);
-void RunSchmooBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs);
-void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus);
-void RunParallelCopyBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus);
-// NOTE(review): unlike the other entry points, this one takes EnvVars by value (a private copy)
-void RunHealthCheck(EnvVars ev);
-
-std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
-
-int RemappedIndex(int const origIdx, bool const isCpuType);
-// Write the Transfers of test 'testNum' to the given FILE stream
-void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers);
-std::string PtrVectorToStr(std::vector<float*> const& strVector, int const initOffset);