Unverified Commit 9658305f authored by gilbertlee-amd and committed by GitHub

Header-only TransferBench library refactor (#134)

parent b56d4817
......@@ -6,3 +6,4 @@ _static/
_templates/
_toc.yml
docBin/
TransferBench
......@@ -3,6 +3,30 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.54
### Modified
- Refactored TransferBench into a header-only library combined with a thin client to facilitate the
  use of TransferBench as the backend for other applications (see the consumer sketch after these notes)
- Optimized how data validation is handled; this should speed up Tests with many parallel Transfers because
  data is only generated once
- Preset benchmarks no longer take extra command-line arguments; preset settings are accessed only via
  environment variables, and the details for each preset are printed
- The a2a preset benchmark now defaults to using fine-grained memory and GFX unroll of 2
- Refactored how Transfers are launched in parallel which has reduced some CPU-side overheads
- CPU and DMA executor timing now uses CPU wall-clock time instead of the slowest Transfer time
### Added
- New one2all preset which sweeps over all subsets of parallel Transfers from one GPU to the others
- Added new warnings for DMA execution relating to how HIP will default to using agents from the source memory
### Removed
- CU scaling preset has been removed. Similar functionality already exists in the schmoo preset benchmark
- Preparation of source data via GFX kernel has been removed (USE_PREP_KERNEL)
- Removed GFX block-reordering (BLOCK_ORDER)
- Removed NUM_CPU_DEVICES and NUM_GPU_DEVICES from the common environment variables; they now apply only to the presets that use them
- Removed SHARED_MEM_BYTES option for GFX executor
- Removed USE_PCIE_INDEX
### Fixed
- Fixed a potential timing reporting issue when DMA-executed Transfers end up getting serialized
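
Below is a minimal, hypothetical sketch of what consuming the new header-only library could look like, based only on what this commit shows: the header lives at `src/header/TransferBench.hpp`, it defines a `constexpr char VERSION[]` (the constant the Sphinx `conf.py` now greps for), and clients link against `numa`, `hsa-runtime64`, and `pthread` per the updated `CMakeLists.txt`. The `TransferBench::` namespace qualification is an assumption, not confirmed by this diff.

```cpp
// Hypothetical consumer of the header-only library (illustration only, not part of this commit).
// Build with -I<repo>/src/header plus the ROCm include/lib paths the new CMakeLists.txt uses,
// and link numa, hsa-runtime64, and pthread.
#include <cstdio>
#include "TransferBench.hpp"

int main()
{
  // VERSION is the constant docs/conf.py extracts; the namespace qualification is assumed
  std::printf("Backend: TransferBench header v%s\n", TransferBench::VERSION);
  return 0;
}
```
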
## v1.53
### Added
- Added ability to specify NULL for sweep preset as source or destination memory type
......
# Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
if (DEFINED ENV{ROCM_PATH})
set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE STRING "ROCm install directory")
else()
......@@ -6,13 +7,13 @@ else()
endif()
cmake_minimum_required(VERSION 3.5)
project(TransferBench VERSION 1.51.0 LANGUAGES CXX)
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib")
project(TransferBench VERSION 1.54.0 LANGUAGES CXX)
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 --std=c++20 -L${ROCM_PATH}/lib")
include_directories(${ROCM_PATH}/include)
link_libraries(numa hsa-runtime64 pthread)
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ..)
add_executable(TransferBench src/TransferBench.cpp)
target_include_directories(TransferBench PRIVATE src/include)
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY .)
add_executable(TransferBench src/client/Client.cpp)
target_include_directories(TransferBench PRIVATE src/header src/client src/client/Presets)
find_package(ROCM 0.8 REQUIRED PATHS ${ROCM_PATH})
include(ROCMInstallTargets)
......
#
# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
#
all:
cd src ; make
# Configuration options
ROCM_PATH ?= /opt/rocm
CUDA_PATH ?= /usr/local/cuda
HIPCC=$(ROCM_PATH)/bin/hipcc
NVCC=$(CUDA_PATH)/bin/nvcc
# Compile TransferBenchCuda if nvcc detected
ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
EXE=TransferBenchCuda
else
EXE=TransferBench
endif
CXXFLAGS = -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
NVFLAGS = -x cu -lnuma -arch=native
COMMON_FLAGS = -O3 --std=c++20 -I./src/header -I./src/client -I./src/client/Presets
LDFLAGS += -lpthread
all: $(EXE)
TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.hpp")
$(HIPCC) $(CXXFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)
TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.hpp")
$(NVCC) $(NVFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)
clean:
cd src ; make clean
rm -f *.o ./TransferBench ./TransferBenchCuda
......@@ -8,8 +8,8 @@ import re
from rocm_docs import ROCmDocs
with open('../src/include/EnvVars.hpp', encoding='utf-8') as f:
match = re.search(r'#define TB_VERSION "([0-9.]+)[^0-9.]+', f.read())
with open('../src/header/TransferBench.hpp', encoding='utf-8') as f:
match = re.search(r'constexpr char VERSION\[\] = "([0-9.]+)[^0-9.]+', f.read())
if not match:
raise ValueError("VERSION not found!")
version_number = match[1]
......@@ -18,7 +18,7 @@ left_nav_title = f"TransferBench {version_number} Documentation"
# for PDF output on Read the Docs
project = "TransferBench Documentation"
author = "Advanced Micro Devices, Inc."
copyright = "Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved."
copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved."
version = version_number
release = version_number
......
......@@ -47,7 +47,7 @@ To build documentation locally, use:
.. code-block:: bash
cd docs
pip3 install -r .sphinx/requirements.txt
pip3 install -r ./sphinx/requirements.txt
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
NVIDIA platform support
......
# Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
ROCM_PATH ?= /opt/rocm
CUDA_PATH ?= /usr/local/cuda
HIPCC=$(ROCM_PATH)/bin/hipcc
NVCC=$(CUDA_PATH)/bin/nvcc
# Compile TransferBenchCuda if nvcc detected
ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
EXE=../TransferBenchCuda
else
EXE=../TransferBench
endif
CXXFLAGS = -O3 -Iinclude -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
NVFLAGS = -O3 -Iinclude -x cu -lnuma -arch=native
LDFLAGS += -lpthread
all: $(EXE)
../TransferBench: TransferBench.cpp $(shell find -regex ".*\.\hpp")
$(HIPCC) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
../TransferBenchCuda: TransferBench.cpp $(shell find -regex ".*\.\hpp")
$(NVCC) $(NVFLAGS) $< -o $@ $(LDFLAGS)
clean:
rm -f *.o ../TransferBench ../TransferBenchCuda
/*
Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// This program measures simultaneous copy performance across multiple GPUs
// on the same node
#include <numa.h>   // If not found, try installing libnuma-dev (e.g. apt-get install libnuma-dev)
#include <cmath>    // If not found, try installing g++-12 (e.g. apt-get install g++-12)
#include <numaif.h>
#include <random>
#include <stack>
#include <thread>
#include "TransferBench.hpp"
#include "GetClosestNumaNode.hpp"
int main(int argc, char **argv)
{
// Check for NUMA library support
if (numa_available() == -1)
{
printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
exit(1);
}
// Display usage instructions and detected topology
if (argc <= 1)
{
int const outputToCsv = EnvVars::GetEnvVar("OUTPUT_TO_CSV", 0);
if (!outputToCsv) DisplayUsage(argv[0]);
DisplayTopology(outputToCsv);
exit(0);
}
// Collect environment variables / display current run configuration
EnvVars ev;
// Determine number of bytes to run per Transfer
size_t numBytesPerTransfer = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_TRANSFER;
if (argc > 2)
{
// Adjust bytes if unit specified
char units = argv[2][strlen(argv[2])-1];
switch (units)
{
case 'K': case 'k': numBytesPerTransfer *= 1024; break;
case 'M': case 'm': numBytesPerTransfer *= 1024*1024; break;
case 'G': case 'g': numBytesPerTransfer *= 1024*1024*1024; break;
}
}
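// Worked example of the suffix handling above (comment added for illustration; not in the original source):
// argv[2] == "64M" -> atoll("64M") yields 64, then the 'M' suffix scales it to 64 * 1024 * 1024 = 67108864 bytes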
if (numBytesPerTransfer % 4)
{
printf("[ERROR] numBytesPerTransfer (%lu) must be a multiple of 4\n", numBytesPerTransfer);
exit(1);
}
// Check for preset tests
// - Tests that sweep across possible sets of Transfers
if (!strcmp(argv[1], "sweep") || !strcmp(argv[1], "rsweep"))
{
int numGpuSubExecs = (argc > 3 ? atoi(argv[3]) : 4);
int numCpuSubExecs = (argc > 4 ? atoi(argv[4]) : 4);
ev.configMode = CFG_SWEEP;
RunSweepPreset(ev, numBytesPerTransfer, numGpuSubExecs, numCpuSubExecs, !strcmp(argv[1], "rsweep"));
exit(0);
}
// - Tests that benchmark peer-to-peer performance
else if (!strcmp(argv[1], "p2p"))
{
ev.configMode = CFG_P2P;
RunPeerToPeerBenchmarks(ev, numBytesPerTransfer / sizeof(float));
exit(0);
}
// - Test SubExecutor scaling
else if (!strcmp(argv[1], "scaling"))
{
int maxSubExecs = (argc > 3 ? atoi(argv[3]) : 32);
int exeIndex = (argc > 4 ? atoi(argv[4]) : 0);
if (exeIndex >= ev.numGpuDevices)
{
printf("[ERROR] Cannot execute scaling test with GPU device %d\n", exeIndex);
exit(1);
}
ev.configMode = CFG_SCALE;
RunScalingBenchmark(ev, numBytesPerTransfer / sizeof(float), exeIndex, maxSubExecs);
exit(0);
}
// - Test all2all benchmark
else if (!strcmp(argv[1], "a2a"))
{
int numSubExecs = (argc > 3 ? atoi(argv[3]) : 4);
// Force single-stream mode for all-to-all benchmark
ev.useSingleStream = 1;
ev.configMode = CFG_A2A;
RunAllToAllBenchmark(ev, numBytesPerTransfer, numSubExecs);
exit(0);
}
// Health check
else if (!strcmp(argv[1], "healthcheck")) {
RunHealthCheck(ev);
exit(0);
}
// - Test schmoo benchmark
else if (!strcmp(argv[1], "schmoo"))
{
if (ev.numGpuDevices < 2)
{
printf("[ERROR] Schmoo benchmark requires at least 2 GPUs\n");
exit(1);
}
ev.configMode = CFG_SCHMOO;
int localIdx = (argc > 3 ? atoi(argv[3]) : 0);
int remoteIdx = (argc > 4 ? atoi(argv[4]) : 1);
int maxSubExecs = (argc > 5 ? atoi(argv[5]) : 32);
if (localIdx >= ev.numGpuDevices || remoteIdx >= ev.numGpuDevices)
{
printf("[ERROR] Cannot execute schmoo test with local GPU device %d, remote GPU device %d\n", localIdx, remoteIdx);
exit(1);
}
ev.DisplaySchmooEnvVars();
for (int N = 256; N <= (1<<27); N *= 2)
{
int delta = std::max(1, N / ev.samplingFactor);
int curr = (numBytesPerTransfer == 0) ? N : numBytesPerTransfer / sizeof(float);
do
{
RunSchmooBenchmark(ev, curr * sizeof(float), localIdx, remoteIdx, maxSubExecs);
if (numBytesPerTransfer != 0) exit(0);
curr += delta;
} while (curr < N * 2);
}
}
else if (!strcmp(argv[1], "rwrite"))
{
if (ev.numGpuDevices < 2)
{
printf("[ERROR] Remote write benchmark requires at least 2 GPUs\n");
exit(1);
}
ev.DisplayRemoteWriteEnvVars();
int numSubExecs = (argc > 3 ? atoi(argv[3]) : 4);
int srcIdx = (argc > 4 ? atoi(argv[4]) : 0);
int minGpus = (argc > 5 ? atoi(argv[5]) : 1);
int maxGpus = (argc > 6 ? atoi(argv[6]) : ev.numGpuDevices - 1);
for (int N = 256; N <= (1<<27); N *= 2)
{
int delta = std::max(1, N / ev.samplingFactor);
int curr = (numBytesPerTransfer == 0) ? N : numBytesPerTransfer / sizeof(float);
do
{
RunRemoteWriteBenchmark(ev, curr * sizeof(float), numSubExecs, srcIdx, minGpus, maxGpus);
if (numBytesPerTransfer != 0) exit(0);
curr += delta;
} while (curr < N * 2);
}
}
else if (!strcmp(argv[1], "pcopy"))
{
if (ev.numGpuDevices < 2)
{
printf("[ERROR] Parallel copy benchmark requires at least 2 GPUs\n");
exit(1);
}
ev.DisplayParallelCopyEnvVars();
int numSubExecs = (argc > 3 ? atoi(argv[3]) : 8);
int srcIdx = (argc > 4 ? atoi(argv[4]) : 0);
int minGpus = (argc > 5 ? atoi(argv[5]) : 1);
int maxGpus = (argc > 6 ? atoi(argv[6]) : ev.numGpuDevices - 1);
if (maxGpus > ev.gpuMaxHwQueues && ev.useDmaCopy)
{
printf("[ERROR] DMA executor %d attempting %d parallel transfers, however GPU_MAX_HW_QUEUES only set to %d\n",
srcIdx, maxGpus, ev.gpuMaxHwQueues);
printf("[ERROR] Aborting to avoid misleading results due to potential serialization of Transfers\n");
exit(1);
}
for (int N = 256; N <= (1<<27); N *= 2)
{
int delta = std::max(1, N / ev.samplingFactor);
int curr = (numBytesPerTransfer == 0) ? N : numBytesPerTransfer / sizeof(float);
do
{
RunParallelCopyBenchmark(ev, curr * sizeof(float), numSubExecs, srcIdx, minGpus, maxGpus);
if (numBytesPerTransfer != 0) exit(0);
curr += delta;
} while (curr < N * 2);
}
}
else if (!strcmp(argv[1], "cmdline"))
{
// Print environment variables
ev.DisplayEnvVars();
// Read Transfer from command line
std::string cmdlineTransfer;
for (int i = 3; i < argc; i++)
cmdlineTransfer += std::string(argv[i]) + " ";
char line[MAX_LINE_LEN];
sprintf(line, "%s", cmdlineTransfer.c_str());
std::vector<Transfer> transfers;
ParseTransfers(ev, line, transfers);
if (transfers.empty()) exit(0);
// If the number of bytes is specified, use it
if (numBytesPerTransfer != 0)
{
size_t N = numBytesPerTransfer / sizeof(float);
ExecuteTransfers(ev, 1, N, transfers);
}
else
{
// Otherwise generate a range of values
for (int N = 256; N <= (1<<27); N *= 2)
{
int delta = std::max(1, N / ev.samplingFactor);
int curr = N;
while (curr < N * 2)
{
ExecuteTransfers(ev, 1, curr, transfers);
curr += delta;
}
}
}
exit(0);
}
// Check that Transfer configuration file can be opened
ev.configMode = CFG_FILE;
FILE* fp = fopen(argv[1], "r");
if (!fp)
{
printf("[ERROR] Unable to open transfer configuration file: [%s]\n", argv[1]);
exit(1);
}
// Print environment variables and CSV header
ev.DisplayEnvVars();
int testNum = 0;
char line[MAX_LINE_LEN];
while(fgets(line, MAX_LINE_LEN, fp))
{
// Check if line is a comment to be echoed to output (starts with ##)
if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') printf("%s", line);
// Parse set of parallel Transfers to execute
std::vector<Transfer> transfers;
ParseTransfers(ev, line, transfers);
if (transfers.empty()) continue;
// If the number of bytes is specified, use it
if (numBytesPerTransfer != 0)
{
size_t N = numBytesPerTransfer / sizeof(float);
ExecuteTransfers(ev, ++testNum, N, transfers);
}
else
{
// Otherwise generate a range of values
for (int N = 256; N <= (1<<27); N *= 2)
{
int delta = std::max(1, N / ev.samplingFactor);
int curr = N;
while (curr < N * 2)
{
ExecuteTransfers(ev, ++testNum, curr, transfers);
curr += delta;
}
}
}
}
fclose(fp);
return 0;
}
void ExecuteTransfers(EnvVars const& ev,
int const testNum,
size_t const N,
std::vector<Transfer>& transfers,
bool verbose,
double* totalBandwidthCpu)
{
// Check for any Transfers using variable number of sub-executors
std::vector<int> varTransfers;
std::vector<int> numUsedSubExec(ev.numGpuDevices, 0);
std::vector<int> numVarSubExec(ev.numGpuDevices, 0);
for (int i = 0; i < transfers.size(); i++) {
Transfer& t = transfers[i];
t.transferIndex = i;
t.numBytesActual = (t.numBytes ? t.numBytes : N * sizeof(float));
if (t.exeType == EXE_GPU_GFX) {
if (t.numSubExecs == 0) {
varTransfers.push_back(i);
numVarSubExec[t.exeIndex]++;
} else {
numUsedSubExec[t.exeIndex] += t.numSubExecs;
}
} else if (t.numSubExecs == 0) {
printf("[ERROR] Variable subexecutor count is only supported for GFX executor\n");
exit(1);
}
}
if (verbose && !ev.outputToCsv) printf("Test %d:\n", testNum);
TestResults testResults;
if (varTransfers.size() == 0) {
testResults = ExecuteTransfersImpl(ev, transfers);
} else {
// Determine maximum number of subexecutors
int maxNumSubExec = 0;
if (ev.maxNumVarSubExec) {
maxNumSubExec = ev.maxNumVarSubExec;
} else {
HIP_CALL(hipDeviceGetAttribute(&maxNumSubExec, hipDeviceAttributeMultiprocessorCount, 0));
for (int device = 0; device < ev.numGpuDevices; device++) {
int numSubExec = 0;
HIP_CALL(hipDeviceGetAttribute(&numSubExec, hipDeviceAttributeMultiprocessorCount, device));
int leftOverSubExec = numSubExec - numUsedSubExec[device];
if (leftOverSubExec < numVarSubExec[device])
maxNumSubExec = 1;
else if (numVarSubExec[device] != 0) {
maxNumSubExec = std::min(maxNumSubExec, leftOverSubExec / numVarSubExec[device]);
}
}
}
// Loop over subexecs
std::vector<Transfer> bestTransfers;
for (int numSubExec = ev.minNumVarSubExec; numSubExec <= maxNumSubExec; numSubExec++) {
std::vector<Transfer> currTransfers = transfers;
for (auto idx : varTransfers) {
currTransfers[idx].numSubExecs = numSubExec;
}
TestResults tempResults = ExecuteTransfersImpl(ev, currTransfers);
if (tempResults.totalBandwidthCpu > testResults.totalBandwidthCpu) {
bestTransfers = currTransfers;
testResults = tempResults;
}
}
transfers = bestTransfers;
}
if (totalBandwidthCpu) *totalBandwidthCpu = testResults.totalBandwidthCpu;
if (verbose) {
ReportResults(ev, transfers, testResults);
}
}
TestResults ExecuteTransfersImpl(EnvVars const& ev,
std::vector<Transfer>& transfers)
{
// Map transfers by executor
TransferMap transferMap;
for (int i = 0; i < transfers.size(); i++)
{
Transfer& transfer = transfers[i];
transfer.transferIndex = i;
Executor executor(transfer.exeType, transfer.exeIndex);
ExecutorInfo& executorInfo = transferMap[executor];
executorInfo.transfers.push_back(&transfer);
}
// Loop over each executor and prepare sub-executors
std::map<int, Transfer*> transferList;
for (auto& exeInfoPair : transferMap)
{
Executor const& executor = exeInfoPair.first;
ExecutorInfo& exeInfo = exeInfoPair.second;
ExeType const exeType = executor.first;
int const exeIndex = RemappedIndex(executor.second, IsCpuType(exeType));
exeInfo.totalTime = 0.0;
exeInfo.totalSubExecs = 0;
// Loop over each transfer this executor is involved in
for (Transfer* transfer : exeInfo.transfers)
{
// Allocate source memory
transfer->srcMem.resize(transfer->numSrcs);
for (int iSrc = 0; iSrc < transfer->numSrcs; ++iSrc)
{
MemType const& srcType = transfer->srcType[iSrc];
int const srcIndex = RemappedIndex(transfer->srcIndex[iSrc], IsCpuType(srcType));
// Ensure executing GPU can access source memory
if (IsGpuType(exeType) && IsGpuType(srcType) && srcIndex != exeIndex)
EnablePeerAccess(exeIndex, srcIndex);
AllocateMemory(srcType, srcIndex, transfer->numBytesActual + ev.byteOffset, (void**)&transfer->srcMem[iSrc]);
}
// Allocate destination memory
transfer->dstMem.resize(transfer->numDsts);
for (int iDst = 0; iDst < transfer->numDsts; ++iDst)
{
MemType const& dstType = transfer->dstType[iDst];
int const dstIndex = RemappedIndex(transfer->dstIndex[iDst], IsCpuType(dstType));
// Ensure executing GPU can access destination memory
if (IsGpuType(exeType) && IsGpuType(dstType) && dstIndex != exeIndex)
EnablePeerAccess(exeIndex, dstIndex);
AllocateMemory(dstType, dstIndex, transfer->numBytesActual + ev.byteOffset, (void**)&transfer->dstMem[iDst]);
}
exeInfo.totalSubExecs += transfer->numSubExecs;
transferList[transfer->transferIndex] = transfer;
}
// Prepare additional requirement for GPU-based executors
if (IsGpuType(exeType))
{
HIP_CALL(hipSetDevice(exeIndex));
// Single-stream is only supported for GFX-based executors
int const numStreamsToUse = (exeType == EXE_GPU_DMA || !ev.useSingleStream) ? exeInfo.transfers.size() : 1;
exeInfo.streams.resize(numStreamsToUse);
exeInfo.startEvents.resize(numStreamsToUse);
exeInfo.stopEvents.resize(numStreamsToUse);
for (int i = 0; i < numStreamsToUse; ++i)
{
if (ev.cuMask.size())
{
#if !defined(__NVCC__)
HIP_CALL(hipExtStreamCreateWithCUMask(&exeInfo.streams[i], ev.cuMask.size(), ev.cuMask.data()));
#else
printf("[ERROR] CU Masking in not supported on NVIDIA hardware\n");
exit(-1);
#endif
}
else
{
HIP_CALL(hipStreamCreate(&exeInfo.streams[i]));
}
HIP_CALL(hipEventCreate(&exeInfo.startEvents[i]));
HIP_CALL(hipEventCreate(&exeInfo.stopEvents[i]));
}
if (exeType == EXE_GPU_GFX)
{
// Allocate one contiguous chunk of GPU memory for threadblock parameters
// This allows support for executing one transfer per stream, or all transfers in a single stream
#if !defined(__NVCC__)
AllocateMemory(MEM_GPU, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam),
(void**)&exeInfo.subExecParamGpu);
#else
AllocateMemory(MEM_MANAGED, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam),
(void**)&exeInfo.subExecParamGpu);
#endif
// Check for sufficient subExecutors
int numDeviceCUs = 0;
HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, exeIndex));
if (exeInfo.totalSubExecs > numDeviceCUs)
{
printf("[WARN] GFX executor %d requesting %d total subexecutors, however only has %d. Some Transfers may be serialized\n",
exeIndex, exeInfo.totalSubExecs, numDeviceCUs);
}
}
// Check for targeted DMA
if (exeType == EXE_GPU_DMA)
{
bool useRandomDma = false;
bool useTargetDma = false;
// Check for sufficient hardware queues
#if !defined(__NVCC__)
if (exeInfo.transfers.size() > ev.gpuMaxHwQueues)
{
printf("[WARN] DMA executor %d attempting %lu parallel transfers, however GPU_MAX_HW_QUEUES only set to %d\n",
exeIndex, exeInfo.transfers.size(), ev.gpuMaxHwQueues);
}
#endif
for (Transfer* transfer : exeInfo.transfers)
{
if (transfer->exeSubIndex != -1 || ev.useHsaDma)
{
useTargetDma = (transfer->exeSubIndex != -1);
#if defined(__NVCC__)
printf("[ERROR] DMA executor subindex not supported on NVIDIA hardware\n");
exit(-1);
#else
if (transfer->numSrcs != 1 || transfer->numDsts != 1)
{
printf("[ERROR] DMA Transfer must have at exactly one source and one destination");
exit(1);
}
// Collect HSA agent information
hsa_amd_pointer_info_t info;
info.size = sizeof(info);
HSA_CHECK(hsa_amd_pointer_info(transfer->dstMem[0], &info, NULL, NULL, NULL));
transfer->dstAgent = info.agentOwner;
HSA_CHECK(hsa_amd_pointer_info(transfer->srcMem[0], &info, NULL, NULL, NULL));
transfer->srcAgent = info.agentOwner;
// Create HSA completion signal
HSA_CHECK(hsa_signal_create(1, 0, NULL, &transfer->signal));
// Check for valid engine Id
if (transfer->exeSubIndex < -1 || transfer->exeSubIndex >= 32)
{
printf("[ERROR] DMA executor subindex must be between 0 and 31\n");
exit(1);
}
// Check that engine Id exists between agents
if (useTargetDma) {
uint32_t engineIdMask = 0;
HSA_CHECK(hsa_amd_memory_copy_engine_status(transfer->dstAgent,
transfer->srcAgent,
&engineIdMask));
transfer->sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << transfer->exeSubIndex);
if (!(transfer->sdmaEngineId & engineIdMask))
{
printf("[ERROR] DMA executor %d.%d does not exist or cannot copy between source %s to destination %s\n",
transfer->exeIndex, transfer->exeSubIndex,
transfer->SrcToStr().c_str(),
transfer->DstToStr().c_str());
exit(1);
}
}
#endif
}
else
{
useRandomDma = true;
}
}
if (useRandomDma && useTargetDma)
{
printf("[WARN] Mixing targeted and untargetted DMA execution on GPU %d may result in resource conflicts\n",
exeIndex);
}
}
}
}
// Prepare input memory and block parameters for current N
bool isSrcCorrect = true;
for (auto& exeInfoPair : transferMap)
{
Executor const& executor = exeInfoPair.first;
ExecutorInfo& exeInfo = exeInfoPair.second;
ExeType const exeType = executor.first;
int const exeIndex = RemappedIndex(executor.second, IsCpuType(exeType));
exeInfo.totalBytes = 0;
for (int i = 0; i < exeInfo.transfers.size(); ++i)
{
// Prepare subarrays each threadblock works on and fill src memory with patterned data
Transfer* transfer = exeInfo.transfers[i];
transfer->PrepareSubExecParams(ev);
isSrcCorrect &= transfer->PrepareSrc(ev);
exeInfo.totalBytes += transfer->numBytesActual;
}
// Copy block parameters to GPU for GPU executors
if (exeType == EXE_GPU_GFX)
{
std::vector<SubExecParam> tempSubExecParam;
if (!ev.useSingleStream || (ev.blockOrder == ORDER_SEQUENTIAL))
{
// Assign Transfers to sequential threadblocks
int transferOffset = 0;
for (Transfer* transfer : exeInfo.transfers)
{
transfer->subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset;
transfer->subExecIdx.clear();
for (int subExecIdx = 0; subExecIdx < transfer->subExecParam.size(); subExecIdx++)
{
transfer->subExecIdx.push_back(transferOffset + subExecIdx);
tempSubExecParam.push_back(transfer->subExecParam[subExecIdx]);
}
transferOffset += transfer->numSubExecs;
}
}
else if (ev.blockOrder == ORDER_INTERLEAVED)
{
// Interleave threadblocks of different Transfers
exeInfo.transfers[0]->subExecParamGpuPtr = exeInfo.subExecParamGpu;
for (int subExecIdx = 0; tempSubExecParam.size() < exeInfo.totalSubExecs; ++subExecIdx)
{
for (Transfer* transfer : exeInfo.transfers)
{
if (subExecIdx < transfer->numSubExecs)
{
transfer->subExecIdx.push_back(tempSubExecParam.size());
tempSubExecParam.push_back(transfer->subExecParam[subExecIdx]);
}
}
}
}
else if (ev.blockOrder == ORDER_RANDOM)
{
std::vector<std::pair<int,int>> indices;
exeInfo.transfers[0]->subExecParamGpuPtr = exeInfo.subExecParamGpu;
// Build up a list of (transfer,subExecParam) indices, then randomly sort them
for (int i = 0; i < exeInfo.transfers.size(); i++)
{
Transfer* transfer = exeInfo.transfers[i];
for (int subExecIdx = 0; subExecIdx < transfer->numSubExecs; subExecIdx++)
indices.push_back(std::make_pair(i, subExecIdx));
}
std::shuffle(indices.begin(), indices.end(), *ev.generator);
// Build randomized threadblock list
for (auto p : indices)
{
Transfer* transfer = exeInfo.transfers[p.first];
transfer->subExecIdx.push_back(tempSubExecParam.size());
tempSubExecParam.push_back(transfer->subExecParam[p.second]);
}
}
HIP_CALL(hipSetDevice(exeIndex));
HIP_CALL(hipMemcpy(exeInfo.subExecParamGpu,
tempSubExecParam.data(),
tempSubExecParam.size() * sizeof(SubExecParam),
hipMemcpyDefault));
HIP_CALL(hipDeviceSynchronize());
}
}
// Launch kernels (warmup iterations are not counted)
double totalCpuTime = 0;
size_t numTimedIterations = 0;
std::stack<std::thread> threads;
for (int iteration = -ev.numWarmups; isSrcCorrect; iteration++)
{
if (ev.numIterations > 0 && iteration >= ev.numIterations) break;
if (ev.numIterations < 0 && totalCpuTime > -ev.numIterations) break;
// Pause before starting first timed iteration in interactive mode
if (ev.useInteractive && iteration == 0)
{
printf("Memory prepared:\n");
for (Transfer& transfer : transfers)
{
printf("Transfer %03d:\n", transfer.transferIndex);
for (int iSrc = 0; iSrc < transfer.numSrcs; ++iSrc)
printf(" SRC %0d: %p\n", iSrc, transfer.srcMem[iSrc]);
for (int iDst = 0; iDst < transfer.numDsts; ++iDst)
printf(" DST %0d: %p\n", iDst, transfer.dstMem[iDst]);
}
printf("Hit <Enter> to continue: ");
if (scanf("%*c") != 0)
{
printf("[ERROR] Unexpected input\n");
exit(1);
}
printf("\n");
}
// Start CPU timing for this iteration
auto cpuStart = std::chrono::high_resolution_clock::now();
// Execute all Transfers in parallel
for (auto& exeInfoPair : transferMap)
{
ExecutorInfo& exeInfo = exeInfoPair.second;
ExeType exeType = exeInfoPair.first.first;
int const numTransfersToRun = (exeType == EXE_GPU_GFX && ev.useSingleStream) ? 1 : exeInfo.transfers.size();
for (int i = 0; i < numTransfersToRun; ++i)
threads.push(std::thread(RunTransfer, std::ref(ev), iteration, std::ref(exeInfo), i));
}
// Wait for all threads to finish
int const numTransfers = threads.size();
for (int i = 0; i < numTransfers; i++)
{
threads.top().join();
threads.pop();
}
// Stop CPU timing for this iteration
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
if (ev.alwaysValidate)
{
for (auto transferPair : transferList)
{
Transfer* transfer = transferPair.second;
transfer->ValidateDst(ev);
}
}
if (iteration >= 0)
{
++numTimedIterations;
totalCpuTime += deltaSec;
}
}
// Pause for interactive mode
if (isSrcCorrect && ev.useInteractive)
{
printf("Transfers complete. Hit <Enter> to continue: ");
if (scanf("%*c") != 0)
{
printf("[ERROR] Unexpected input\n");
exit(1);
}
printf("\n");
}
// Validate that each transfer has transferred correctly
size_t totalBytesTransferred = 0;
int const numTransfers = transferList.size();
for (auto transferPair : transferList)
{
Transfer* transfer = transferPair.second;
transfer->ValidateDst(ev);
totalBytesTransferred += transfer->numBytesActual;
}
// Record results
TestResults testResults;
testResults.numTimedIterations = numTimedIterations;
testResults.totalBytesTransferred = totalBytesTransferred;
testResults.totalDurationMsec = totalCpuTime / (1.0 * numTimedIterations * ev.numSubIterations) * 1000;
testResults.totalBandwidthCpu = (totalBytesTransferred / 1.0E6) / testResults.totalDurationMsec;
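// Units note (comment added for illustration; not in the original source):
// bytes / 1.0E6 gives MB, and MB per millisecond equals GB/s, so e.g. 268435456 bytes
// moved in 2.0 ms reports (268435456 / 1.0E6) / 2.0 ~= 134.2 GB/s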
double maxExeDurationMsec = 0.0;
if (!isSrcCorrect) goto cleanup;
for (auto& exeInfoPair : transferMap)
{
ExecutorInfo exeInfo = exeInfoPair.second;
ExeType const exeType = exeInfoPair.first.first;
int const exeIndex = exeInfoPair.first.second;
ExeResult& exeResult = testResults.exeResults[std::make_pair(exeType, exeIndex)];
// Compute total time for non GPU executors
if (exeType != EXE_GPU_GFX || ev.useSingleStream == 0)
{
exeInfo.totalTime = 0;
for (auto const& transfer : exeInfo.transfers)
exeInfo.totalTime = std::max(exeInfo.totalTime, transfer->transferTime);
}
exeResult.totalBytes = exeInfo.totalBytes;
exeResult.durationMsec = exeInfo.totalTime / (1.0 * numTimedIterations * ev.numSubIterations);
exeResult.bandwidthGbs = (exeInfo.totalBytes / 1.0E9) / exeResult.durationMsec * 1000.0f;
exeResult.sumBandwidthGbs = 0;
maxExeDurationMsec = std::max(maxExeDurationMsec, exeResult.durationMsec);
for (auto& transfer: exeInfo.transfers)
{
exeResult.transferIdx.push_back(transfer->transferIndex);
transfer->transferTime /= (1.0 * numTimedIterations * ev.numSubIterations);
transfer->transferBandwidth = (transfer->numBytesActual / 1.0E9) / transfer->transferTime * 1000.0f;
transfer->executorBandwidth = exeResult.bandwidthGbs;
exeResult.sumBandwidthGbs += transfer->transferBandwidth;
}
}
testResults.overheadMsec = testResults.totalDurationMsec - maxExeDurationMsec;
// Release GPU memory
cleanup:
for (auto exeInfoPair : transferMap)
{
ExecutorInfo& exeInfo = exeInfoPair.second;
ExeType const exeType = exeInfoPair.first.first;
int const exeIndex = RemappedIndex(exeInfoPair.first.second, IsCpuType(exeType));
for (auto& transfer : exeInfo.transfers)
{
for (int iSrc = 0; iSrc < transfer->numSrcs; ++iSrc)
{
MemType const& srcType = transfer->srcType[iSrc];
DeallocateMemory(srcType, transfer->srcMem[iSrc], transfer->numBytesActual + ev.byteOffset);
}
for (int iDst = 0; iDst < transfer->numDsts; ++iDst)
{
MemType const& dstType = transfer->dstType[iDst];
DeallocateMemory(dstType, transfer->dstMem[iDst], transfer->numBytesActual + ev.byteOffset);
}
transfer->subExecParam.clear();
if (exeType == EXE_GPU_DMA && (transfer->exeSubIndex != -1 || ev.useHsaDma))
{
#if !defined(__NVCC__)
HSA_CHECK(hsa_signal_destroy(transfer->signal));
#endif
}
}
if (IsGpuType(exeType))
{
int const numStreams = (int)exeInfo.streams.size();
for (int i = 0; i < numStreams; ++i)
{
HIP_CALL(hipEventDestroy(exeInfo.startEvents[i]));
HIP_CALL(hipEventDestroy(exeInfo.stopEvents[i]));
HIP_CALL(hipStreamDestroy(exeInfo.streams[i]));
}
if (exeType == EXE_GPU_GFX)
{
#if !defined(__NVCC__)
DeallocateMemory(MEM_GPU, exeInfo.subExecParamGpu);
#else
DeallocateMemory(MEM_MANAGED, exeInfo.subExecParamGpu);
#endif
}
}
}
return testResults;
}
void DisplayUsage(char const* cmdName)
{
printf("TransferBench v%s\n", TB_VERSION);
printf("========================================\n");
if (numa_available() == -1)
{
printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
exit(1);
}
int numGpuDevices;
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
int const numCpuDevices = numa_num_configured_nodes();
printf("Usage: %s config <N>\n", cmdName);
printf(" config: Either:\n");
printf(" - Filename of configFile containing Transfers to execute (see example.cfg for format)\n");
printf(" - Name of preset config:\n");
printf(" a2a - GPU All-To-All benchmark\n");
printf(" - 3rd optional arg: # of SubExecs to use\n");
printf(" cmdline - Read Transfers from command line arguments (after N)\n");
printf(" healthcheck - Simple bandwidth health check (MI300 series only)\n");
printf(" p2p - Peer-to-peer benchmark tests\n");
printf(" rwrite/pcopy - Parallel writes/copies from single GPU to other GPUs\n");
printf(" - 3rd optional arg: # GPU SubExecs per Transfer\n");
printf(" - 4th optional arg: Root GPU index\n");
printf(" - 5th optional arg: Min number of other GPUs to transfer to\n");
printf(" - 6th optional arg: Max number of other GPUs to transfer to\n");
printf(" sweep/rsweep - Sweep/random sweep across possible sets of Transfers\n");
printf(" - 3rd optional arg: # GPU SubExecs per Transfer\n");
printf(" - 4th optional arg: # CPU SubExecs per Transfer\n");
printf(" scaling - GPU GFX SubExec scaling copy test\n");
printf(" - 3th optional arg: Max # of SubExecs to use\n");
printf(" - 4rd optional arg: GPU index to use as executor\n");
printf(" schmoo - Local/RemoteRead/Write/Copy between two GPUs\n");
printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
DEFAULT_BYTES_PER_TRANSFER);
printf(" If 0 is specified, a range of Ns will be benchmarked\n");
printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n");
printf("\n");
EnvVars::DisplayUsage();
}
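// Illustrative invocations assembled from the usage text above (examples only; not in the original source):
//   ./TransferBench example.cfg 64M   - run the Transfers listed in example.cfg at 64 MiB per Transfer
//   ./TransferBench a2a 32M 8         - all-to-all preset at 32 MiB per Transfer with 8 SubExecs per Transfer
//   ./TransferBench p2p               - peer-to-peer preset using the default bytes per Transfer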
int RemappedIndex(int const origIdx, bool const isCpuType)
{
static std::vector<int> remappingCpu;
static std::vector<int> remappingGpu;
// Build CPU remapping on first use
// Skip numa nodes that are not configured
if (remappingCpu.empty())
{
for (int node = 0; node <= numa_max_node(); node++)
if (numa_bitmask_isbitset(numa_get_mems_allowed(), node))
remappingCpu.push_back(node);
}
// Build remappingGpu on first use
if (remappingGpu.empty())
{
int numGpuDevices;
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
remappingGpu.resize(numGpuDevices);
int const usePcieIndexing = getenv("USE_PCIE_INDEX") ? atoi(getenv("USE_PCIE_INDEX")) : 0;
if (!usePcieIndexing)
{
// For HIP-based indexing no remappingGpu is necessary
for (int i = 0; i < numGpuDevices; ++i)
remappingGpu[i] = i;
}
else
{
// Collect PCIe address for each GPU
std::vector<std::pair<std::string, int>> mapping;
char pciBusId[20];
for (int i = 0; i < numGpuDevices; ++i)
{
HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i));
mapping.push_back(std::make_pair(pciBusId, i));
}
// Sort GPUs by PCIe address then use that as mapping
std::sort(mapping.begin(), mapping.end());
for (int i = 0; i < numGpuDevices; ++i)
remappingGpu[i] = mapping[i].second;
}
}
return isCpuType ? remappingCpu[origIdx] : remappingGpu[origIdx];
}
void DisplayTopology(bool const outputToCsv)
{
int numCpuDevices = numa_num_configured_nodes();
int numGpuDevices;
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
if (outputToCsv)
{
printf("NumCpus,%d\n", numCpuDevices);
printf("NumGpus,%d\n", numGpuDevices);
}
else
{
printf("\nDetected topology: %d configured CPU NUMA node(s) [%d total] %d GPU device(s)\n",
numa_num_configured_nodes(), numa_max_node() + 1, numGpuDevices);
}
// Print out detected CPU topology
if (outputToCsv)
{
printf("NUMA");
for (int j = 0; j < numCpuDevices; j++)
printf(",NUMA%02d", j);
printf(",# CPUs,ClosestGPUs,ActualNode\n");
}
else
{
printf(" |");
for (int j = 0; j < numCpuDevices; j++)
printf("NUMA %02d|", j);
printf(" #Cpus | Closest GPU(s)\n");
printf("------------+");
for (int j = 0; j <= numCpuDevices; j++)
printf("-------+");
printf("---------------\n");
}
for (int i = 0; i < numCpuDevices; i++)
{
int nodeI = RemappedIndex(i, true);
printf("NUMA %02d (%02d)%s", i, nodeI, outputToCsv ? "," : "|");
for (int j = 0; j < numCpuDevices; j++)
{
int nodeJ = RemappedIndex(j, true);
int numaDist = numa_distance(nodeI, nodeJ);
if (outputToCsv)
printf("%d,", numaDist);
else
printf(" %5d |", numaDist);
}
int numCpus = 0;
for (int j = 0; j < numa_num_configured_cpus(); j++)
if (numa_node_of_cpu(j) == nodeI) numCpus++;
if (outputToCsv)
printf("%d,", numCpus);
else
printf(" %5d | ", numCpus);
#if !defined(__NVCC__)
bool isFirst = true;
for (int j = 0; j < numGpuDevices; j++)
{
if (GetClosestNumaNode(RemappedIndex(j, false)) == i)
{
if (isFirst) isFirst = false;
else printf(",");
printf("%d", j);
}
}
#endif
printf("\n");
}
printf("\n");
#if defined(__NVCC__)
for (int i = 0; i < numGpuDevices; i++)
{
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, i));
printf(" GPU %02d | %s\n", i, prop.name);
}
// No further topology detection done for NVIDIA platforms
return;
#else
// Figure out DMA engines per GPU
std::vector<std::set<int>> dmaEngineIdsPerDevice(numGpuDevices);
{
std::vector<hsa_agent_t> gpuAgentList;
std::vector<hsa_agent_t> allAgentList;
hsa_amd_pointer_info_t info;
info.size = sizeof(info);
for (int deviceId = 0; deviceId < numGpuDevices; deviceId++)
{
HIP_CALL(hipSetDevice(deviceId));
int32_t* tempGpuBuffer;
HIP_CALL(hipMalloc((void**)&tempGpuBuffer, 1024));
HSA_CHECK(hsa_amd_pointer_info(tempGpuBuffer, &info, NULL, NULL, NULL));
gpuAgentList.push_back(info.agentOwner);
allAgentList.push_back(info.agentOwner);
HIP_CALL(hipFree(tempGpuBuffer));
}
for (int deviceId = 0; deviceId < numCpuDevices; deviceId++)
{
int32_t* tempCpuBuffer;
AllocateMemory(MEM_CPU, deviceId, 1024, (void**)&tempCpuBuffer);
HSA_CHECK(hsa_amd_pointer_info(tempCpuBuffer, &info, NULL, NULL, NULL));
allAgentList.push_back(info.agentOwner);
DeallocateMemory(MEM_CPU, tempCpuBuffer, 1024);
}
for (int srcDevice = 0; srcDevice < numGpuDevices; srcDevice++)
{
dmaEngineIdsPerDevice[srcDevice].clear();
for (int dstDevice = 0; dstDevice < allAgentList.size(); dstDevice++)
{
if (srcDevice == dstDevice) continue;
uint32_t engineIdMask = 0;
if (hsa_amd_memory_copy_engine_status(allAgentList[dstDevice],
gpuAgentList[srcDevice],
&engineIdMask) != HSA_STATUS_SUCCESS)
continue;
for (int engineId = 0; engineId < 32; engineId++)
{
if (engineIdMask & (1U << engineId))
dmaEngineIdsPerDevice[srcDevice].insert(engineId);
}
}
}
}
// Print out detected GPU topology
if (outputToCsv)
{
printf("GPU");
for (int j = 0; j < numGpuDevices; j++)
printf(",GPU %02d", j);
printf(",PCIe Bus ID,ClosestNUMA,DMA engines\n");
}
else
{
printf(" |");
for (int j = 0; j < numGpuDevices; j++)
{
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, j));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
printf(" %6s |", archName.c_str());
}
printf("\n");
printf(" |");
for (int j = 0; j < numGpuDevices; j++)
printf(" GPU %02d |", j);
printf(" PCIe Bus ID | #CUs | Closest NUMA | DMA engines\n");
for (int j = 0; j <= numGpuDevices; j++)
printf("--------+");
printf("--------------+------+-------------+------------\n");
}
char pciBusId[20];
for (int i = 0; i < numGpuDevices; i++)
{
int const deviceIdx = RemappedIndex(i, false);
printf("%sGPU %02d%s", outputToCsv ? "" : " ", i, outputToCsv ? "," : " |");
for (int j = 0; j < numGpuDevices; j++)
{
if (i == j)
{
if (outputToCsv)
printf("-,");
else
printf(" - |");
}
else
{
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(deviceIdx,
RemappedIndex(j, false),
&linkType, &hopCount));
printf("%s%s-%d%s",
outputToCsv ? "" : " ",
linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? " HT" :
linkType == HSA_AMD_LINK_INFO_TYPE_QPI ? " QPI" :
linkType == HSA_AMD_LINK_INFO_TYPE_PCIE ? "PCIE" :
linkType == HSA_AMD_LINK_INFO_TYPE_INFINBAND ? "INFB" :
linkType == HSA_AMD_LINK_INFO_TYPE_XGMI ? "XGMI" : "????",
hopCount, outputToCsv ? "," : " |");
}
}
HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, deviceIdx));
int numDeviceCUs = 0;
HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, deviceIdx));
if (outputToCsv)
printf("%s,%d,%d,", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx));
else
{
printf(" %11s | %4d | %-12d |", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx));
bool isFirst = true;
for (auto x : dmaEngineIdsPerDevice[deviceIdx])
{
if (isFirst) isFirst = false; else printf(",");
printf("%d", x);
}
printf("\n");
}
}
// Check that large BAR is enabled on all GPUs
for (int i = 0; i < numGpuDevices; i++) {
int const deviceIdx = RemappedIndex(i, false);
int isLargeBar = 0;
HIP_CALL(hipDeviceGetAttribute(&isLargeBar, hipDeviceAttributeIsLargeBar, deviceIdx));
if (!isLargeBar)
printf("[WARN] Large BAR is not enabled for GPU %d in BIOS. This may result in segfaults\n", i);
}
#endif
}
void ParseMemType(EnvVars const& ev, std::string const& token,
std::vector<MemType>& memTypes, std::vector<int>& memIndices)
{
char typeChar;
int offset = 0, devIndex, inc;
bool found = false;
memTypes.clear();
memIndices.clear();
while (sscanf(token.c_str() + offset, " %c %d%n", &typeChar, &devIndex, &inc) == 2)
{
offset += inc;
MemType memType = CharToMemType(typeChar);
if (IsCpuType(memType) && (devIndex < 0 || devIndex >= ev.numCpuDevices))
{
printf("[ERROR] CPU index must be between 0 and %d (instead of %d)\n", ev.numCpuDevices-1, devIndex);
exit(1);
}
if (IsGpuType(memType) && (devIndex < 0 || devIndex >= ev.numGpuDevices))
{
printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", ev.numGpuDevices-1, devIndex);
exit(1);
}
found = true;
if (memType != MEM_NULL)
{
memTypes.push_back(memType);
memIndices.push_back(devIndex);
}
}
if (!found)
{
printf("[ERROR] Unable to parse memory type token %s. Expected one of %s followed by an index\n",
token.c_str(), MemTypeStr);
exit(1);
}
}
void ParseExeType(EnvVars const& ev, std::string const& token,
ExeType &exeType, int& exeIndex, int& exeSubIndex)
{
char typeChar;
exeSubIndex = -1;
int numTokensParsed = sscanf(token.c_str(), " %c%d.%d", &typeChar, &exeIndex, &exeSubIndex);
if (numTokensParsed < 2)
{
printf("[ERROR] Unable to parse valid executor token (%s). Exepected one of %s followed by an index\n",
token.c_str(), ExeTypeStr);
exit(1);
}
exeType = CharToExeType(typeChar);
if (IsCpuType(exeType) && (exeIndex < 0 || exeIndex >= ev.numCpuDevices))
{
printf("[ERROR] CPU index must be between 0 and %d (instead of %d)\n", ev.numCpuDevices-1, exeIndex);
exit(1);
}
if (IsGpuType(exeType) && (exeIndex < 0 || exeIndex >= ev.numGpuDevices))
{
printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", ev.numGpuDevices-1, exeIndex);
exit(1);
}
if (exeType == EXE_GPU_GFX && exeSubIndex != -1)
{
int const idx = RemappedIndex(exeIndex, false);
if (ev.xccIdsPerDevice[idx].count(exeSubIndex) == 0)
{
printf("[ERROR] GPU %d does not have subIndex %d\n", exeIndex, exeSubIndex);
exit(1);
}
}
}
// Helper function to parse a list of Transfer definitions
void ParseTransfers(EnvVars const& ev, char* line, std::vector<Transfer>& transfers)
{
// Replace any round brackets or '->' with spaces,
for (int i = 1; line[i]; i++)
if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == '>' ) line[i] = ' ';
transfers.clear();
int numTransfers = 0;
std::istringstream iss(line);
iss >> numTransfers;
if (iss.fail()) return;
std::string exeMem;
std::string srcMem;
std::string dstMem;
// If numTransfers < 0, read 5-tuple (srcMem, exeMem, dstMem, #CUs, #Bytes)
// otherwise read triples (srcMem, exeMem, dstMem)
bool const advancedMode = (numTransfers < 0);
numTransfers = abs(numTransfers);
int numSubExecs;
if (!advancedMode)
{
iss >> numSubExecs;
if (numSubExecs < 0 || iss.fail())
{
printf("Parsing error: Number of blocks to use (%d) must be non-negative\n", numSubExecs);
exit(1);
}
}
size_t numBytes = 0;
for (int i = 0; i < numTransfers; i++)
{
Transfer transfer;
transfer.numBytes = 0;
transfer.numBytesActual = 0;
if (!advancedMode)
{
iss >> srcMem >> exeMem >> dstMem;
if (iss.fail())
{
printf("Parsing error: Unable to read valid Transfer %d (SRC EXE DST) triplet\n", i+1);
exit(1);
}
}
else
{
std::string numBytesToken;
iss >> srcMem >> exeMem >> dstMem >> numSubExecs >> numBytesToken;
if (iss.fail())
{
printf("Parsing error: Unable to read valid Transfer %d (SRC EXE DST #CU #Bytes) tuple\n", i+1);
exit(1);
}
if (sscanf(numBytesToken.c_str(), "%lu", &numBytes) != 1)
{
printf("Parsing error: '%s' is not a valid expression of numBytes for Transfer %d\n", numBytesToken.c_str(), i+1);
exit(1);
}
char units = numBytesToken.back();
switch (toupper(units))
{
case 'K': numBytes *= 1024; break;
case 'M': numBytes *= 1024*1024; break;
case 'G': numBytes *= 1024*1024*1024; break;
}
}
ParseMemType(ev, srcMem, transfer.srcType, transfer.srcIndex);
ParseMemType(ev, dstMem, transfer.dstType, transfer.dstIndex);
ParseExeType(ev, exeMem, transfer.exeType, transfer.exeIndex, transfer.exeSubIndex);
transfer.numSrcs = (int)transfer.srcType.size();
transfer.numDsts = (int)transfer.dstType.size();
if (transfer.numSrcs == 0 && transfer.numDsts == 0)
{
printf("[ERROR] Transfer must have at least one src or dst\n");
exit(1);
}
if (transfer.exeType == EXE_GPU_DMA && (transfer.numSrcs != 1 || transfer.numDsts != 1))
{
printf("[ERROR] GPU DMA executor can only be used for single source + single dst copies\n");
exit(1);
}
transfer.numSubExecs = numSubExecs;
transfer.numBytes = numBytes;
transfers.push_back(transfer);
}
}
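// Illustrative use of ParseTransfers() above (example only; not in the original source).
// In simple mode a config line is "<numTransfers> <numSubExecs>" followed by one
// (SRC -> EXE -> DST) triplet per Transfer; brackets and "->" are stripped to spaces
// before parsing. For example, this line describes one Transfer in which the GFX
// executor on GPU 0 reads GPU 0 memory and writes GPU 1 memory using 4 SubExecs:
//
//   char line[MAX_LINE_LEN] = "1 4 (G0 -> G0 -> G1)";
//   std::vector<Transfer> transfers;
//   ParseTransfers(ev, line, transfers);   // transfers.size() == 1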
void EnablePeerAccess(int const deviceId, int const peerDeviceId)
{
int canAccess;
HIP_CALL(hipDeviceCanAccessPeer(&canAccess, deviceId, peerDeviceId));
if (!canAccess)
{
printf("[ERROR] Unable to enable peer access from GPU devices %d to %d\n", peerDeviceId, deviceId);
exit(1);
}
HIP_CALL(hipSetDevice(deviceId));
hipError_t error = hipDeviceEnablePeerAccess(peerDeviceId, 0);
if (error != hipSuccess && error != hipErrorPeerAccessAlreadyEnabled)
{
printf("[ERROR] Unable to enable peer to peer access from %d to %d (%s)\n",
deviceId, peerDeviceId, hipGetErrorString(error));
exit(1);
}
}
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr)
{
if (numBytes == 0)
{
printf("[ERROR] Unable to allocate 0 bytes\n");
exit(1);
}
*memPtr = nullptr;
if (IsCpuType(memType))
{
// Set numa policy prior to call to hipHostMalloc
numa_set_preferred(devIndex);
// Allocate host-pinned memory (should respect NUMA mem policy)
if (memType == MEM_CPU_FINE)
{
#if defined (__NVCC__)
printf("[ERROR] Fine-grained CPU memory not supported on NVIDIA platform\n");
exit(1);
#else
HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser));
#endif
}
else if (memType == MEM_CPU)
{
#if defined (__NVCC__)
if (hipHostMalloc((void **)memPtr, numBytes, 0) != hipSuccess)
#else
if (hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent) != hipSuccess)
#endif
{
printf("[ERROR] Unable to allocate non-coherent host memory on NUMA node %d\n", devIndex);
exit(1);
}
}
else if (memType == MEM_CPU_UNPINNED)
{
*memPtr = numa_alloc_onnode(numBytes, devIndex);
}
// Check that the allocated pages are actually on the correct NUMA node
memset(*memPtr, 0, numBytes);
CheckPages((char*)*memPtr, numBytes, devIndex);
// Reset to default numa mem policy
numa_set_preferred(-1);
}
else if (IsGpuType(memType))
{
if (memType == MEM_GPU)
{
// Allocate GPU memory on appropriate device
HIP_CALL(hipSetDevice(devIndex));
HIP_CALL(hipMalloc((void**)memPtr, numBytes));
}
else if (memType == MEM_GPU_FINE)
{
#if defined (__NVCC__)
printf("[ERROR] Fine-grained GPU memory not supported on NVIDIA platform\n");
exit(1);
#else
HIP_CALL(hipSetDevice(devIndex));
int flag = hipDeviceMallocUncached;
HIP_CALL(hipExtMallocWithFlags((void**)memPtr, numBytes, flag));
#endif
}
else if (memType == MEM_MANAGED)
{
HIP_CALL(hipSetDevice(devIndex));
HIP_CALL(hipMallocManaged((void**)memPtr, numBytes));
}
HIP_CALL(hipMemset(*memPtr, 0, numBytes));
HIP_CALL(hipDeviceSynchronize());
}
else
{
printf("[ERROR] Unsupported memory type %d\n", memType);
exit(1);
}
}
void DeallocateMemory(MemType memType, void* memPtr, size_t const bytes)
{
if (memType == MEM_CPU || memType == MEM_CPU_FINE)
{
if (memPtr == nullptr)
{
printf("[ERROR] Attempting to free null CPU pointer for %lu bytes. Skipping hipHostFree\n", bytes);
return;
}
HIP_CALL(hipHostFree(memPtr));
}
else if (memType == MEM_CPU_UNPINNED)
{
if (memPtr == nullptr)
{
printf("[ERROR] Attempting to free null unpinned CPU pointer for %lu bytes. Skipping numa_free\n", bytes);
return;
}
numa_free(memPtr, bytes);
}
else if (memType == MEM_GPU || memType == MEM_GPU_FINE)
{
if (memPtr == nullptr)
{
printf("[ERROR] Attempting to free null GPU pointer for %lu bytes. Skipping hipFree\n", bytes);
return;
}
HIP_CALL(hipFree(memPtr));
}
else if (memType == MEM_MANAGED)
{
if (memPtr == nullptr)
{
printf("[ERROR] Attempting to free null managed pointer for %lu bytes. Skipping hipMFree\n", bytes);
return;
}
HIP_CALL(hipFree(memPtr));
}
}
void CheckPages(char* array, size_t numBytes, int targetId)
{
unsigned long const pageSize = getpagesize();
unsigned long const numPages = (numBytes + pageSize - 1) / pageSize;
std::vector<void *> pages(numPages);
std::vector<int> status(numPages);
pages[0] = array;
for (int i = 1; i < numPages; i++)
{
pages[i] = (char*)pages[i-1] + pageSize;
}
long const retCode = move_pages(0, numPages, pages.data(), NULL, status.data(), 0);
if (retCode)
{
printf("[ERROR] Unable to collect page info\n");
exit(1);
}
size_t mistakeCount = 0;
for (int i = 0; i < numPages; i++)
{
if (status[i] < 0)
{
printf("[ERROR] Unexpected page status %d for page %d\n", status[i], i);
exit(1);
}
if (status[i] != targetId) mistakeCount++;
}
if (mistakeCount > 0)
{
printf("[ERROR] %lu out of %lu pages for memory allocation were not on NUMA node %d\n", mistakeCount, numPages, targetId);
exit(1);
}
}
uint32_t GetId(uint32_t hwId)
{
#if defined(__NVCC__)
return hwId;
#else
// Based on instinct-mi200-cdna2-instruction-set-architecture.pdf
int const shId = (hwId >> 12) & 1;
int const cuId = (hwId >> 8) & 15;
int const seId = (hwId >> 13) & 3;
return (shId << 5) + (cuId << 2) + seId;
#endif
}
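// Worked example for GetId() above (comment added for illustration; not in the original source):
// hwId = 0x1234 -> shId = (0x1234 >> 12) & 1 = 1, cuId = (0x1234 >> 8) & 15 = 2, seId = (0x1234 >> 13) & 3 = 0,
// giving (1 << 5) + (2 << 2) + 0 = 40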
void RunTransfer(EnvVars const& ev, int const iteration,
ExecutorInfo& exeInfo, int const transferIdx)
{
Transfer* transfer = exeInfo.transfers[transferIdx];
if (transfer->exeType == EXE_GPU_GFX)
{
// Switch to executing GPU
int const exeIndex = RemappedIndex(transfer->exeIndex, false);
HIP_CALL(hipSetDevice(exeIndex));
hipStream_t& stream = exeInfo.streams[transferIdx];
hipEvent_t& startEvent = exeInfo.startEvents[transferIdx];
hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx];
// Figure out how many threadblocks to use.
// In single stream mode, all the threadblocks for this GPU are launched
// Otherwise, just launch the threadblocks associated with this single Transfer
int const numBlocksToRun = ev.useSingleStream ? exeInfo.totalSubExecs : transfer->numSubExecs;
int const numXCCs = (ev.useXccFilter ? ev.xccIdsPerDevice[exeIndex].size() : 1);
dim3 const gridSize(numXCCs, numBlocksToRun, 1);
dim3 const blockSize(ev.gfxBlockSize, 1, 1);
#if defined(__NVCC__)
HIP_CALL(hipEventRecord(startEvent, stream));
GpuKernelTable[ev.gfxBlockSize/64 - 1][ev.gfxUnroll - 1]
<<<gridSize, blockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr, ev.gfxWaveOrder, ev.numSubIterations);
HIP_CALL(hipEventRecord(stopEvent, stream));
#else
hipExtLaunchKernelGGL(GpuKernelTable[ev.gfxBlockSize/64 - 1][ev.gfxUnroll - 1],
gridSize, blockSize,
ev.sharedMemBytes, stream,
startEvent, stopEvent,
0, transfer->subExecParamGpuPtr, ev.gfxWaveOrder, ev.numSubIterations);
#endif
// Synchronize per iteration, unless in single sync mode, in which case
// synchronize during last warmup / last actual iteration
HIP_CALL(hipStreamSynchronize(stream));
if (iteration >= 0)
{
// Record GPU timing
float gpuDeltaMsec;
HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
if (ev.useSingleStream)
{
// Figure out individual timings for Transfers that were all launched together
for (Transfer* currTransfer : exeInfo.transfers)
{
long long minStartCycle = std::numeric_limits<long long>::max();
long long maxStopCycle = std::numeric_limits<long long>::min();
std::set<std::pair<int,int>> CUs;
for (auto subExecIdx : currTransfer->subExecIdx)
{
minStartCycle = std::min(minStartCycle, exeInfo.subExecParamGpu[subExecIdx].startCycle);
maxStopCycle = std::max(maxStopCycle, exeInfo.subExecParamGpu[subExecIdx].stopCycle);
if (ev.showIterations)
CUs.insert(std::make_pair(exeInfo.subExecParamGpu[subExecIdx].xccId,
GetId(exeInfo.subExecParamGpu[subExecIdx].hwId)));
}
int const wallClockRate = ev.wallClockPerDeviceMhz[exeIndex];
double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate);
currTransfer->transferTime += iterationTimeMs;
if (ev.showIterations)
{
currTransfer->perIterationTime.push_back(iterationTimeMs);
currTransfer->perIterationCUs.push_back(CUs);
}
}
exeInfo.totalTime += gpuDeltaMsec;
}
else
{
transfer->transferTime += gpuDeltaMsec;
if (ev.showIterations)
{
transfer->perIterationTime.push_back(gpuDeltaMsec);
std::set<std::pair<int,int>> CUs;
for (int i = 0; i < transfer->numSubExecs; i++)
CUs.insert(std::make_pair(transfer->subExecParamGpuPtr[i].xccId,
GetId(transfer->subExecParamGpuPtr[i].hwId)));
transfer->perIterationCUs.push_back(CUs);
}
}
}
}
else if (transfer->exeType == EXE_GPU_DMA)
{
int const exeIndex = RemappedIndex(transfer->exeIndex, false);
int subIterations = 0;
if (transfer->exeSubIndex == -1 && !ev.useHsaDma) {
// Switch to executing GPU
HIP_CALL(hipSetDevice(exeIndex));
hipStream_t& stream = exeInfo.streams[transferIdx];
hipEvent_t& startEvent = exeInfo.startEvents[transferIdx];
hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx];
HIP_CALL(hipEventRecord(startEvent, stream));
do {
HIP_CALL(hipMemcpyAsync(transfer->dstMem[0], transfer->srcMem[0],
transfer->numBytesActual, hipMemcpyDefault,
stream));
} while (++subIterations != ev.numSubIterations);
HIP_CALL(hipEventRecord(stopEvent, stream));
HIP_CALL(hipStreamSynchronize(stream));
// Record time based on HIP events
if (iteration >= 0) {
float gpuDeltaMsec;
HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
transfer->transferTime += gpuDeltaMsec;
if (ev.showIterations)
transfer->perIterationTime.push_back(gpuDeltaMsec);
}
} else {
#if defined(__NVCC__)
printf("[ERROR] CUDA does not support targeting specific DMA engines\n");
exit(1);
#else
// Use hsa_amd_memory copy (either targeted or untargeted)
auto cpuStart = std::chrono::high_resolution_clock::now();
do {
// Atomically set signal to 1
HSA_CALL(hsa_signal_store_screlease(transfer->signal, 1));
if (ev.useHsaDma) {
HSA_CALL(hsa_amd_memory_async_copy(transfer->dstMem[0], transfer->dstAgent,
transfer->srcMem[0], transfer->srcAgent,
transfer->numBytesActual, 0, NULL,
transfer->signal));
} else {
HSA_CALL(hsa_amd_memory_async_copy_on_engine(transfer->dstMem[0], transfer->dstAgent,
transfer->srcMem[0], transfer->srcAgent,
transfer->numBytesActual, 0, NULL,
transfer->signal,
transfer->sdmaEngineId, true));
}
// Wait for SDMA transfer to complete
while(hsa_signal_wait_scacquire(transfer->signal,
HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
HSA_WAIT_STATE_ACTIVE) >= 1);
} while (++subIterations < ev.numSubIterations);
if (iteration >= 0) {
// Record GPU timing
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
transfer->transferTime += deltaMsec;
if (ev.showIterations)
transfer->perIterationTime.push_back(deltaMsec);
}
#endif
}
}
else if (transfer->exeType == EXE_CPU) // CPU execution agent
{
// Force this thread and all child threads onto correct NUMA node
int const exeIndex = RemappedIndex(transfer->exeIndex, true);
if (numa_run_on_node(exeIndex))
{
printf("[ERROR] Unable to set CPU to NUMA node %d\n", exeIndex);
exit(1);
}
std::vector<std::thread> childThreads;
int subIteration = 0;
auto cpuStart = std::chrono::high_resolution_clock::now();
do {
// Launch each subExecutor in child-threads to perform memcopies
for (int i = 0; i < transfer->numSubExecs; ++i)
childThreads.push_back(std::thread(CpuReduceKernel, std::ref(transfer->subExecParam[i])));
// Wait for child-threads to finish
for (int i = 0; i < transfer->numSubExecs; ++i)
childThreads[i].join();
childThreads.clear();
} while (++subIteration != ev.numSubIterations);
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
// Record time if not a warmup iteration
if (iteration >= 0)
{
double const delta = (std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0);
transfer->transferTime += delta;
if (ev.showIterations)
transfer->perIterationTime.push_back(delta);
}
}
}
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
{
ev.DisplayP2PBenchmarkEnvVars();
char const separator = ev.outputToCsv ? ',' : ' ';
printf("Bytes Per Direction%c%lu\n", separator, N * sizeof(float));
// Collect the number of available CPUs/GPUs on this machine
int const numCpus = ev.numCpuDevices;
int const numGpus = ev.numGpuDevices;
int const numDevices = numCpus + numGpus;
// Enable peer to peer for each GPU
for (int i = 0; i < numGpus; i++)
for (int j = 0; j < numGpus; j++)
if (i != j) EnablePeerAccess(i, j);
// Perform unidirectional / bidirectional
for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++)
{
if ((ev.p2pMode == 1 && isBidirectional == 1) ||
    (ev.p2pMode == 2 && isBidirectional == 0)) continue;
printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n", isBidirectional ? "Bi" : "Uni",
ev.useRemoteRead ? "Remote" : "Local",
ev.useRemoteRead ? "Local" : "Remote",
ev.useDmaCopy ? "DMA" : "GFX");
// Print header
if (isBidirectional)
{
printf("%12s", "SRC\\DST");
}
else
{
if (ev.useRemoteRead)
printf("%12s", "SRC\\EXE+DST");
else
printf("%12s", "SRC+EXE\\DST");
}
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numCpus; i++)
{
printf("%7s %02d", "CPU", i);
if (ev.outputToCsv) printf(",");
}
if (numCpus > 0) printf(" ");
for (int i = 0; i < numGpus; i++)
{
printf("%7s %02d", "GPU", i);
if (ev.outputToCsv) printf(",");
}
printf("\n");
double avgBwSum[2][2] = {};
int avgCount[2][2] = {};
ExeType const gpuExeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
// Loop over all possible src/dst pairs
for (int src = 0; src < numDevices; src++)
{
MemType const srcType = (src < numCpus ? MEM_CPU : MEM_GPU);
int const srcIndex = (srcType == MEM_CPU ? src : src - numCpus);
MemType const srcTypeActual = ((ev.useFineGrain && srcType == MEM_CPU) ? MEM_CPU_FINE :
(ev.useFineGrain && srcType == MEM_GPU) ? MEM_GPU_FINE :
srcType);
std::vector<std::vector<double>> avgBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> minBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> maxBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> stdDev(isBidirectional + 1);
if (src == numCpus && src != 0) printf("\n");
for (int dst = 0; dst < numDevices; dst++)
{
MemType const dstType = (dst < numCpus ? MEM_CPU : MEM_GPU);
int const dstIndex = (dstType == MEM_CPU ? dst : dst - numCpus);
MemType const dstTypeActual = ((ev.useFineGrain && dstType == MEM_CPU) ? MEM_CPU_FINE :
(ev.useFineGrain && dstType == MEM_GPU) ? MEM_GPU_FINE :
dstType);
// Prepare Transfers
std::vector<Transfer> transfers(isBidirectional + 1);
// SRC -> DST
transfers[0].numBytes = N * sizeof(float);
transfers[0].srcType.push_back(srcTypeActual);
transfers[0].dstType.push_back(dstTypeActual);
transfers[0].srcIndex.push_back(srcIndex);
transfers[0].dstIndex.push_back(dstIndex);
transfers[0].numSrcs = transfers[0].numDsts = 1;
transfers[0].exeType = IsGpuType(ev.useRemoteRead ? dstType : srcType) ? gpuExeType : EXE_CPU;
transfers[0].exeIndex = (ev.useRemoteRead ? dstIndex : srcIndex);
transfers[0].exeSubIndex = -1;
transfers[0].numSubExecs = IsGpuType(transfers[0].exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs;
// DST -> SRC
if (isBidirectional)
{
transfers[1].numBytes = N * sizeof(float);
transfers[1].numSrcs = transfers[1].numDsts = 1;
transfers[1].srcType.push_back(dstTypeActual);
transfers[1].dstType.push_back(srcTypeActual);
transfers[1].srcIndex.push_back(dstIndex);
transfers[1].dstIndex.push_back(srcIndex);
transfers[1].exeType = IsGpuType(ev.useRemoteRead ? srcType : dstType) ? gpuExeType : EXE_CPU;
transfers[1].exeIndex = (ev.useRemoteRead ? srcIndex : dstIndex);
transfers[1].exeSubIndex = -1;
transfers[1].numSubExecs = IsGpuType(transfers[1].exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs;
}
bool skipTest = false;
// Skip tests that would execute on a NUMA node with no CPUs
for (int i = 0; i <= isBidirectional; i++)
{
if (transfers[i].exeType == EXE_CPU && ev.numCpusPerNuma[transfers[i].exeIndex] == 0)
{
skipTest = true;
break;
}
#if defined(__NVCC__)
// NVIDIA platform cannot access GPU memory directly from CPU executors
if (transfers[i].exeType == EXE_CPU && (IsGpuType(srcType) || IsGpuType(dstType)))
{
skipTest = true;
break;
}
#endif
}
if (isBidirectional && srcType == dstType && srcIndex == dstIndex) skipTest = true;
if (!skipTest)
{
ExecuteTransfers(ev, 0, N, transfers, false);
for (int dir = 0; dir <= isBidirectional; dir++)
{
double const avgTime = transfers[dir].transferTime;
double const avgBw = (transfers[dir].numBytesActual / 1.0E9) / avgTime * 1000.0f;
avgBandwidth[dir].push_back(avgBw);
if (!(srcType == dstType && srcIndex == dstIndex))
{
avgBwSum[srcType][dstType] += avgBw;
avgCount[srcType][dstType]++;
}
if (ev.showIterations)
{
double minTime = transfers[dir].perIterationTime[0];
double maxTime = transfers[dir].perIterationTime[0];
double varSum = 0;
for (int i = 0; i < transfers[dir].perIterationTime.size(); i++)
{
minTime = std::min(minTime, transfers[dir].perIterationTime[i]);
maxTime = std::max(maxTime, transfers[dir].perIterationTime[i]);
double const bw = (transfers[dir].numBytesActual / 1.0E9) / transfers[dir].perIterationTime[i] * 1000.0f;
double const delta = (avgBw - bw);
varSum += delta * delta;
}
double const minBw = (transfers[dir].numBytesActual / 1.0E9) / maxTime * 1000.0f;
double const maxBw = (transfers[dir].numBytesActual / 1.0E9) / minTime * 1000.0f;
double const stdev = sqrt(varSum / transfers[dir].perIterationTime.size());
minBandwidth[dir].push_back(minBw);
maxBandwidth[dir].push_back(maxBw);
stdDev[dir].push_back(stdev);
}
}
}
else
{
for (int dir = 0; dir <= isBidirectional; dir++)
{
avgBandwidth[dir].push_back(0);
minBandwidth[dir].push_back(0);
maxBandwidth[dir].push_back(0);
stdDev[dir].push_back(-1.0);
}
}
}
for (int dir = 0; dir <= isBidirectional; dir++)
{
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, dir ? "<- " : " ->");
if (ev.outputToCsv) printf(",");
for (int dst = 0; dst < numDevices; dst++)
{
if (dst == numCpus && dst != 0) printf(" ");
double const avgBw = avgBandwidth[dir][dst];
if (avgBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", avgBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
if (ev.showIterations)
{
// minBw
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "min");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++)
{
double const minBw = minBandwidth[dir][i];
if (i == numCpus && i != 0) printf(" ");
if (minBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", minBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
// maxBw
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "max");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++)
{
double const maxBw = maxBandwidth[dir][i];
if (i == numCpus && i != 0) printf(" ");
if (maxBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", maxBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
// stddev
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, " sd");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++)
{
double const sd = stdDev[dir][i];
if (i == numCpus && i != 0) printf(" ");
if (sd == -1.0)
printf("%10s", "N/A");
else
printf("%10.2f", sd);
if (ev.outputToCsv) printf(",");
}
printf("\n");
}
fflush(stdout);
}
if (isBidirectional)
{
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "<->");
if (ev.outputToCsv) printf(",");
for (int dst = 0; dst < numDevices; dst++)
{
double const sumBw = avgBandwidth[0][dst] + avgBandwidth[1][dst];
if (dst == numCpus && dst != 0) printf(" ");
if (sumBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", sumBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
if (src < numDevices - 1) printf("\n");
}
}
if (!ev.outputToCsv)
{
printf(" ");
for (int srcType : {MEM_CPU, MEM_GPU})
for (int dstType : {MEM_CPU, MEM_GPU})
printf(" %cPU->%cPU", srcType == MEM_CPU ? 'C' : 'G', dstType == MEM_CPU ? 'C' : 'G');
printf("\n");
printf("Averages (During %s):", isBidirectional ? " BiDir" : "UniDir");
for (int srcType : {MEM_CPU, MEM_GPU})
for (int dstType : {MEM_CPU, MEM_GPU})
{
if (avgCount[srcType][dstType])
printf("%10.2f", avgBwSum[srcType][dstType] / avgCount[srcType][dstType]);
else
printf("%10s", "N/A");
}
printf("\n\n");
}
}
}
void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int const maxSubExecs)
{
ev.DisplayEnvVars();
// Collect the number of available CPUs/GPUs on this machine
int const numCpus = ev.numCpuDevices;
int const numGpus = ev.numGpuDevices;
int const numDevices = numCpus + numGpus;
// Enable peer to peer for each GPU
for (int i = 0; i < numGpus; i++)
for (int j = 0; j < numGpus; j++)
if (i != j) EnablePeerAccess(i, j);
char separator = (ev.outputToCsv ? ',' : ' ');
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.numBytes = N * sizeof(float);
t.numSrcs = 1;
t.numDsts = 1;
t.exeType = EXE_GPU_GFX;
t.exeIndex = exeIndex;
t.exeSubIndex = -1;
t.srcType.resize(1, MEM_GPU);
t.dstType.resize(1, MEM_GPU);
t.srcIndex.resize(1);
t.dstIndex.resize(1);
printf("GPU-GFX Scaling benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes from GPU %d to other devices\n", t.numBytes, exeIndex);
printf("- All numbers reported as GB/sec\n\n");
printf("NumCUs");
for (int i = 0; i < numDevices; i++)
printf("%c %s%02d ", separator, i < numCpus ? "CPU" : "GPU", i < numCpus ? i : i - numCpus);
printf("\n");
std::vector<std::pair<double, int>> bestResult(numDevices);
for (int numSubExec = 1; numSubExec <= maxSubExecs; numSubExec++)
{
t.numSubExecs = numSubExec;
printf("%4d ", numSubExec);
for (int i = 0; i < numDevices; i++)
{
t.dstType[0] = i < numCpus ? MEM_CPU : MEM_GPU;
t.dstIndex[0] = i < numCpus ? i : i - numCpus;
ExecuteTransfers(ev, 0, N, transfers, false);
printf("%c%7.2f ", separator, t.transferBandwidth);
if (t.transferBandwidth > bestResult[i].first)
{
bestResult[i].first = t.transferBandwidth;
bestResult[i].second = numSubExec;
}
}
printf("\n");
}
printf(" Best ");
for (int i = 0; i < numDevices; i++)
{
printf("%c%7.2f(%3d)", separator, bestResult[i].first, bestResult[i].second);
}
printf("\n");
}
void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const numSubExecs)
{
ev.DisplayA2AEnvVars();
// Collect the number of GPU devices to use
int const numGpus = ev.numGpuDevices;
// Enable peer to peer for each GPU
for (int i = 0; i < numGpus; i++)
for (int j = 0; j < numGpus; j++)
if (i != j) EnablePeerAccess(i, j);
char separator = (ev.outputToCsv ? ',' : ' ');
Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
transfer.numSubExecs = numSubExecs;
transfer.numSrcs = ev.a2aMode == 2 ? 0 : 1;
transfer.numDsts = ev.a2aMode == 1 ? 0 : 1;
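// DMA copies require both a source and a destination, so read-only / write-only A2A modes (no dst / no src)
// use the GFX executor even when DMA copies are requested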
transfer.exeType = (ev.useDmaCopy && transfer.numSrcs == 1 && transfer.numDsts == 1) ? EXE_GPU_DMA : EXE_GPU_GFX;
transfer.exeSubIndex = -1;
transfer.srcType.resize(1, ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
transfer.dstType.resize(1, ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
transfer.srcIndex.resize(1);
transfer.dstIndex.resize(1);
std::vector<Transfer> transfers;
for (int i = 0; i < numGpus; i++)
{
transfer.srcIndex[0] = i;
for (int j = 0; j < numGpus; j++)
{
transfer.dstIndex[0] = j;
transfer.exeIndex = (ev.useRemoteRead ? j : i);
if (ev.a2aDirect)
{
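// Direct mode: only benchmark GPU pairs that are directly connected (a single hop apart)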
if (i == j) continue;
#if !defined(__NVCC__)
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(i, false),
RemappedIndex(j, false),
&linkType, &hopCount));
if (hopCount != 1) continue;
#endif
}
transfers.push_back(transfer);
}
}
printf("GPU-GFX All-To-All benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes between %s pairs of GPUs using %d CUs (%lu Transfers)\n",
numBytesPerTransfer, ev.a2aDirect ? "directly connected" : "all", numSubExecs, transfers.size());
if (transfers.size() == 0) return;
double totalBandwidthCpu = 0;
ExecuteTransfers(ev, 0, numBytesPerTransfer / sizeof(float), transfers, !ev.hideEnv, &totalBandwidthCpu);
printf("\nSummary:\n");
printf("==========================================================\n");
printf("SRC\\DST ");
for (int dst = 0; dst < numGpus; dst++)
printf("%cGPU %02d ", separator, dst);
printf(" %cSTotal %cActual\n", separator, separator);
std::map<std::pair<int, int>, int> reIndex;
for (int i = 0; i < transfers.size(); i++)
{
Transfer const& t = transfers[i];
reIndex[std::make_pair(t.srcIndex[0], t.dstIndex[0])] = i;
}
double totalBandwidthGpu = 0.0;
double minExecutorBandwidth = std::numeric_limits<double>::max();
double maxExecutorBandwidth = 0.0;
std::vector<double> colTotalBandwidth(numGpus+1, 0.0);
for (int src = 0; src < numGpus; src++)
{
double rowTotalBandwidth = 0;
double executorBandwidth = 0;
printf("GPU %02d", src);
for (int dst = 0; dst < numGpus; dst++)
{
if (reIndex.count(std::make_pair(src, dst)))
{
Transfer const& transfer = transfers[reIndex[std::make_pair(src,dst)]];
colTotalBandwidth[dst] += transfer.transferBandwidth;
rowTotalBandwidth += transfer.transferBandwidth;
totalBandwidthGpu += transfer.transferBandwidth;
executorBandwidth = std::max(executorBandwidth, transfer.executorBandwidth);
printf("%c%8.3f ", separator, transfer.transferBandwidth);
}
else
{
printf("%c%8s ", separator, "N/A");
}
}
printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator, executorBandwidth);
minExecutorBandwidth = std::min(minExecutorBandwidth, executorBandwidth);
maxExecutorBandwidth = std::max(maxExecutorBandwidth, executorBandwidth);
colTotalBandwidth[numGpus] += rowTotalBandwidth;
}
printf("\nRTotal");
for (int dst = 0; dst < numGpus; dst++)
{
printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
}
printf(" %c%8.3f %c%8.3f %c%8.3f\n", separator, colTotalBandwidth[numGpus],
separator, minExecutorBandwidth, separator, maxExecutorBandwidth);
printf("\n");
printf("Average bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu / transfers.size());
printf("Aggregate bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu);
printf("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", totalBandwidthCpu);
}
void Transfer::PrepareSubExecParams(EnvVars const& ev)
{
// Each subExecutor needs to know src/dst pointers and how many elements to transfer
// Figure out the sub-array each subExecutor works on for this Transfer
// - Partition N as evenly as possible, but try to keep subarray sizes as multiples of BLOCK_BYTES bytes,
// except the very last one, for alignment reasons
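// Illustrative example (non-single-team mode): N = 100 floats, targetMultiple = 16, 4 subExecutors => chunks of 16, 32, 32 and 20 floats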
size_t const N = this->numBytesActual / sizeof(float);
int const initOffset = ev.byteOffset / sizeof(float);
int const targetMultiple = ev.blockBytes / sizeof(float);
// In some cases, there may not be enough data for all subExecutors
int const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple, (size_t)this->numSubExecs);
this->subExecParam.clear();
this->subExecParam.resize(this->numSubExecs);
size_t assigned = 0;
for (int i = 0; i < this->numSubExecs; ++i)
{
SubExecParam& p = this->subExecParam[i];
p.numSrcs = this->numSrcs;
p.numDsts = this->numDsts;
if (ev.gfxSingleTeam && this->exeType == EXE_GPU_GFX)
{
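// Single-team mode: every subExecutor cooperates on the entire array (team size = numSubExecs, team index = i)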
p.N = N;
p.teamSize = this->numSubExecs;
p.teamIdx = i;
for (int iSrc = 0; iSrc < this->numSrcs; ++iSrc) p.src[iSrc] = this->srcMem[iSrc] + initOffset;
for (int iDst = 0; iDst < this->numDsts; ++iDst) p.dst[iDst] = this->dstMem[iDst] + initOffset;
}
else
{
int const subExecLeft = std::max(0, maxSubExecToUse - i);
size_t const leftover = N - assigned;
size_t const roundedN = (leftover + targetMultiple - 1) / targetMultiple;
p.N = subExecLeft ? std::min(leftover, ((roundedN / subExecLeft) * targetMultiple)) : 0;
p.teamSize = 1;
p.teamIdx = 0;
for (int iSrc = 0; iSrc < this->numSrcs; ++iSrc) p.src[iSrc] = this->srcMem[iSrc] + initOffset + assigned;
for (int iDst = 0; iDst < this->numDsts; ++iDst) p.dst[iDst] = this->dstMem[iDst] + initOffset + assigned;
assigned += p.N;
}
p.preferredXccId = -1;
if (ev.useXccFilter && this->exeType == EXE_GPU_GFX)
{
std::uniform_int_distribution<int> distribution(0, ev.xccIdsPerDevice[this->exeIndex].size() - 1);
// Use this Transfer's executor subIndex if set
if (this->exeSubIndex != -1)
{
p.preferredXccId = this->exeSubIndex;
}
else if (this->numDsts >= 1 && IsGpuType(this->dstType[0]))
{
p.preferredXccId = ev.prefXccTable[this->exeIndex][this->dstIndex[0]];
}
if (p.preferredXccId == -1)
{
p.preferredXccId = distribution(*ev.generator);
}
}
if (ev.enableDebug)
{
printf("Transfer %02d SE:%02d: %10lu floats: %10lu to %10lu\n",
this->transferIndex, i, p.N, assigned, assigned + p.N);
}
p.startCycle = 0;
p.stopCycle = 0;
}
this->transferTime = 0.0;
this->perIterationTime.clear();
}
void Transfer::PrepareReference(EnvVars const& ev, std::vector<float>& buffer, int bufferIdx)
{
size_t N = buffer.size();
if (bufferIdx >= 0)
{
size_t patternLen = ev.fillPattern.size();
if (patternLen > 0)
{
for (size_t i = 0; i < N; ++i)
buffer[i] = ev.fillPattern[i % patternLen];
}
else
{
for (size_t i = 0; i < N; ++i)
buffer[i] = PrepSrcValue(bufferIdx, i);
}
}
else // Destination buffer
{
if (this->numSrcs == 0)
{
// Note: each byte is set to MEMSET_CHAR, so every float reads as 0x4B4B4B4B = 13323083.0
memset(buffer.data(), MEMSET_CHAR, N * sizeof(float));
}
else
{
PrepareReference(ev, buffer, 0);
if (this->numSrcs > 1)
{
std::vector<float> temp(N);
for (int srcIdx = 1; srcIdx < this->numSrcs; ++srcIdx)
{
PrepareReference(ev, temp, srcIdx);
for (int i = 0; i < N; ++i)
{
buffer[i] += temp[i];
}
}
}
}
}
}
bool Transfer::PrepareSrc(EnvVars const& ev)
{
if (this->numSrcs == 0) return true;
size_t const N = this->numBytesActual / sizeof(float);
int const initOffset = ev.byteOffset / sizeof(float);
std::vector<float> reference(N);
for (int srcIdx = 0; srcIdx < this->numSrcs; ++srcIdx)
{
float* srcPtr = this->srcMem[srcIdx] + initOffset;
PrepareReference(ev, reference, srcIdx);
// Initialize source memory array with reference pattern
if (IsGpuType(this->srcType[srcIdx]))
{
int const deviceIdx = RemappedIndex(this->srcIndex[srcIdx], false);
HIP_CALL(hipSetDevice(deviceIdx));
if (ev.usePrepSrcKernel)
PrepSrcDataKernel<<<32, ev.gfxBlockSize>>>(srcPtr, N, srcIdx);
else
HIP_CALL(hipMemcpy(srcPtr, reference.data(), this->numBytesActual, hipMemcpyDefault));
HIP_CALL(hipDeviceSynchronize());
}
else if (IsCpuType(this->srcType[srcIdx]))
{
memcpy(srcPtr, reference.data(), this->numBytesActual);
}
// Perform check just to make sure that data has been copied properly
float* srcCheckPtr = srcPtr;
std::vector<float> srcCopy(N);
if (IsGpuType(this->srcType[srcIdx]))
{
if (!ev.validateDirect)
{
HIP_CALL(hipMemcpy(srcCopy.data(), srcPtr, this->numBytesActual, hipMemcpyDefault));
HIP_CALL(hipDeviceSynchronize());
srcCheckPtr = srcCopy.data();
}
}
for (size_t i = 0; i < N; ++i)
{
if (reference[i] != srcCheckPtr[i])
{
printf("\n[ERROR] Unexpected mismatch at index %lu of source array %d:\n", i, srcIdx);
#if !defined(__NVCC__)
float const val = this->srcMem[srcIdx][initOffset + i];
printf("[ERROR] SRC %02d value: %10.5f [%08X] Direct: %10.5f [%08X]\n",
srcIdx, srcCheckPtr[i], *(unsigned int*)&srcCheckPtr[i], val, *(unsigned int*)&val);
#else
printf("[ERROR] SRC %02d value: %10.5f [%08X]\n", srcIdx, srcCheckPtr[i], *(unsigned int*)&srcCheckPtr[i]);
#endif
printf("[ERROR] EXPECTED value: %10.5f [%08X]\n", reference[i], *(unsigned int*)&reference[i]);
printf("[ERROR] Failed Transfer details: #%d: %s -> [%c%d:%d] -> %s\n",
this->transferIndex,
this->SrcToStr().c_str(),
ExeTypeStr[this->exeType], this->exeIndex,
this->numSubExecs,
this->DstToStr().c_str());
printf("[ERROR] Possible cause is misconfigured IOMMU (AMD Instinct cards require amd_iommu=on and iommu=pt)\n");
printf("[ERROR] Please see https://community.amd.com/t5/knowledge-base/iommu-advisory-for-amd-instinct/ta-p/484601 for more details\n");
if (!ev.continueOnError)
exit(1);
return false;
}
}
}
return true;
}
void Transfer::ValidateDst(EnvVars const& ev)
{
if (this->numDsts == 0) return;
size_t const N = this->numBytesActual / sizeof(float);
int const initOffset = ev.byteOffset / sizeof(float);
std::vector<float> reference(N);
PrepareReference(ev, reference, -1);
std::vector<float> hostBuffer(N);
for (int dstIdx = 0; dstIdx < this->numDsts; ++dstIdx)
{
float* output;
if (IsCpuType(this->dstType[dstIdx]) || ev.validateDirect)
{
output = this->dstMem[dstIdx] + initOffset;
}
else
{
int const deviceIdx = RemappedIndex(this->dstIndex[dstIdx], false);
HIP_CALL(hipSetDevice(deviceIdx));
HIP_CALL(hipMemcpy(hostBuffer.data(), this->dstMem[dstIdx] + initOffset, this->numBytesActual, hipMemcpyDefault));
HIP_CALL(hipDeviceSynchronize());
output = hostBuffer.data();
}
for (size_t i = 0; i < N; ++i)
{
if (reference[i] != output[i])
{
printf("\n[ERROR] Unexpected mismatch at index %lu of destination array %d:\n", i, dstIdx);
for (int srcIdx = 0; srcIdx < this->numSrcs; ++srcIdx)
{
float srcVal;
HIP_CALL(hipMemcpy(&srcVal, this->srcMem[srcIdx] + initOffset + i, sizeof(float), hipMemcpyDefault));
#if !defined(__NVCC__)
float val = this->srcMem[srcIdx][initOffset + i];
printf("[ERROR] SRC %02dD value: %10.5f [%08X] Direct: %10.5f [%08X]\n",
srcIdx, srcVal, *(unsigned int*)&srcVal, val, *(unsigned int*)&val);
#else
printf("[ERROR] SRC %02d value: %10.5f [%08X]\n", srcIdx, srcVal, *(unsigned int*)&srcVal);
#endif
}
printf("[ERROR] EXPECTED value: %10.5f [%08X]\n", reference[i], *(unsigned int*)&reference[i]);
#if !defined(__NVCC__)
float dstVal = this->dstMem[dstIdx][initOffset + i];
printf("[ERROR] DST %02d value: %10.5f [%08X] Direct: %10.5f [%08X]\n",
dstIdx, output[i], *(unsigned int*)&output[i], dstVal, *(unsigned int*)&dstVal);
#else
printf("[ERROR] DST %02d value: %10.5f [%08X]\n", dstIdx, output[i], *(unsigned int*)&output[i]);
#endif
printf("[ERROR] Failed Transfer details: #%d: %s -> [%c%d:%d] -> %s\n",
this->transferIndex,
this->SrcToStr().c_str(),
ExeTypeStr[this->exeType], this->exeIndex,
this->numSubExecs,
this->DstToStr().c_str());
if (!ev.continueOnError)
exit(1);
else
break;
}
}
}
}
std::string Transfer::SrcToStr() const
{
if (numSrcs == 0) return "N";
std::stringstream ss;
for (int i = 0; i < numSrcs; ++i)
ss << MemTypeStr[srcType[i]] << srcIndex[i];
return ss.str();
}
std::string Transfer::DstToStr() const
{
if (numDsts == 0) return "N";
std::stringstream ss;
for (int i = 0; i < numDsts; ++i)
ss << MemTypeStr[dstType[i]] << dstIndex[i];
return ss.str();
}
void RunSchmooBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs)
{
char memType = ev.useFineGrain ? 'F' : 'G';
printf("Bytes to transfer: %lu Local GPU: %d Remote GPU: %d\n", numBytesPerTransfer, localIdx, remoteIdx);
printf(" | Local Read | Local Write | Local Copy | Remote Read | Remote Write| Remote Copy |\n");
printf(" #CUs |%c%02d->G%02d->N00|N00->G%02d->%c%02d|%c%02d->G%02d->%c%02d|%c%02d->G%02d->N00|N00->G%02d->%c%02d|%c%02d->G%02d->%c%02d|\n",
memType, localIdx, localIdx,
localIdx, memType, localIdx,
memType, localIdx, localIdx, memType, localIdx,
memType, remoteIdx, localIdx,
localIdx, memType, remoteIdx,
memType, localIdx, localIdx, memType, remoteIdx);
printf("|------|-------------|-------------|-------------|-------------|-------------|-------------|\n");
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeType = EXE_GPU_GFX;
t.exeIndex = localIdx;
t.exeSubIndex = -1;
t.numBytes = numBytesPerTransfer;
for (int numCUs = 1; numCUs <= maxSubExecs; numCUs++)
{
t.numSubExecs = numCUs;
// Local Read
t.numSrcs = 1;
t.numDsts = 0;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex[0] = localIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const localRead = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
// Local Write
t.numSrcs = 0;
t.numDsts = 1;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex[0] = localIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const localWrite = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
// Local Copy
t.numSrcs = 1;
t.numDsts = 1;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex[0] = localIdx;
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex[0] = localIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const localCopy = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
// Remote Read
t.numSrcs = 1;
t.numDsts = 0;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex[0] = remoteIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const remoteRead = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
// Remote Write
t.numSrcs = 0;
t.numDsts = 1;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex[0] = remoteIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const remoteWrite = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
// Remote Copy
t.numSrcs = 1;
t.numDsts = 1;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex[0] = localIdx;
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex[0] = remoteIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const remoteCopy = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
printf(" %3d %11.3f %11.3f %11.3f %11.3f %11.3f %11.3f \n",
numCUs, localRead, localWrite, localCopy, remoteRead, remoteWrite, remoteCopy);
}
}
void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
{
printf("Bytes to %s: %lu from GPU %d using %d CUs [Sweeping %d to %d parallel writes]\n",
ev.useRemoteRead ? "read" : "write", numBytesPerTransfer, srcIdx, numSubExecs, minGpus, maxGpus);
char sep = (ev.outputToCsv ? ',' : ' ');
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (i == srcIdx) continue;
printf(" GPU %-3d %c", i, sep);
}
printf("\n");
if (!ev.outputToCsv)
{
for (int i = 0; i < ev.numGpuDevices-1; i++)
{
printf("-------------");
}
printf("\n");
}
for (int p = minGpus; p <= maxGpus; p++)
{
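// Enumerate every subset of GPUs (excluding the source GPU) that contains exactly p devices via bitmask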
for (int bitmask = 0; bitmask < (1<<ev.numGpuDevices); bitmask++)
{
if (bitmask & (1<<srcIdx)) continue;
if (__builtin_popcount(bitmask) == p)
{
std::vector<Transfer> transfers;
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (bitmask & (1<<i))
{
Transfer t;
t.exeType = EXE_GPU_GFX;
t.exeSubIndex = -1;
t.numSubExecs = numSubExecs;
t.numBytes = numBytesPerTransfer;
if (ev.useRemoteRead)
{
t.numSrcs = 1;
t.numDsts = 0;
t.exeIndex = i;
t.srcType.resize(1);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex.resize(1);
t.srcIndex[0] = srcIdx;
}
else
{
t.numSrcs = 0;
t.numDsts = 1;
t.exeIndex = srcIdx;
t.dstType.resize(1);
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex.resize(1);
t.dstIndex[0] = i;
}
transfers.push_back(t);
}
}
ExecuteTransfers(ev, 0, 0, transfers, false);
int counter = 0;
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (bitmask & (1<<i))
printf(" %8.3f %c", transfers[counter++].transferBandwidth, sep);
else if (i != srcIdx)
printf(" %c", sep);
}
printf(" %d %d", p, numSubExecs);
for (auto i = 0; i < transfers.size(); i++)
{
printf(" (%s %c%d %s)",
transfers[i].SrcToStr().c_str(),
ExeTypeStr[transfers[i].exeType], transfers[i].exeIndex,
transfers[i].DstToStr().c_str());
}
printf("\n");
}
}
}
}
void RunParallelCopyBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
{
if (ev.useDmaCopy)
printf("Bytes to copy: %lu from GPU %d using DMA [Sweeping %d to %d parallel writes]\n",
numBytesPerTransfer, srcIdx, minGpus, maxGpus);
else
printf("Bytes to copy: %lu from GPU %d using GFX (%d CUs) [Sweeping %d to %d parallel writes]\n",
numBytesPerTransfer, srcIdx, numSubExecs, minGpus, maxGpus);
char sep = (ev.outputToCsv ? ',' : ' ');
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (i == srcIdx) continue;
printf(" GPU %-3d %c", i, sep);
}
printf("\n");
if (!ev.outputToCsv)
{
for (int i = 0; i < ev.numGpuDevices-1; i++)
{
printf("-------------");
}
printf("\n");
}
for (int p = minGpus; p <= maxGpus; p++)
{
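// As above, enumerate every subset of destination GPUs (excluding the source GPU) with exactly p devices via bitmask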
for (int bitmask = 0; bitmask < (1<<ev.numGpuDevices); bitmask++)
{
if (bitmask & (1<<srcIdx)) continue;
if (__builtin_popcount(bitmask) == p)
{
std::vector<Transfer> transfers;
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (bitmask & (1<<i))
{
Transfer t;
t.exeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
t.exeSubIndex = -1;
t.numSubExecs = ev.useDmaCopy ? 1 : numSubExecs;
t.numBytes = numBytesPerTransfer;
t.numSrcs = 1;
t.numDsts = 1;
t.exeIndex = srcIdx;
t.srcType.resize(1);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex.resize(1);
t.srcIndex[0] = srcIdx;
t.dstType.resize(1);
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex.resize(1);
t.dstIndex[0] = i;
transfers.push_back(t);
}
}
ExecuteTransfers(ev, 0, 0, transfers, false);
int counter = 0;
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (bitmask & (1<<i))
printf(" %8.3f %c", transfers[counter++].transferBandwidth, sep);
else if (i != srcIdx)
printf(" %c", sep);
}
printf(" %d %d", p, numSubExecs);
for (auto i = 0; i < transfers.size(); i++)
{
printf(" (%s %c%d %s)",
transfers[i].SrcToStr().c_str(),
ExeTypeStr[transfers[i].exeType], transfers[i].exeIndex,
transfers[i].DstToStr().c_str());
}
printf("\n");
}
}
}
}
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExecs, int const numCpuSubExecs, bool const isRandom)
{
ev.DisplaySweepEnvVars();
// Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets)
std::vector<std::pair<ExeType, int>> exeList;
for (auto exe : ev.sweepExe)
{
ExeType const exeType = CharToExeType(exe);
if (IsGpuType(exeType))
{
for (int exeIndex = 0; exeIndex < ev.numGpuDevices; ++exeIndex)
exeList.push_back(std::make_pair(exeType, exeIndex));
}
else if (IsCpuType(exeType))
{
for (int exeIndex = 0; exeIndex < ev.numCpuDevices; ++exeIndex)
{
// Skip NUMA nodes that have no CPUs (e.g. CXL)
if (ev.numCpusPerNuma[exeIndex] == 0) continue;
exeList.push_back(std::make_pair(exeType, exeIndex));
}
}
}
int numExes = exeList.size();
std::vector<std::pair<MemType, int>> srcList;
for (auto src : ev.sweepSrc)
{
MemType const srcType = CharToMemType(src);
int const numDevices = (srcType == MEM_NULL) ? 1 : IsGpuType(srcType) ? ev.numGpuDevices : ev.numCpuDevices;
for (int srcIndex = 0; srcIndex < numDevices; ++srcIndex)
srcList.push_back(std::make_pair(srcType, srcIndex));
}
int numSrcs = srcList.size();
std::vector<std::pair<MemType, int>> dstList;
for (auto dst : ev.sweepDst)
{
MemType const dstType = CharToMemType(dst);
int const numDevices = (dstType == MEM_NULL) ? 1 : IsGpuType(dstType) ? ev.numGpuDevices : ev.numCpuDevices;
for (int dstIndex = 0; dstIndex < numDevices; ++dstIndex)
dstList.push_back(std::make_pair(dstType, dstIndex));
}
int numDsts = dstList.size();
// Build array of possibilities, respecting any additional restrictions (e.g. XGMI hop count)
struct TransferInfo
{
MemType srcType; int srcIndex;
ExeType exeType; int exeIndex;
MemType dstType; int dstIndex;
};
// If the XGMI minimum is non-zero, or the XGMI maximum is specified and non-zero, then both links must be XGMI
bool const useXgmiOnly = (ev.sweepXgmiMin > 0 || ev.sweepXgmiMax > 0);
std::vector<TransferInfo> possibleTransfers;
TransferInfo tinfo;
for (int i = 0; i < numExes; ++i)
{
// Skip CPU executors if XGMI link must be used
if (useXgmiOnly && !IsGpuType(exeList[i].first)) continue;
tinfo.exeType = exeList[i].first;
tinfo.exeIndex = exeList[i].second;
bool isXgmiSrc = false;
int numHopsSrc = 0;
for (int j = 0; j < numSrcs; ++j)
{
if (IsGpuType(exeList[i].first) && IsGpuType(srcList[j].first))
{
if (exeList[i].second != srcList[j].second)
{
#if defined(__NVCC__)
isXgmiSrc = false;
#else
uint32_t exeToSrcLinkType, exeToSrcHopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false),
RemappedIndex(srcList[j].second, false),
&exeToSrcLinkType,
&exeToSrcHopCount));
isXgmiSrc = (exeToSrcLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
if (isXgmiSrc) numHopsSrc = exeToSrcHopCount;
#endif
}
else
{
isXgmiSrc = true;
numHopsSrc = 0;
}
// Skip this SRC if it is not XGMI but only XGMI links may be used
if (useXgmiOnly && !isXgmiSrc) continue;
// Skip this SRC if XGMI distance is already past limit
if (ev.sweepXgmiMax >= 0 && isXgmiSrc && numHopsSrc > ev.sweepXgmiMax) continue;
}
else if (srcList[j].first != MEM_NULL && useXgmiOnly) continue;
tinfo.srcType = srcList[j].first;
tinfo.srcIndex = srcList[j].second;
bool isXgmiDst = false;
int numHopsDst = 0;
for (int k = 0; k < numDsts; ++k)
{
if (IsGpuType(exeList[i].first) && IsGpuType(dstList[k].first))
{
if (exeList[i].second != dstList[k].second)
{
#if defined(__NVCC__)
isXgmiDst = false;
#else
uint32_t exeToDstLinkType, exeToDstHopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false),
RemappedIndex(dstList[k].second, false),
&exeToDstLinkType,
&exeToDstHopCount));
isXgmiDst = (exeToDstLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
if (isXgmiDst) numHopsDst = exeToDstHopCount;
#endif
}
else
{
isXgmiDst = true;
numHopsDst = 0;
}
}
// Skip this DST if it is not XGMI but only XGMI links may be used
if (dstList[k].first != MEM_NULL && useXgmiOnly && !isXgmiDst) continue;
// Skip this DST if total XGMI distance (SRC + DST) is less than min limit
if (ev.sweepXgmiMin > 0 && (numHopsSrc + numHopsDst < ev.sweepXgmiMin)) continue;
// Skip this DST if total XGMI distance (SRC + DST) is greater than max limit
if (ev.sweepXgmiMax >= 0 && (numHopsSrc + numHopsDst) > ev.sweepXgmiMax) continue;
#if defined(__NVCC__)
// Skip CPU executors on GPU memory on NVIDIA platform
if (IsCpuType(exeList[i].first) && (IsGpuType(srcList[j].first) || IsGpuType(dstList[k].first)))
continue;
#endif
tinfo.dstType = dstList[k].first;
tinfo.dstIndex = dstList[k].second;
// Skip if there is neither a src nor a dst
if (tinfo.srcType == MEM_NULL && tinfo.dstType == MEM_NULL) continue;
possibleTransfers.push_back(tinfo);
}
}
}
int const numPossible = (int)possibleTransfers.size();
int maxParallelTransfers = (ev.sweepMax == 0 ? numPossible : ev.sweepMax);
if (ev.sweepMin > numPossible)
{
printf("No valid test configurations exist\n");
return;
}
if (ev.outputToCsv)
{
printf("\nTest#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),"
"ExeToSrcLinkType,ExeToDstLinkType,SrcAddr,DstAddr\n");
}
int numTestsRun = 0;
int M = ev.sweepMin;
std::uniform_int_distribution<int> randSize(1, numBytesPerTransfer / sizeof(float));
std::uniform_int_distribution<int> distribution(ev.sweepMin, maxParallelTransfers);
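// randSize picks a random Transfer size (in floats) when sweepRandBytes is set;
// distribution picks how many parallel Transfers to run per random Test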
// Log sweep to configuration file
FILE *fp = fopen("lastSweep.cfg", "w");
if (!fp)
{
printf("[ERROR] Unable to open lastSweep.cfg. Check permissions\n");
exit(1);
}
// Create bitmask of numPossible triplets, of which M will be chosen
std::string bitmask(M, 1); bitmask.resize(numPossible, 0);
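// The bitmask starts as M ones followed by zeros; std::prev_permutation (below) then visits every
// arrangement of the mask, i.e. every combination of M Transfers, exactly once before M is incremented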
auto cpuStart = std::chrono::high_resolution_clock::now();
while (1)
{
if (isRandom)
{
// Pick random number of simultaneous transfers to execute
// NOTE: This currently skews the distribution because some transfer counts have more possible combinations than others
M = distribution(*ev.generator);
// Generate a random bitmask
for (int i = 0; i < numPossible; i++)
bitmask[i] = (i < M) ? 1 : 0;
std::shuffle(bitmask.begin(), bitmask.end(), *ev.generator);
}
// Convert bitmask to list of Transfers
std::vector<Transfer> transfers;
for (int value = 0; value < numPossible; ++value)
{
if (bitmask[value])
{
// Convert integer value to (SRC->EXE->DST) triplet
Transfer transfer;
transfer.numSrcs = (possibleTransfers[value].srcType == MEM_NULL ? 0 : 1);
transfer.numDsts = (possibleTransfers[value].dstType == MEM_NULL ? 0 : 1);
transfer.srcType = {possibleTransfers[value].srcType};
transfer.srcIndex = {possibleTransfers[value].srcIndex};
transfer.exeType = possibleTransfers[value].exeType;
transfer.exeIndex = possibleTransfers[value].exeIndex;
transfer.exeSubIndex = -1;
transfer.dstType = {possibleTransfers[value].dstType};
transfer.dstIndex = {possibleTransfers[value].dstIndex};
transfer.numSubExecs = IsGpuType(transfer.exeType) ? numGpuSubExecs : numCpuSubExecs;
transfer.numBytes = ev.sweepRandBytes ? randSize(*ev.generator) * sizeof(float) : 0;
transfers.push_back(transfer);
}
}
LogTransfers(fp, ++numTestsRun, transfers);
ExecuteTransfers(ev, numTestsRun, numBytesPerTransfer / sizeof(float), transfers);
// Check for test limit
if (numTestsRun == ev.sweepTestLimit)
{
printf("Test limit reached\n");
break;
}
// Check for time limit
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double totalCpuTime = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
if (ev.sweepTimeLimit && totalCpuTime > ev.sweepTimeLimit)
{
printf("Time limit exceeded\n");
break;
}
// Increment bitmask if not random sweep
if (!isRandom && !std::prev_permutation(bitmask.begin(), bitmask.end()))
{
M++;
// Check for completion
if (M > maxParallelTransfers)
{
printf("Sweep complete\n");
break;
}
for (int i = 0; i < numPossible; i++)
bitmask[i] = (i < M) ? 1 : 0;
}
}
fclose(fp);
}
void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers)
{
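// Each Test is logged as a single config-file line: a negative Transfer count followed by (SRC->EXE->DST numSubExecs numBytes) tuples,
// e.g. a two-Transfer GPU test might be logged as: -2 (G0->G0->G1 4 67108864) (G1->G1->G0 4 67108864)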
fprintf(fp, "# Test %d\n", testNum);
fprintf(fp, "%d", -1 * (int)transfers.size());
for (auto const& transfer : transfers)
{
fprintf(fp, " (%c%d->%c%d->%c%d %d %lu)",
MemTypeStr[transfer.srcType[0]], transfer.srcIndex[0],
ExeTypeStr[transfer.exeType], transfer.exeIndex,
MemTypeStr[transfer.dstType[0]], transfer.dstIndex[0],
transfer.numSubExecs,
transfer.numBytes);
}
fprintf(fp, "\n");
fflush(fp);
}
std::string PtrVectorToStr(std::vector<float*> const& strVector, int const initOffset)
{
std::stringstream ss;
for (int i = 0; i < strVector.size(); ++i)
{
if (i) ss << " ";
ss << (strVector[i] + initOffset);
}
return ss.str();
}
void ReportResults(EnvVars const& ev, std::vector<Transfer> const& transfers, TestResults const results)
{
char sep = ev.outputToCsv ? ',' : '|';
size_t numTimedIterations = results.numTimedIterations;
// Loop over each executor
for (auto exeInfoPair : results.exeResults) {
ExeResult const& exeResult = exeInfoPair.second;
ExeType exeType = exeInfoPair.first.first;
int exeIndex = exeInfoPair.first.second;
printf(" Executor: %3s %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %-7.3f GB/s (sum)\n",
ExeTypeName[exeType], exeIndex, sep, exeResult.bandwidthGbs, sep,
exeResult.durationMsec, sep, exeResult.totalBytes, sep, exeResult.sumBandwidthGbs);
for (int idx : exeResult.transferIdx) {
Transfer const& t = transfers[idx];
char exeSubIndexStr[32] = "";
if (ev.useXccFilter || t.exeType == EXE_GPU_DMA) {
if (t.exeSubIndex == -1)
sprintf(exeSubIndexStr, ".*");
else
sprintf(exeSubIndexStr, ".%d", t.exeSubIndex);
}
printf(" Transfer %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %s -> %s%02d%s:%03d -> %s\n",
t.transferIndex, sep,
t.transferBandwidth, sep,
t.transferTime, sep,
t.numBytesActual, sep,
t.SrcToStr().c_str(),
ExeTypeName[t.exeType], t.exeIndex,
exeSubIndexStr,
t.numSubExecs,
t.DstToStr().c_str());
// Show per-iteration timing information
if (ev.showIterations) {
std::set<std::pair<double, int>> times;
double stdDevTime = 0;
double stdDevBw = 0;
for (int i = 0; i < numTimedIterations; i++) {
times.insert(std::make_pair(t.perIterationTime[i], i+1));
double const varTime = fabs(t.transferTime - t.perIterationTime[i]);
stdDevTime += varTime * varTime;
double iterBandwidthGbs = (t.numBytesActual / 1.0E9) / t.perIterationTime[i] * 1000.0f;
double const varBw = fabs(iterBandwidthGbs - t.transferBandwidth);
stdDevBw += varBw * varBw;
}
stdDevTime = sqrt(stdDevTime / numTimedIterations);
stdDevBw = sqrt(stdDevBw / numTimedIterations);
for (auto time : times) {
double iterDurationMsec = time.first;
double iterBandwidthGbs = (t.numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d %c %7.3f GB/s %c %8.3f ms %c", time.second, sep, iterBandwidthGbs, sep, iterDurationMsec, sep);
std::set<int> usedXccs;
if (time.second - 1 < t.perIterationCUs.size()) {
printf(" CUs:");
for (auto x : t.perIterationCUs[time.second - 1]) {
printf(" %02d:%02d", x.first, x.second);
usedXccs.insert(x.first);
}
}
printf(" XCCs:");
for (auto x : usedXccs)
printf(" %02d", x);
printf("\n");
}
printf(" StandardDev %c %7.3f GB/s %c %8.3f ms %c\n", sep, stdDevBw, sep, stdDevTime, sep);
}
}
}
printf(" Aggregate (CPU) %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c Overhead: %.3f ms\n", sep,
results.totalBandwidthCpu, sep, results.totalDurationMsec, sep, results.totalBytesTransferred, sep, results.overheadMsec);
}
void RunHealthCheck(EnvVars ev)
{
// Check for supported platforms
#if defined(__NVCC__)
printf("[WARN] healthcheck preset not supported on NVIDIA hardware\n");
return;
#else
bool hasFail = false;
// Force use of single stream
ev.useSingleStream = 1;
for (int gpuId = 0; gpuId < ev.numGpuDevices; gpuId++) {
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, gpuId));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
if (!(archName == "gfx940" || archName == "gfx941" || archName == "gfx942"))
{
printf("[WARN] healthcheck preset is currently only supported on MI300 series hardware\n");
exit(1);
}
}
// Pass limits: defaults are 95% of the nominal 48 (unidirectional), 96 (bidirectional), and 45 (all-to-all) GB/s targets,
// overridable via LIMIT_UDIR / LIMIT_BDIR / LIMIT_A2A
double udirLimit = getenv("LIMIT_UDIR") ? atof(getenv("LIMIT_UDIR")) : (int)(48 * 0.95);
double bdirLimit = getenv("LIMIT_BDIR") ? atof(getenv("LIMIT_BDIR")) : (int)(96 * 0.95);
double a2aLimit = getenv("LIMIT_A2A") ? atof(getenv("LIMIT_A2A")) : (int)(45 * 0.95);
// Run CPU to GPU
// Run unidirectional read from CPU to GPU
printf("Testing unidirectional reads from CPU ");
{
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < ev.numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeType = EXE_GPU_GFX;
t.exeIndex = gpuId;
t.numBytes = 64*1024*1024;
t.numBytesActual = 64*1024*1024;
t.numSrcs = 1;
t.srcType.push_back(MEM_CPU);
t.srcIndex.push_back(GetClosestNumaNode(gpuId));
t.numDsts = 0;
t.dstType.clear();
t.dstIndex.clear();
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t.numSubExecs = cu;
TestResults testResults = ExecuteTransfersImpl(ev, transfers);
bestResult = std::max(bestResult, t.transferBandwidth);
if (t.transferBandwidth >= udirLimit) {
passed = true;
break;
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
}
}
}
// Run unidirectional write from GPU to CPU
printf("Testing unidirectional writes to CPU ");
{
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < ev.numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeType = EXE_GPU_GFX;
t.exeIndex = gpuId;
t.numBytes = 64*1024*1024;
t.numBytesActual = 64*1024*1024;
t.numDsts = 1;
t.dstType.push_back(MEM_CPU);
t.dstIndex.push_back(GetClosestNumaNode(gpuId));
t.numSrcs = 0;
t.srcType.clear();
t.srcIndex.clear();
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t.numSubExecs = cu;
TestResults testResults = ExecuteTransfersImpl(ev, transfers);
bestResult = std::max(bestResult, t.transferBandwidth);
if (t.transferBandwidth >= udirLimit) {
passed = true;
break;
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
}
}
}
// Run bidirectional tests
printf("Testing bidirectional reads + writes ");
{
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < ev.numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
std::vector<Transfer> transfers(2);
Transfer& t0 = transfers[0];
Transfer& t1 = transfers[1];
t0.exeType = EXE_GPU_GFX;
t0.exeIndex = gpuId;
t0.numBytes = 64*1024*1024;
t0.numBytesActual = 64*1024*1024;
t0.numSrcs = 1;
t0.srcType.push_back(MEM_CPU);
t0.srcIndex.push_back(GetClosestNumaNode(gpuId));
t0.numDsts = 0;
t0.dstType.clear();
t0.dstIndex.clear();
t1.exeType = EXE_GPU_GFX;
t1.exeIndex = gpuId;
t1.numBytes = 64*1024*1024;
t1.numBytesActual = 64*1024*1024;
t1.numDsts = 1;
t1.dstType.push_back(MEM_CPU);
t1.dstIndex.push_back(GetClosestNumaNode(gpuId));
t1.numSrcs = 0;
t1.srcType.clear();
t1.srcIndex.clear();
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t0.numSubExecs = cu;
t1.numSubExecs = cu;
TestResults testResults = ExecuteTransfersImpl(ev, transfers);
double sum = t0.transferBandwidth + t1.transferBandwidth;
bestResult = std::max(bestResult, sum);
if (sum >= bdirLimit) {
passed = true;
break;
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, bdirLimit);
}
}
}
// Run XGMI tests:
printf("Testing all-to-all XGMI copies "); fflush(stdout);
{
ev.gfxUnroll = 2;
std::vector<Transfer> transfers;
for (int i = 0; i < ev.numGpuDevices; i++) {
for (int j = 0; j < ev.numGpuDevices; j++) {
if (i == j) continue;
Transfer t;
t.exeType = EXE_GPU_GFX;
t.exeIndex = i;
t.numBytes = t.numBytesActual = 64*1024*1024;
t.numSrcs = 1;
t.numDsts = 1;
t.numSubExecs = 8;
t.srcType.push_back(MEM_GPU_FINE);
t.dstType.push_back(MEM_GPU_FINE);
t.srcIndex.push_back(i);
t.dstIndex.push_back(j);
transfers.push_back(t);
}
}
TestResults testResults = ExecuteTransfersImpl(ev, transfers);
std::vector<std::pair<std::pair<int,int>, double>> fails;
int transferIdx = 0;
for (int i = 0; i < ev.numGpuDevices; i++) {
printf("."); fflush(stdout);
for (int j = 0; j < ev.numGpuDevices; j++) {
if (i == j) continue;
Transfer const& t = transfers[transferIdx];
if (t.transferBandwidth < a2aLimit) {
fails.push_back(std::make_pair(std::make_pair(i,j), t.transferBandwidth));
}
transferIdx++;
}
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d to GPU %02d: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first.first, p.first.second, p.second, a2aLimit);
}
}
}
exit(hasFail ? 1 : 0);
#endif
}
/*
Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "Client.hpp"
#include "Presets.hpp"
#include "Topology.hpp"
#include <fstream>
int main(int argc, char **argv) {
// Collect environment variables
EnvVars ev;
// Display usage instructions and detected topology
if (argc <= 1) {
if (!ev.outputToCsv) {
DisplayUsage(argv[0]);
DisplayPresets();
}
DisplayTopology(ev.outputToCsv);
exit(0);
}
// Determine number of bytes to run per Transfer
size_t numBytesPerTransfer = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_TRANSFER;
if (argc > 2) {
// Adjust bytes if unit specified
char units = argv[2][strlen(argv[2])-1];
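// Intentional fall-through: 'G' multiplies by 1024 three times, 'M' twice, 'K' once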
switch (units) {
case 'G': case 'g': numBytesPerTransfer *= 1024;
case 'M': case 'm': numBytesPerTransfer *= 1024;
case 'K': case 'k': numBytesPerTransfer *= 1024;
}
}
if (numBytesPerTransfer % 4) {
printf("[ERROR] numBytesPerTransfer (%lu) must be a multiple of 4\n", numBytesPerTransfer);
exit(1);
}
// Run preset benchmark if requested
if (RunPreset(ev, numBytesPerTransfer, argc, argv)) exit(0);
// Read input from command line or configuration file
std::vector<std::string> lines;
{
std::string line;
if (!strcmp(argv[1], "cmdline")) {
for (int i = 3; i < argc; i++)
line += std::string(argv[i]) + " ";
lines.push_back(line);
} else {
std::ifstream cfgFile(argv[1]);
if (!cfgFile.is_open()) {
printf("[ERROR] Unable to open transfer configuration file: [%s]\n", argv[1]);
exit(1);
}
while (std::getline(cfgFile, line))
lines.push_back(line);
cfgFile.close();
}
}
// Print environment variables and CSV header
ev.DisplayEnvVars();
if (ev.outputToCsv)
printf("Test#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),SrcAddr,DstAddr\n");
TransferBench::ConfigOptions cfgOptions = ev.ToConfigOptions();
TransferBench::TestResults results;
std::vector<ErrResult> errors;
// Process each line as a Test
int testNum = 0;
for (std::string const &line : lines) {
// Check if line is a comment to be echoed to output (starts with ##)
if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') printf("%s\n", line.c_str());
// Parse set of parallel Transfers to execute
std::vector<Transfer> transfers;
CheckForError(TransferBench::ParseTransfers(line, transfers));
if (transfers.empty()) continue;
// Check for Transfers using a variable number of subexecutors
int numVariableTransfers = 0;
int maxVarCount = 0;
{
std::map<ExeDevice, int> varTransferCount;
for (auto const& t : transfers) {
if (t.numSubExecs == 0) {
if (t.exeDevice.exeType != EXE_GPU_GFX) {
printf("[ERROR] Variable number of subexecutors is only supported on GFX executors\n");
exit(1);
}
numVariableTransfers++;
varTransferCount[t.exeDevice]++;
maxVarCount = max(maxVarCount, varTransferCount[t.exeDevice]);
}
}
if (numVariableTransfers > 0 && numVariableTransfers != transfers.size()) {
printf("[ERROR] All or none of the Transfers in the Test must use variable number of Subexecutors\n");
exit(1);
}
}
// Run the specified number of bytes, otherwise sweep sizes from 1 KiB to 512 MiB in powers of two
for (size_t bytes = (1<<10); bytes <= (1<<29); bytes *= 2) {
size_t deltaBytes = std::max(1UL, bytes / ev.samplingFactor);
size_t currBytes = (numBytesPerTransfer == 0) ? bytes : numBytesPerTransfer;
do {
for (auto& t : transfers)
t.numBytes = currBytes;
if (maxVarCount == 0) {
if (TransferBench::RunTransfers(cfgOptions, transfers, results)) {
PrintResults(ev, ++testNum, transfers, results);
}
PrintErrors(results.errResults);
} else {
// Variable subexecutors - Determine how many subexecutors to sweep up to
int maxNumVarSubExec = ev.maxNumVarSubExec;
if (maxNumVarSubExec == 0) {
maxNumVarSubExec = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0}) / maxVarCount;
}
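// Sweep the variable Transfers over the allowed subexecutor counts and keep the best-performing configuration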
TransferBench::TestResults bestResults;
std::vector<Transfer> bestTransfers;
for (int numSubExecs = ev.minNumVarSubExec; numSubExecs <= maxNumVarSubExec; numSubExecs++) {
std::vector<Transfer> tempTransfers = transfers;
for (auto& t : tempTransfers) {
if (t.numSubExecs == 0) t.numSubExecs = numSubExecs;
}
TransferBench::TestResults tempResults;
if (!TransferBench::RunTransfers(cfgOptions, tempTransfers, tempResults)) {
PrintErrors(tempResults.errResults);
} else {
if (tempResults.avgTotalBandwidthGbPerSec > bestResults.avgTotalBandwidthGbPerSec) {
bestResults = tempResults;
bestTransfers = tempTransfers;
}
}
}
PrintResults(ev, ++testNum, bestTransfers, bestResults);
PrintErrors(bestResults.errResults);
}
if (numBytesPerTransfer != 0) break;
currBytes += deltaBytes;
} while (currBytes < bytes * 2);
if (numBytesPerTransfer != 0) break;
}
}
}
void DisplayUsage(char const* cmdName)
{
printf("TransferBench Client v%s (Backend v%s)\n", CLIENT_VERSION, TransferBench::VERSION);
printf("========================================\n");
if (numa_available() == -1)
{
printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
exit(1);
}
printf("Usage: %s config <N>\n", cmdName);
printf(" config: Either:\n");
printf(" - Filename of configFile containing Transfers to execute (see example.cfg for format)\n");
printf(" - Name of preset config:\n");
printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
DEFAULT_BYTES_PER_TRANSFER);
printf(" If 0 is specified, a range of Ns will be benchmarked\n");
printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n");
printf("\n");
EnvVars::DisplayUsage();
}
std::string MemDevicesToStr(std::vector<MemDevice> const& memDevices) {
if (memDevices.empty()) return "N";
std::stringstream ss;
for (auto const& m : memDevices)
ss << TransferBench::MemTypeStr[m.memType] << m.memIndex;
return ss.str();
}
void PrintResults(EnvVars const& ev, int const testNum,
std::vector<Transfer> const& transfers,
TransferBench::TestResults const& results)
{
char sep = ev.outputToCsv ? ',' : '|';
size_t numTimedIterations = results.numTimedIterations;
if (!ev.outputToCsv) printf("Test %d:\n", testNum);
// Loop over each executor
for (auto exeInfoPair : results.exeResults) {
ExeDevice const& exeDevice = exeInfoPair.first;
ExeResult const& exeResult = exeInfoPair.second;
ExeType const exeType = exeDevice.exeType;
int32_t const exeIndex = exeDevice.exeIndex;
printf(" Executor: %3s %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %-7.3f GB/s (sum)\n",
ExeTypeName[exeType], exeIndex, sep, exeResult.avgBandwidthGbPerSec, sep,
exeResult.avgDurationMsec, sep, exeResult.numBytes, sep, exeResult.sumBandwidthGbPerSec);
// Loop over each Transfer assigned to this executor
for (int idx : exeResult.transferIdx) {
Transfer const& t = transfers[idx];
TransferResult const& r = results.tfrResults[idx];
char exeSubIndexStr[32] = "";
if (t.exeSubIndex != -1)
sprintf(exeSubIndexStr, ".%d", t.exeSubIndex);
printf(" Transfer %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %s -> %s%02d%s:%03d -> %s\n",
idx, sep,
r.avgBandwidthGbPerSec, sep,
r.avgDurationMsec, sep,
r.numBytes, sep,
MemDevicesToStr(t.srcs).c_str(), ExeTypeName[exeType], exeIndex,
exeSubIndexStr, t.numSubExecs, MemDevicesToStr(t.dsts).c_str());
// Show per-iteration timing information
if (ev.showIterations) {
// Check that per-iteration information exists
if (r.perIterMsec.size() != numTimedIterations) {
printf("[ERROR] Per iteration timing data unavailable: Expected %lu data points, but have %lu\n",
numTimedIterations, r.perIterMsec.size());
exit(1);
}
// Compute standard deviation and track iterations by speed
std::set<std::pair<double, int>> times;
double stdDevTime = 0;
double stdDevBw = 0;
for (int i = 0; i < numTimedIterations; i++) {
times.insert(std::make_pair(r.perIterMsec[i], i+1));
double const varTime = fabs(r.avgDurationMsec - r.perIterMsec[i]);
stdDevTime += varTime * varTime;
double iterBandwidthGbs = (t.numBytes / 1.0E9) / r.perIterMsec[i] * 1000.0f;
double const varBw = fabs(iterBandwidthGbs - r.avgBandwidthGbPerSec);
stdDevBw += varBw * varBw;
}
stdDevTime = sqrt(stdDevTime / numTimedIterations);
stdDevBw = sqrt(stdDevBw / numTimedIterations);
// Loop over iterations (fastest to slowest)
for (auto& time : times) {
double iterDurationMsec = time.first;
double iterBandwidthGbs = (t.numBytes / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d %c %7.3f GB/s %c %8.3f ms %c", time.second, sep, iterBandwidthGbs, sep, iterDurationMsec, sep);
std::set<int> usedXccs;
if (time.second - 1 < r.perIterCUs.size()) {
printf(" CUs:");
for (auto x : r.perIterCUs[time.second - 1]) {
printf(" %02d:%02d", x.first, x.second);
usedXccs.insert(x.first);
}
}
printf(" XCCs:");
for (auto x : usedXccs)
printf(" %02d", x);
printf("\n");
}
printf(" StandardDev %c %7.3f GB/s %c %8.3f ms %c\n", sep, stdDevBw, sep, stdDevTime, sep);
}
}
}
printf(" Aggregate (CPU) %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c Overhead: %.3f ms\n",
sep, results.avgTotalBandwidthGbPerSec,
sep, results.avgTotalDurationMsec,
sep, results.totalBytesTransferred,
sep, results.overheadMsec);
}
void CheckForError(ErrResult const& error)
{
switch (error.errType) {
case ERR_NONE: return;
case ERR_WARN:
printf("[WARN] %s\n", error.errMsg.c_str());
return;
case ERR_FATAL:
printf("[ERROR] %s\n", error.errMsg.c_str());
exit(1);
default:
break;
}
}
void PrintErrors(std::vector<ErrResult> const& errors)
{
bool isFatal = false;
for (auto const& err : errors) {
printf("[%s] %s\n", err.errType == ERR_FATAL ? "ERROR" : "WARN", err.errMsg.c_str());
isFatal |= (err.errType == ERR_FATAL);
}
if (isFatal) exit(1);
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
// TransferBench client version
#define CLIENT_VERSION "1.54.00"
#include "TransferBench.hpp"
#include "EnvVars.hpp"
size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<26);
char const ExeTypeName[4][4] = {"CPU", "GPU", "DMA", "IBV"};
// Display detected hardware
void DisplayTopology(bool outputToCsv);
// Display usage instructions
void DisplayUsage(char const* cmdName);
// Print TransferBench test results
void PrintResults(EnvVars const& ev, int const testNum,
std::vector<Transfer> const& transfers,
TransferBench::TestResults const& results);
// Helper function that converts MemDevices to a string
std::string MemDevicesToStr(std::vector<MemDevice> const& memDevices);
// Helper function to print warning / exit on fatal error
void CheckForError(ErrResult const& error);
// Helper function to print list of errors
void PrintErrors(std::vector<ErrResult> const& errors);
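// Illustrative sketch only (not referenced by the client): a minimal header-only
// flow that builds a single Transfer, runs it through TransferBench::RunTransfers,
// and reports any failure via the helper declared above.  The device indices,
// byte count and CU count below are assumptions chosen purely for illustration.
inline void ExampleSingleCopySketch()
{
  TransferBench::ConfigOptions cfg;                 // default configuration options
  std::vector<TransferBench::Transfer> transfers(1);
  TransferBench::Transfer& t = transfers[0];
  t.numBytes    = DEFAULT_BYTES_PER_TRANSFER;       // 64 MiB
  t.srcs        = {{TransferBench::MEM_GPU, 0}};    // read from GPU 0 memory
  t.dsts        = {{TransferBench::MEM_GPU, 1}};    // write to GPU 1 memory
  t.exeDevice   = {TransferBench::EXE_GPU_GFX, 0};  // execute with GPU 0 GFX kernels
  t.exeSubIndex = -1;                               // no preferred XCC
  t.numSubExecs = 8;                                // use 8 CUs for this Transfer
  TransferBench::TestResults results;
  if (!TransferBench::RunTransfers(cfg, transfers, results))
    PrintErrors(results.errResults);                // exits if any error is fatal
}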
/*
Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef ENVVARS_HPP
#define ENVVARS_HPP
// Helper macro for catching HIP errors
#define HIP_CALL(cmd) \
do { \
hipError_t error = (cmd); \
if (error != hipSuccess) { \
std::cerr << "Encountered HIP error (" << hipGetErrorString(error) \
<< ") at line " << __LINE__ << " in file " << __FILE__ << "\n"; \
exit(-1); \
} \
} while (0)
#include <algorithm>
#include <iostream>
#include <numa.h>
#include <random>
#include <time.h>
#include "Client.hpp"
#include "TransferBench.hpp"
using namespace TransferBench;
// Redefinitions for CUDA compatibility
//==========================================================================================
#if defined(__NVCC__)
#define hipError_t cudaError_t
#define hipGetErrorString cudaGetErrorString
#define hipDeviceProp_t cudaDeviceProp
#define hipDeviceGetPCIBusId cudaDeviceGetPCIBusId
#define hipGetDeviceProperties cudaGetDeviceProperties
#define hipSuccess cudaSuccess
#define gcnArchName name
#define hipGetDeviceCount cudaGetDeviceCount
#endif
// This class manages environment variables that affect TransferBench
class EnvVars
{
public:
// Default configuration values
int const DEFAULT_SAMPLING_FACTOR = 1;
// Environment variables
// General options
int numIterations; // Number of timed iterations to perform. If negative, run for -numIterations seconds instead
int numSubIterations; // Number of subiterations to perform
int numWarmups; // Number of un-timed warmup iterations to perform
int showIterations; // Show per-iteration timing info
int useInteractive; // Pause for user-input before starting transfer loop
// Data options
int alwaysValidate; // Validate after each iteration instead of once after all iterations
int blockBytes; // Each subexecutor, except the last, gets a multiple of this many bytes to copy
int byteOffset; // Byte-offset for memory allocations
vector<float> fillPattern; // Pattern of floats used to fill source data
int validateDirect; // Validate GPU destination memory directly instead of staging GPU memory on host
int validateSource; // Validate source GPU memory immediately after preparation
// DMA options
  int useHsaDma;                    // Use hsa_amd_async_copy instead of hipMemcpy for non-targeted DMA executions
// GFX options
int gfxBlockSize; // Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask; // Bit-vector representing the CU mask
vector<vector<int>> prefXccTable; // Specifies XCC to use for given exe->dst pair
int gfxUnroll; // GFX-kernel unroll factor
int useHipEvents; // Use HIP events for timing GFX/DMA Executor
int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer
int gfxSingleTeam; // Team all subExecutors across the data array
int gfxWaveOrder; // GFX-kernel wavefront ordering
// Client options
  int hideEnv;                      // Skip printing environment variables
int minNumVarSubExec; // Minimum # of subexecutors to use for variable subExec Transfers
int maxNumVarSubExec; // Maximum # of subexecutors to use for variable subExec Transfers (0 to use device limit)
int outputToCsv; // Output in CSV format
int samplingFactor; // Affects how many different values of N are generated (when N set to 0)
// Developer features
int gpuMaxHwQueues; // Tracks GPU_MAX_HW_QUEUES environment variable
// Constructor that collects values
EnvVars()
{
int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
int numDeviceCUs = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0});
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, 0));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
    // Different hardware picks different GPU kernels
    // This performance difference is generally only noticeable when executing with fewer CUs
int defaultGfxUnroll = 4;
if (archName == "gfx906") defaultGfxUnroll = 8;
else if (archName == "gfx90a") defaultGfxUnroll = 8;
else if (archName == "gfx940") defaultGfxUnroll = 6;
else if (archName == "gfx941") defaultGfxUnroll = 6;
else if (archName == "gfx942") defaultGfxUnroll = 4;
alwaysValidate = GetEnvVar("ALWAYS_VALIDATE" , 0);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256);
gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 1);
gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll);
gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER" , 0);
hideEnv = GetEnvVar("HIDE_ENV" , 0);
minNumVarSubExec = GetEnvVar("MIN_VAR_SUBEXEC" , 1);
maxNumVarSubExec = GetEnvVar("MAX_VAR_SUBEXEC" , 0);
numIterations = GetEnvVar("NUM_ITERATIONS" , 10);
numSubIterations = GetEnvVar("NUM_SUBITERATIONS" , 1);
numWarmups = GetEnvVar("NUM_WARMUPS" , 3);
outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0);
samplingFactor = GetEnvVar("SAMPLING_FACTOR" , 1);
showIterations = GetEnvVar("SHOW_ITERATIONS" , 0);
useHipEvents = GetEnvVar("USE_HIP_EVENTS" , 1);
useHsaDma = GetEnvVar("USE_HSA_DMA" , 0);
useInteractive = GetEnvVar("USE_INTERACTIVE" , 0);
useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 1);
validateDirect = GetEnvVar("VALIDATE_DIRECT" , 0);
validateSource = GetEnvVar("VALIDATE_SOURCE" , 0);
gpuMaxHwQueues = GetEnvVar("GPU_MAX_HW_QUEUES" , 4);
// Check for fill pattern
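    // The pattern is given as big-endian hex bytes and expanded to a whole
    // number of 4-byte floats so it can tile the source buffer on float
    // boundaries.  E.g. FILL_PATTERN=DEADBEEF becomes one float with raw
    // bytes EF BE AD DE, while a 3-byte pattern is expanded to 3 floats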
char* pattern = getenv("FILL_PATTERN");
if (pattern != NULL) {
int patternLen = strlen(pattern);
if (patternLen % 2) {
printf("[ERROR] FILL_PATTERN must contain an even-number of hex digits\n");
exit(1);
}
// Read in bytes
std::vector<unsigned char> bytes;
unsigned char val = 0;
for (int i = 0; i < patternLen; i++) {
if ('0' <= pattern[i] && pattern[i] <= '9')
val += (pattern[i] - '0');
else if ('A' <= pattern[i] && pattern[i] <= 'F')
val += (pattern[i] - 'A' + 10);
else if ('a' <= pattern[i] && pattern[i] <= 'f')
val += (pattern[i] - 'a' + 10);
else {
printf("[ERROR] FILL_PATTERN must contain an even-number of hex digits (0-9'/a-f/A-F). (not %c)\n", pattern[i]);
exit(1);
}
if (i % 2 == 0)
val <<= 4;
else {
bytes.push_back(val);
val = 0;
}
}
// Reverse bytes (input is assumed to be given in big-endian)
std::reverse(bytes.begin(), bytes.end());
// Figure out how many copies of the pattern are necessary to fill a 4-byte float properly
int copies;
switch (patternLen % 8) {
case 0: copies = 1; break;
case 4: copies = 2; break;
default: copies = 4; break;
}
// Fill floats
int numFloats = copies * patternLen / 8;
fillPattern.resize(numFloats);
unsigned char* rawData = (unsigned char*) fillPattern.data();
for (int i = 0; i < numFloats * 4; i++)
rawData[i] = bytes[i % bytes.size()];
}
else fillPattern.clear();
// Check for CU mask
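    // The mask is stored as one bit per (CU, XCC) pair: selecting CU i sets
    // bit i*numXccs + x for every XCC x, packed into 32-bit words.  E.g.
    // CU_MASK=0-1 on a device with 2 XCCs sets bits 0-3 (first word = 0xF)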
int numXccs = TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, 0});
cuMask.clear();
char* cuMaskStr = getenv("CU_MASK");
if (cuMaskStr != NULL) {
#if defined(__NVCC__)
printf("[WARN] CU_MASK is not supported in CUDA\n");
#else
std::vector<std::pair<int, int>> ranges;
int maxCU = 0;
char* token = strtok(cuMaskStr, ",");
while (token) {
int start, end;
if (sscanf(token, "%d-%d", &start, &end) == 2) {
ranges.push_back(std::make_pair(std::min(start, end), std::max(start, end)));
maxCU = std::max(maxCU, std::max(start, end));
} else if (sscanf(token, "%d", &start) == 1) {
ranges.push_back(std::make_pair(start, start));
maxCU = std::max(maxCU, start);
} else {
printf("[ERROR] Unrecognized token [%s]\n", token);
exit(1);
}
token = strtok(NULL, ",");
}
cuMask.resize(2 * numXccs, 0);
for (auto range : ranges) {
for (int i = range.first; i <= range.second; i++) {
for (int x = 0; x < numXccs; x++) {
int targetBit = i * numXccs + x;
cuMask[targetBit/32] |= (1<<(targetBit%32));
}
}
}
#endif
}
// Parse preferred XCC table (if provided)
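    // XCC_PREF_TABLE is a flat, row-major numGpus x numGpus list: entry
    // (src * numGpus + dst) picks the XCC that executor GPU 'src' should use
    // when targeting GPU 'dst'.  E.g. with 2 GPUs, "0,1,1,0" means GPU 0 uses
    // XCC 1 when writing to GPU 1, and GPU 1 uses XCC 1 when writing to GPU 0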
char* prefXccStr = getenv("XCC_PREF_TABLE");
if (prefXccStr) {
prefXccTable.resize(numDetectedGpus);
for (int i = 0; i < numDetectedGpus; i++){
prefXccTable[i].resize(numDetectedGpus, -1);
}
char* token = strtok(prefXccStr, ",");
int tokenCount = 0;
while (token) {
int xccId;
if (sscanf(token, "%d", &xccId) == 1) {
int src = tokenCount / numDetectedGpus;
int dst = tokenCount % numDetectedGpus;
if (xccId < 0 || xccId >= numXccs) {
printf("[ERROR] XCC index (%d) out of bounds. Expect value less than %d\n", xccId, numXccs);
exit(1);
}
prefXccTable[src][dst] = xccId;
tokenCount++;
if (tokenCount == (numDetectedGpus * numDetectedGpus)) break;
} else {
printf("[ERROR] Unrecognized token [%s]\n", token);
exit(1);
}
token = strtok(NULL, ",");
}
}
}
// Display info on the env vars that can be used
static void DisplayUsage()
{
printf("Environment variables:\n");
printf("======================\n");
printf(" ALWAYS_VALIDATE - Validate after each iteration instead of once after all iterations\n");
printf(" BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n");
printf(" BLOCK_BYTES - Controls granularity of how work is divided across subExecutors\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4\n");
printf(" CU_MASK - CU mask for streams. Can specify ranges e.g '5,10-12,14'\n");
printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n");
printf(" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d\n", TransferBench::GetIntAttribute(ATR_GFX_MAX_UNROLL));
printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays\n");
printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n");
printf(" HIDE_ENV - Hide environment variable value listing\n");
printf(" MIN_VAR_SUBEXEC - Minumum # of subexecutors to use for variable subExec Transfers\n");
printf(" MAX_VAR_SUBEXEC - Maximum # of subexecutors to use for variable subExec Transfers (0 for device limits)\n");
printf(" NUM_ITERATIONS - # of timed iterations per test. If negative, run for this many seconds instead\n");
printf(" NUM_SUBITERATIONS - # of sub-iterations to run per iteration. Must be non-negative\n");
printf(" NUM_WARMUPS - # of untimed warmup iterations per test\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
printf(" SAMPLING_FACTOR - Add this many samples (when possible) between powers of 2 when auto-generating data sizes\n");
printf(" SHOW_ITERATIONS - Show per-iteration timing info\n");
printf(" USE_HIP_EVENTS - Use HIP events for GFX executor timing\n");
printf(" USE_HSA_DMA - Use hsa_amd_async_copy instead of hipMemcpy for non-targeted DMA execution\n");
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer\n");
printf(" VALIDATE_DIRECT - Validate GPU destination memory directly instead of staging GPU memory on host\n");
printf(" VALIDATE_SOURCE - Validate GPU src memory immediately after preparation\n");
}
void Print(std::string const& name, int32_t const value, const char* format, ...) const
{
printf("%-20s%s%12d%s", name.c_str(), outputToCsv ? "," : " = ", value, outputToCsv ? "," : " : ");
va_list args;
va_start(args, format);
vprintf(format, args);
va_end(args);
printf("\n");
}
void Print(std::string const& name, std::string const& value, const char* format, ...) const
{
printf("%-20s%s%12s%s", name.c_str(), outputToCsv ? "," : " = ", value.c_str(), outputToCsv ? "," : " : ");
va_list args;
va_start(args, format);
vprintf(format, args);
va_end(args);
printf("\n");
}
// Display env var settings
void DisplayEnvVars() const
{
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
if (!outputToCsv) {
printf("TransferBench Client v%s Backend v%s\n", CLIENT_VERSION, TransferBench::VERSION);
printf("===============================================================\n");
if (!hideEnv) printf("[Common] (Suppress by setting HIDE_ENV=1)\n");
}
else if (!hideEnv)
printf("EnvVar,Value,Description,(TransferBench Client v%s Backend v%s)\n", CLIENT_VERSION, TransferBench::VERSION);
if (hideEnv) return;
Print("ALWAYS_VALIDATE", alwaysValidate,
"Validating after %s", (alwaysValidate ? "each iteration" : "all iterations"));
Print("BLOCK_BYTES", blockBytes,
"Each CU gets a mulitple of %d bytes to copy", blockBytes);
Print("BYTE_OFFSET", byteOffset,
"Using byte offset of %d", byteOffset);
Print("CU_MASK", getenv("CU_MASK") ? 1 : 0,
"%s", (cuMask.size() ? GetCuMaskDesc().c_str() : "All"));
Print("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
"%s", (fillPattern.size() ? getenv("FILL_PATTERN") : TransferBench::GetStrAttribute(ATR_SRC_PREP_DESCRIPTION).c_str()));
Print("GFX_BLOCK_SIZE", gfxBlockSize,
"Threadblock size of %d", gfxBlockSize);
Print("GFX_SINGLE_TEAM", gfxSingleTeam,
"%s", (gfxSingleTeam ? "Combining CUs to work across entire data array" :
"Each CUs operates on its own disjoint subarray"));
Print("GFX_UNROLL", gfxUnroll,
"Using GFX unroll factor of %d", gfxUnroll);
Print("GFX_WAVE_ORDER", gfxWaveOrder,
"Using GFX wave ordering of %s", (gfxWaveOrder == 0 ? "Unroll,Wavefront,CU" :
gfxWaveOrder == 1 ? "Unroll,CU,Wavefront" :
gfxWaveOrder == 2 ? "Wavefront,Unroll,CU" :
gfxWaveOrder == 3 ? "Wavefront,CU,Unroll" :
gfxWaveOrder == 4 ? "CU,Unroll,Wavefront" :
"CU,Wavefront,Unroll"));
Print("MIN_VAR_SUBEXEC", minNumVarSubExec,
"Using at least %d subexecutor(s) for variable subExec tranfers", minNumVarSubExec);
Print("MAX_VAR_SUBEXEC", maxNumVarSubExec,
"Using up to %s subexecutors for variable subExec transfers",
maxNumVarSubExec ? std::to_string(maxNumVarSubExec).c_str() : "all available");
Print("NUM_ITERATIONS", numIterations,
(numIterations == 0) ? "Running infinitely" :
"Running %d %s", abs(numIterations), (numIterations > 0 ? " timed iteration(s)" : "seconds(s) per Test"));
Print("NUM_SUBITERATIONS", numSubIterations,
"Running %s subiterations", (numSubIterations == 0 ? "infinite" : std::to_string(numSubIterations)).c_str());
Print("NUM_WARMUPS", numWarmups,
"Running %d warmup iteration(s) per Test", numWarmups);
Print("SHOW_ITERATIONS", showIterations,
"%s per-iteration timing", showIterations ? "Showing" : "Hiding");
Print("USE_HIP_EVENTS", useHipEvents,
"Using %s for GFX/DMA Executor timing", useHipEvents ? "HIP events" : "CPU wall time");
Print("USE_HSA_DMA", useHsaDma,
"Using %s for DMA execution", useHsaDma ? "hsa_amd_async_copy" : "hipMemcpyAsync");
Print("USE_INTERACTIVE", useInteractive,
"Running in %s mode", useInteractive ? "interactive" : "non-interactive");
Print("USE_SINGLE_STREAM", useSingleStream,
"Using single stream per GFX %s", useSingleStream ? "device" : "Transfer");
if (getenv("XCC_PREF_TABLE")) {
printf("%36s: Preferred XCC Table (XCC_PREF_TABLE)\n", "");
printf("%36s: ", "");
for (int i = 0; i < numGpuDevices; i++) printf(" %3d", i); printf(" (#XCCs)\n");
for (int i = 0; i < numGpuDevices; i++) {
printf("%36s: GPU %3d ", "", i);
for (int j = 0; j < numGpuDevices; j++)
printf(" %3d", prefXccTable[i][j]);
printf(" %3d\n", TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, i}));
}
}
Print("VALIDATE_DIRECT", validateDirect,
"Validate GPU destination memory %s", validateDirect ? "directly" : "via CPU staging buffer");
Print("VALIDATE_SOURCE", validateSource,
validateSource ? "Validate source after preparation" : "Do not perform source validation after prep");
printf("\n");
};
  // Helper functions that parse an environment variable or fall back to a default value
static int GetEnvVar(std::string const& varname, int defaultValue)
{
if (getenv(varname.c_str()))
return atoi(getenv(varname.c_str()));
return defaultValue;
}
static std::string GetEnvVar(std::string const& varname, std::string const& defaultValue)
{
if (getenv(varname.c_str()))
return getenv(varname.c_str());
return defaultValue;
}
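  // Summarize the CU mask as a list of CU index runs, e.g. "CUs used: (4) 0-2,5"
  // when CUs 0, 1, 2 and 5 are enabled (each CU corresponds to numXccs mask bits)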
std::string GetCuMaskDesc() const
{
std::vector<std::pair<int, int>> runs;
int numXccs = TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, 0});
bool inRun = false;
std::pair<int, int> curr;
int used = 0;
for (int targetBit = 0; targetBit < cuMask.size() * 32; targetBit += numXccs) {
if (cuMask[targetBit/32] & (1 << (targetBit%32))) {
used++;
if (!inRun) {
inRun = true;
curr.first = targetBit / numXccs;
}
} else {
if (inRun) {
inRun = false;
curr.second = targetBit / numXccs - 1;
runs.push_back(curr);
}
}
}
    if (inRun) {
      curr.second = (cuMask.size() * 32) / numXccs - 1;
      runs.push_back(curr);
    }
std::string result = "CUs used: (" + std::to_string(used) + ") ";
for (int i = 0; i < runs.size(); i++)
{
if (i) result += ",";
if (runs[i].first == runs[i].second) result += std::to_string(runs[i].first);
else result += std::to_string(runs[i].first) + "-" + std::to_string(runs[i].second);
}
return result;
}
TransferBench::ConfigOptions ToConfigOptions()
{
TransferBench::ConfigOptions cfg;
cfg.general.numIterations = numIterations;
cfg.general.numSubIterations = numSubIterations;
cfg.general.numWarmups = numWarmups;
cfg.general.recordPerIteration = showIterations;
cfg.general.useInteractive = useInteractive;
cfg.data.alwaysValidate = alwaysValidate;
cfg.data.blockBytes = blockBytes;
cfg.data.byteOffset = byteOffset;
cfg.data.validateDirect = validateDirect;
cfg.data.validateSource = validateSource;
cfg.data.fillPattern = fillPattern;
cfg.dma.useHipEvents = useHipEvents;
cfg.dma.useHsaCopy = useHsaDma;
cfg.gfx.blockSize = gfxBlockSize;
cfg.gfx.cuMask = cuMask;
cfg.gfx.prefXccTable = prefXccTable;
cfg.gfx.unrollFactor = gfxUnroll;
cfg.gfx.useHipEvents = useHipEvents;
cfg.gfx.useMultiStream = !useSingleStream;
cfg.gfx.useSingleTeam = gfxSingleTeam;
cfg.gfx.waveOrder = gfxWaveOrder;
return cfg;
}
};
#endif
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "EnvVars.hpp"
void AllToAllPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
enum
{
A2A_COPY = 0,
A2A_READ_ONLY = 1,
A2A_WRITE_ONLY = 2
};
char a2aModeStr[3][20] = {"Copy", "Read-Only", "Write-Only"};
// Force single-stream mode for all-to-all benchmark
ev.useSingleStream = 1;
// Force to gfx unroll 2 unless explicitly set
ev.gfxUnroll = EnvVars::GetEnvVar("GFX_UNROLL", 2);
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
// Collect env vars for this preset
int a2aDirect = EnvVars::GetEnvVar("A2A_DIRECT" , 1);
int a2aLocal = EnvVars::GetEnvVar("A2A_LOCAL" , 0);
int a2aMode = EnvVars::GetEnvVar("A2A_MODE" , 0);
int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int numSubExecs = EnvVars::GetEnvVar("NUM_SUB_EXEC" , 8);
int useDmaExec = EnvVars::GetEnvVar("USE_DMA_EXEC" , 0);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);
int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
// Print off environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
if (!ev.outputToCsv) printf("[AllToAll Related]\n");
ev.Print("A2A_DIRECT" , a2aDirect , a2aDirect ? "Only using direct links" : "Full all-to-all");
ev.Print("A2A_LOCAL" , a2aLocal , "%s local transfers", a2aLocal ? "Include" : "Exclude");
ev.Print("A2A_MODE" , a2aMode , a2aModeStr[a2aMode]);
ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus);
ev.Print("NUM_SUB_EXEC" , numSubExecs , "Using %d subexecutors/CUs per Transfer", numSubExecs);
ev.Print("USE_DMA_EXEC" , useDmaExec , "Using %s executor", useDmaExec ? "DMA" : "GFX");
ev.Print("USE_FINE_GRAIN" , useFineGrain , "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC");
printf("\n");
}
// Validate env vars
if (a2aMode < 0 || a2aMode > 2) {
printf("[ERROR] a2aMode must be between 0 and 2\n");
exit(1);
}
if (numGpus < 0 || numGpus > numDetectedGpus) {
printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
exit(1);
}
// Collect the number of GPU devices to use
int const numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1);
int const numDsts = (a2aMode == A2A_READ_ONLY ? 0 : 1);
MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
ExeType exeType = useDmaExec ? EXE_GPU_DMA : EXE_GPU_GFX;
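  // reIndex maps each (src GPU, dst GPU) pair to its position in 'transfers'
  // so per-pair bandwidth can be looked up when printing the summary table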
std::map<std::pair<int, int>, int> reIndex;
std::vector<Transfer> transfers;
for (int i = 0; i < numGpus; i++) {
for (int j = 0; j < numGpus; j++) {
// Check whether or not to execute this pair
if (i == j) {
if (!a2aLocal) continue;
} else if (a2aDirect) {
#if !defined(__NVCC__)
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
if (hopCount != 1) continue;
#endif
}
// Build Transfer and add it to list
TransferBench::Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
if (numSrcs) transfer.srcs.push_back({memType, i});
if (numDsts) transfer.dsts.push_back({memType, j});
transfer.exeDevice = {exeType, (useRemoteRead ? j : i)};
transfer.exeSubIndex = -1;
transfer.numSubExecs = numSubExecs;
reIndex[std::make_pair(i,j)] = transfers.size();
transfers.push_back(transfer);
}
}
printf("GPU-GFX All-To-All benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes between %s pairs of GPUs using %d CUs (%lu Transfers)\n",
numBytesPerTransfer, a2aDirect ? "directly connected" : "all", numSubExecs, transfers.size());
if (transfers.size() == 0) return;
// Execute Transfers
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
TransferBench::TestResults results;
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
for (auto const& err : results.errResults)
printf("%s\n", err.errMsg.c_str());
    exit(1);
} else {
PrintResults(ev, 1, transfers, results);
}
  // Print summary table
char separator = (ev.outputToCsv ? ',' : ' ');
printf("\nSummary: [%lu bytes per Transfer]\n", numBytesPerTransfer);
printf("==========================================================\n");
printf("SRC\\DST ");
for (int dst = 0; dst < numGpus; dst++)
printf("%cGPU %02d ", separator, dst);
printf(" %cSTotal %cActual\n", separator, separator);
double totalBandwidthGpu = 0.0;
double minExecutorBandwidth = std::numeric_limits<double>::max();
double maxExecutorBandwidth = 0.0;
std::vector<double> colTotalBandwidth(numGpus+1, 0.0);
for (int src = 0; src < numGpus; src++) {
double rowTotalBandwidth = 0;
double executorBandwidth = 0;
printf("GPU %02d", src);
for (int dst = 0; dst < numGpus; dst++) {
if (reIndex.count(std::make_pair(src, dst))) {
int const transferIdx = reIndex[std::make_pair(src,dst)];
TransferBench::TransferResult const& r = results.tfrResults[transferIdx];
colTotalBandwidth[dst] += r.avgBandwidthGbPerSec;
rowTotalBandwidth += r.avgBandwidthGbPerSec;
totalBandwidthGpu += r.avgBandwidthGbPerSec;
executorBandwidth = std::max(executorBandwidth,
results.exeResults[transfers[transferIdx].exeDevice].avgBandwidthGbPerSec);
printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
} else {
printf("%c%8s ", separator, "N/A");
}
}
printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator, executorBandwidth);
minExecutorBandwidth = std::min(minExecutorBandwidth, executorBandwidth);
maxExecutorBandwidth = std::max(maxExecutorBandwidth, executorBandwidth);
colTotalBandwidth[numGpus] += rowTotalBandwidth;
}
printf("\nRTotal");
for (int dst = 0; dst < numGpus; dst++) {
printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
}
printf(" %c%8.3f %c%8.3f %c%8.3f\n", separator, colTotalBandwidth[numGpus],
separator, minExecutorBandwidth, separator, maxExecutorBandwidth);
printf("\n");
printf("Average bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu / transfers.size());
printf("Aggregate bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu);
printf("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
PrintErrors(results.errResults);
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
void HealthCheckPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
// Check for supported platforms
#if defined(__NVCC__)
printf("[WARN] healthcheck preset not supported on NVIDIA hardware\n");
return;
#endif
bool hasFail = false;
// Force use of single stream
ev.useSingleStream = 1;
TransferBench::TestResults results;
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
if (numGpuDevices != 8) {
printf("[WARN] healthcheck preset is currently only supported on 8-GPU MI300X hardware\n");
exit(1);
}
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, gpuId));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
if (!(archName == "gfx940" || archName == "gfx941" || archName == "gfx942"))
{
printf("[WARN] healthcheck preset is currently only supported on 8-GPU MI300X hardware\n");
exit(1);
}
}
// Pass limits
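  // Default limits are 95% of the reference bandwidths hard-coded below
  // (48/96/45 GB/s), truncated to whole GB/s; each can be overridden via its
  // LIMIT_* environment variable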
double udirLimit = getenv("LIMIT_UDIR") ? atof(getenv("LIMIT_UDIR")) : (int)(48 * 0.95);
double bdirLimit = getenv("LIMIT_BDIR") ? atof(getenv("LIMIT_BDIR")) : (int)(96 * 0.95);
double a2aLimit = getenv("LIMIT_A2A") ? atof(getenv("LIMIT_A2A")) : (int)(45 * 0.95);
  // Run unidirectional read from CPU to GPU
printf("Testing unidirectional reads from CPU ");
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
printf("[ERROR] Unable to detect closest CPU NUMA node to GPU %d\n", gpuId);
exit(1);
}
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeDevice = {EXE_GPU_GFX, gpuId};
t.numBytes = 64*1024*1024;
t.srcs = {{MEM_CPU, memIndex}};
t.dsts = {};
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t.numSubExecs = cu;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
bestResult = std::max(bestResult, results.tfrResults[0].avgBandwidthGbPerSec);
} else {
PrintErrors(results.errResults);
}
if (results.tfrResults[0].avgBandwidthGbPerSec >= udirLimit) {
passed = true;
break;
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
}
}
}
// Run unidirectional write from GPU to CPU
printf("Testing unidirectional writes to CPU ");
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
printf("[ERROR] Unable to detect closest CPU NUMA node to GPU %d\n", gpuId);
exit(1);
}
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeDevice = {EXE_GPU_GFX, gpuId};
t.numBytes = 64*1024*1024;
t.srcs = {};
t.dsts = {{MEM_CPU, memIndex}};
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t.numSubExecs = cu;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
bestResult = std::max(bestResult, results.tfrResults[0].avgBandwidthGbPerSec);
} else {
PrintErrors(results.errResults);
}
if (results.tfrResults[0].avgBandwidthGbPerSec >= udirLimit) {
passed = true;
break;
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
}
}
}
// Run bidirectional tests
printf("Testing bidirectional reads + writes ");
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
printf("[ERROR] Unable to detect closest CPU NUMA node to GPU %d\n", gpuId);
exit(1);
}
std::vector<Transfer> transfers(2);
Transfer& t0 = transfers[0];
Transfer& t1 = transfers[1];
t0.exeDevice = {EXE_GPU_GFX, gpuId};
t0.numBytes = 64*1024*1024;
t0.srcs = {{MEM_CPU, memIndex}};
t0.dsts = {};
t1.exeDevice = {EXE_GPU_GFX, gpuId};
t1.numBytes = 64*1024*1024;
t1.srcs = {};
t1.dsts = {{MEM_CPU, memIndex}};
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t0.numSubExecs = cu;
t1.numSubExecs = cu;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
double sum = (results.tfrResults[0].avgBandwidthGbPerSec +
results.tfrResults[1].avgBandwidthGbPerSec);
bestResult = std::max(bestResult, sum);
if (sum >= bdirLimit) {
passed = true;
break;
}
} else {
PrintErrors(results.errResults);
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, bdirLimit);
}
}
}
// Run XGMI tests:
printf("Testing all-to-all XGMI copies "); fflush(stdout);
{
// Force GFX unroll to 2 for MI300
ev.gfxUnroll = 2;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<Transfer> transfers;
for (int i = 0; i < numGpuDevices; i++) {
for (int j = 0; j < numGpuDevices; j++) {
if (i == j) continue;
Transfer t;
t.numBytes = 64*1024*1024;
t.numSubExecs = 8;
t.exeDevice = {EXE_GPU_GFX, i};
t.srcs = {{MEM_GPU_FINE, i}};
t.dsts = {{MEM_GPU_FINE, j}};
transfers.push_back(t);
}
}
std::vector<std::pair<std::pair<int,int>, double>> fails;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
int transferIdx = 0;
for (int i = 0; i < numGpuDevices; i++) {
printf("."); fflush(stdout);
for (int j = 0; j < numGpuDevices; j++) {
if (i == j) continue;
double bw = results.tfrResults[transferIdx].avgBandwidthGbPerSec;
if (bw < a2aLimit) {
fails.push_back(std::make_pair(std::make_pair(i,j), bw));
}
transferIdx++;
}
}
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d to GPU %02d: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first.first, p.first.second, p.second, a2aLimit);
}
}
}
exit(hasFail ? 1 : 0);
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
void OneToAllPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
if (numDetectedGpus < 2) {
printf("[ERROR] One-to-all benchmark requires machine with at least 2 GPUs\n");
exit(1);
}
// Collect env vars for this preset
int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int numSubExecs = EnvVars::GetEnvVar("NUM_GPU_SE", 4);
int exeIndex = EnvVars::GetEnvVar("EXE_INDEX", 0);
int sweepDir = EnvVars::GetEnvVar("SWEEP_DIR", 0);
std::string sweepDst = EnvVars::GetEnvVar("SWEEP_DST", "G");
std::string sweepExe = EnvVars::GetEnvVar("SWEEP_EXE", "G");
std::string sweepSrc = EnvVars::GetEnvVar("SWEEP_SRC", "G");
int sweepMin = EnvVars::GetEnvVar("SWEEP_MIN", 1);
int sweepMax = EnvVars::GetEnvVar("SWEEP_MAX", numGpuDevices);
// Display environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
if (!ev.outputToCsv) printf("[One-To-All Related]\n");
ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices);
ev.Print("NUM_GPU_SE", numSubExecs, "Using %d subExecutors/CUs per Transfer", numSubExecs);
ev.Print("EXE_INDEX", exeIndex, "Executing on GPU %d", exeIndex);
ev.Print("SWEEP_DIR", sweepDir, "Direction of transfer");
ev.Print("SWEEP_DST", sweepDst.c_str(), "DST memory types to sweep");
ev.Print("SWEEP_EXE", sweepExe.c_str(), "Executor type to use");
ev.Print("SWEEP_MAX", sweepMax, "Maximum number of peers");
ev.Print("SWEEP_MIN", sweepMin, "Minimum number of peers");
ev.Print("SWEEP_SRC", sweepSrc.c_str(), "SRC memory types to sweep");
printf("\n");
}
// Perform validation
for (auto ch : sweepExe) {
if (ch != 'G' && ch != 'D') {
printf("[ERROR] Unrecognized executor type '%c' specified\n", ch);
exit(1);
}
}
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
TransferBench::TestResults results;
char const sep = (ev.outputToCsv ? ',' : ' ');
for (char src : sweepSrc) for (char exe : sweepExe) for (char dst : sweepDst) {
// Skip invalid configurations
if ((exe == 'D' && (src == 'N' || dst == 'N')) || (src == 'N' && dst == 'N')) continue;
printf("Executing (%c%s -> %c%d -> %c%s)\n",
src, src == 'N' ? "" : (sweepDir == 0 ? std::to_string(exeIndex).c_str() : std::string("*").c_str()),
exe, exeIndex,
dst, dst == 'N' ? "" : sweepDir == 0 ? std::string("*").c_str() : std::to_string(exeIndex).c_str());
for (int i = 0; i < numGpuDevices; i++) {
if (i == exeIndex) continue;
printf(" GPU %-3d %c", i, sep);
}
printf("\n");
if (!ev.outputToCsv) {
for (int i = 0; i < numGpuDevices-1; i++)
printf("-------------");
printf("\n");
}
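    // Sweep the peer count p; for each p, enumerate every subset of peer GPUs
    // with exactly p members (each bit of 'bitmask' selects one peer), skipping
    // subsets that include the executing GPU itself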
for (int p = sweepMin; p <= sweepMax; p++) {
for (int bitmask = 0; bitmask < (1<<numGpuDevices); bitmask++) {
if (bitmask & (1<<exeIndex) || __builtin_popcount(bitmask) != p) continue;
std::vector<Transfer> transfers;
for (int i = 0; i < numGpuDevices; i++) {
if (bitmask & (1<<i)) {
Transfer t;
CheckForError(TransferBench::CharToExeType(exe, t.exeDevice.exeType));
t.exeDevice.exeIndex = exeIndex;
t.exeSubIndex = -1;
t.numSubExecs = numSubExecs;
t.numBytes = numBytesPerTransfer;
if (src == 'N') {
t.srcs.clear();
} else {
t.srcs.resize(1);
CheckForError(TransferBench::CharToMemType(src, t.srcs[0].memType));
t.srcs[0].memIndex = sweepDir == 0 ? exeIndex : i;
}
if (dst == 'N') {
t.dsts.clear();
} else {
t.dsts.resize(1);
CheckForError(TransferBench::CharToMemType(dst, t.dsts[0].memType));
t.dsts[0].memIndex = sweepDir == 0 ? i : exeIndex;
}
transfers.push_back(t);
}
}
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
}
int counter = 0;
for (int i = 0; i < numGpuDevices; i++) {
if (bitmask & (1<<i))
printf(" %8.3f %c", results.tfrResults[counter++].avgBandwidthGbPerSec, sep);
else if (i != exeIndex)
printf(" %c", sep);
}
printf(" %d %d", p, numSubExecs);
for (auto i = 0; i < transfers.size(); i++) {
printf(" (%s %c%d %s)",
MemDevicesToStr(transfers[i].srcs).c_str(),
ExeTypeStr[transfers[i].exeDevice.exeType], transfers[i].exeDevice.exeIndex,
MemDevicesToStr(transfers[i].dsts).c_str());
}
printf("\n");
}
}
}
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
void PeerToPeerPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
// Collect env vars for this preset
int useDmaCopy = EnvVars::GetEnvVar("USE_GPU_DMA", 0);
int numCpuDevices = EnvVars::GetEnvVar("NUM_CPU_DEVICES", numDetectedCpus);
int numCpuSubExecs = EnvVars::GetEnvVar("NUM_CPU_SE", 4);
int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int numGpuSubExecs = EnvVars::GetEnvVar("NUM_GPU_SE", useDmaCopy ? 1 : TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0}));
int p2pMode = EnvVars::GetEnvVar("P2P_MODE", 0);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN", 0);
int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
// Display environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
int outputToCsv = ev.outputToCsv;
if (!outputToCsv) printf("[P2P Related]\n");
ev.Print("NUM_CPU_DEVICES", numCpuDevices, "Using %d CPUs", numCpuDevices);
ev.Print("NUM_CPU_SE", numCpuSubExecs, "Using %d CPU threads per Transfer", numCpuSubExecs);
ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices);
ev.Print("NUM_GPU_SE", numGpuSubExecs, "Using %d GPU subexecutors/CUs per Transfer", numGpuSubExecs);
ev.Print("P2P_MODE", p2pMode, "Running %s transfers", p2pMode == 0 ? "Uni + Bi" :
p2pMode == 1 ? "Unidirectional"
: "Bidirectional");
ev.Print("USE_FINE_GRAIN", useFineGrain, "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
ev.Print("USE_GPU_DMA", useDmaCopy, "Using GPU-%s as GPU executor", useDmaCopy ? "DMA" : "GFX");
ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC");
printf("\n");
}
char const separator = ev.outputToCsv ? ',' : ' ';
printf("Bytes Per Direction%c%lu\n", separator, numBytesPerTransfer);
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
TransferBench::TestResults results;
// Collect the number of available CPUs/GPUs on this machine
int const numDevices = numCpuDevices + numGpuDevices;
// Perform unidirectional / bidirectional
for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++) {
if (p2pMode == 1 && isBidirectional == 1 ||
p2pMode == 2 && isBidirectional == 0) continue;
printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n", isBidirectional ? "Bi" : "Uni",
useRemoteRead ? "Remote" : "Local",
useRemoteRead ? "Local" : "Remote",
useDmaCopy ? "DMA" : "GFX");
// Print header
if (isBidirectional) {
printf("%12s", "SRC\\DST");
} else {
if (useRemoteRead)
printf("%12s", "SRC\\EXE+DST");
else
printf("%12s", "SRC+EXE\\DST");
}
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numCpuDevices; i++) {
printf("%7s %02d", "CPU", i);
if (ev.outputToCsv) printf(",");
}
if (numCpuDevices > 0) printf(" ");
for (int i = 0; i < numGpuDevices; i++) {
printf("%7s %02d", "GPU", i);
if (ev.outputToCsv) printf(",");
}
printf("\n");
double avgBwSum[2][2] = {};
int avgCount[2][2] = {};
ExeType const gpuExeType = useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
// Loop over all possible src/dst pairs
for (int src = 0; src < numDevices; src++) {
MemType const srcType = (src < numCpuDevices ? MEM_CPU : MEM_GPU);
int const srcIndex = (srcType == MEM_CPU ? src : src - numCpuDevices);
MemType const srcTypeActual = ((useFineGrain && srcType == MEM_CPU) ? MEM_CPU_FINE :
(useFineGrain && srcType == MEM_GPU) ? MEM_GPU_FINE :
srcType);
std::vector<std::vector<double>> avgBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> minBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> maxBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> stdDev(isBidirectional + 1);
if (src == numCpuDevices && src != 0) printf("\n");
for (int dst = 0; dst < numDevices; dst++) {
MemType const dstType = (dst < numCpuDevices ? MEM_CPU : MEM_GPU);
int const dstIndex = (dstType == MEM_CPU ? dst : dst - numCpuDevices);
MemType const dstTypeActual = ((useFineGrain && dstType == MEM_CPU) ? MEM_CPU_FINE :
(useFineGrain && dstType == MEM_GPU) ? MEM_GPU_FINE :
dstType);
// Prepare Transfers
std::vector<Transfer> transfers(isBidirectional + 1);
// SRC -> DST
transfers[0].numBytes = numBytesPerTransfer;
transfers[0].srcs.push_back({srcTypeActual, srcIndex});
transfers[0].dsts.push_back({dstTypeActual, dstIndex});
transfers[0].exeDevice = {IsGpuMemType(useRemoteRead ? dstType : srcType) ? gpuExeType : EXE_CPU,
(useRemoteRead ? dstIndex : srcIndex)};
transfers[0].exeSubIndex = -1;
transfers[0].numSubExecs = (transfers[0].exeDevice.exeType == gpuExeType) ? numGpuSubExecs : numCpuSubExecs;
// DST -> SRC
if (isBidirectional) {
transfers[1].numBytes = numBytesPerTransfer;
transfers[1].srcs.push_back({dstTypeActual, dstIndex});
transfers[1].dsts.push_back({srcTypeActual, srcIndex});
transfers[1].exeDevice = {IsGpuMemType(useRemoteRead ? srcType : dstType) ? gpuExeType : EXE_CPU,
(useRemoteRead ? srcIndex : dstIndex)};
transfers[1].exeSubIndex = -1;
transfers[1].numSubExecs = (transfers[1].exeDevice.exeType == gpuExeType) ? numGpuSubExecs : numCpuSubExecs;
}
bool skipTest = false;
// Abort if executing on NUMA node with no CPUs
for (int i = 0; i <= isBidirectional; i++) {
if (transfers[i].exeDevice.exeType == EXE_CPU &&
TransferBench::GetNumSubExecutors(transfers[i].exeDevice) == 0) {
skipTest = true;
break;
}
#if defined(__NVCC__)
// NVIDIA platform cannot access GPU memory directly from CPU executors
if (transfers[i].exeDevice.exeType == EXE_CPU && (IsGpuMemType(srcType) || IsGpuMemType(dstType))) {
skipTest = true;
break;
}
#endif
}
if (isBidirectional && srcType == dstType && srcIndex == dstIndex) skipTest = true;
if (!skipTest) {
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
for (auto const& err : results.errResults)
printf("%s\n", err.errMsg.c_str());
exit(1);
}
for (int dir = 0; dir <= isBidirectional; dir++) {
double const avgBw = results.tfrResults[dir].avgBandwidthGbPerSec;
avgBandwidth[dir].push_back(avgBw);
if (!(srcType == dstType && srcIndex == dstIndex)) {
avgBwSum[srcType][dstType] += avgBw;
avgCount[srcType][dstType]++;
}
if (ev.showIterations) {
double minTime = results.tfrResults[dir].perIterMsec[0];
double maxTime = minTime;
double varSum = 0;
for (int i = 0; i < results.tfrResults[dir].perIterMsec.size(); i++) {
minTime = std::min(minTime, results.tfrResults[dir].perIterMsec[i]);
maxTime = std::max(maxTime, results.tfrResults[dir].perIterMsec[i]);
double const bw = (transfers[dir].numBytes / 1.0E9) / results.tfrResults[dir].perIterMsec[i] * 1000.0f;
double const delta = (avgBw - bw);
varSum += delta * delta;
}
double const minBw = (transfers[dir].numBytes / 1.0E9) / maxTime * 1000.0f;
double const maxBw = (transfers[dir].numBytes / 1.0E9) / minTime * 1000.0f;
double const stdev = sqrt(varSum / results.tfrResults[dir].perIterMsec.size());
minBandwidth[dir].push_back(minBw);
maxBandwidth[dir].push_back(maxBw);
stdDev[dir].push_back(stdev);
}
}
} else {
for (int dir = 0; dir <= isBidirectional; dir++) {
avgBandwidth[dir].push_back(0);
minBandwidth[dir].push_back(0);
maxBandwidth[dir].push_back(0);
stdDev[dir].push_back(-1.0);
}
}
}
for (int dir = 0; dir <= isBidirectional; dir++) {
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, dir ? "<- " : " ->");
if (ev.outputToCsv) printf(",");
for (int dst = 0; dst < numDevices; dst++) {
if (dst == numCpuDevices && dst != 0) printf(" ");
double const avgBw = avgBandwidth[dir][dst];
if (avgBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", avgBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
if (ev.showIterations) {
// minBw
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "min");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++) {
double const minBw = minBandwidth[dir][i];
if (i == numCpuDevices && i != 0) printf(" ");
if (minBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", minBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
// maxBw
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "max");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++) {
double const maxBw = maxBandwidth[dir][i];
if (i == numCpuDevices && i != 0) printf(" ");
if (maxBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", maxBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
// stddev
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, " sd");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++) {
double const sd = stdDev[dir][i];
if (i == numCpuDevices && i != 0) printf(" ");
if (sd == -1.0)
printf("%10s", "N/A");
else
printf("%10.2f", sd);
if (ev.outputToCsv) printf(",");
}
printf("\n");
}
fflush(stdout);
}
if (isBidirectional) {
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "<->");
if (ev.outputToCsv) printf(",");
for (int dst = 0; dst < numDevices; dst++) {
double const sumBw = avgBandwidth[0][dst] + avgBandwidth[1][dst];
if (dst == numCpuDevices && dst != 0) printf(" ");
if (sumBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", sumBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
if (src < numDevices - 1) printf("\n");
}
}
if (!ev.outputToCsv) {
printf(" ");
for (int srcType : {MEM_CPU, MEM_GPU})
for (int dstType : {MEM_CPU, MEM_GPU})
printf(" %cPU->%cPU", srcType == MEM_CPU ? 'C' : 'G', dstType == MEM_CPU ? 'C' : 'G');
printf("\n");
printf("Averages (During %s):", isBidirectional ? " BiDir" : "UniDir");
for (int srcType : {MEM_CPU, MEM_GPU})
for (int dstType : {MEM_CPU, MEM_GPU}) {
if (avgCount[srcType][dstType])
printf("%10.2f", avgBwSum[srcType][dstType] / avgCount[srcType][dstType]);
else
printf("%10s", "N/A");
}
printf("\n\n");
}
}
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
// Included after EnvVars and Executors
#include "AllToAll.hpp"
#include "HealthCheck.hpp"
#include "OneToAll.hpp"
#include "PeerToPeer.hpp"
#include "Schmoo.hpp"
#include "Sweep.hpp"
#include <map>
typedef void (*PresetFunc)(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName);
std::map<std::string, std::pair<PresetFunc, std::string>> presetFuncMap =
{
{"a2a", {AllToAllPreset, "Tests parallel transfers between all pairs of GPU devices"}},
{"healthcheck", {HealthCheckPreset,"Simple bandwidth health check (MI300X series only)"}},
{"one2all", {OneToAllPreset, "Test all subsets of parallel transfers from one GPU to all others"}},
{"p2p" , {PeerToPeerPreset, "Peer-to-peer device memory bandwidth test"}},
{"rsweep", {SweepPreset, "Randomly sweep through sets of Transfers"}},
{"schmoo", {SchmooPreset, "Scaling tests for local/remote read/write/copy"}},
{"sweep", {SweepPreset, "Ordered sweep through sets of Transfers"}},
};
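// A new preset only needs a function matching PresetFunc plus an entry in the
// map above, e.g. (hypothetical): {"mytest", {MyTestPreset, "My description"}}.
// RunPreset() dispatches on the first command-line argument.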
void DisplayPresets()
{
printf("\nAvailable Preset Benchmarks:\n");
printf("============================\n");
for (auto const& x : presetFuncMap)
printf(" %15s - %s\n", x.first.c_str(), x.second.second.c_str());
}
int RunPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
int const argc,
char** const argv)
{
std::string preset = (argc > 1 ? argv[1] : "");
if (presetFuncMap.count(preset)) {
(presetFuncMap[preset].first)(ev, numBytesPerTransfer, preset);
return 1;
}
return 0;
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
void SchmooPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
if (numDetectedGpus < 2) {
printf("[ERROR] Schmoo benchmark requires at least 2 GPUs\n");
exit(1);
}
// Collect env vars for this preset
int localIdx = EnvVars::GetEnvVar("LOCAL_IDX", 0);
int remoteIdx = EnvVars::GetEnvVar("REMOTE_IDX", 1);
int sweepMax = EnvVars::GetEnvVar("SWEEP_MAX", 32);
int sweepMin = EnvVars::GetEnvVar("SWEEP_MIN", 1);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN", 0);
// Display environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
int outputToCsv = ev.outputToCsv;
if (!outputToCsv) printf("[Schmoo Related]\n");
ev.Print("LOCAL_IDX", localIdx, "Local GPU index");
ev.Print("REMOTE_IDX", remoteIdx, "Remote GPU index");
ev.Print("SWEEP_MAX", sweepMax, "Max number of subExecutors to use");
ev.Print("SWEEP_MIN", sweepMin, "Min number of subExecutors to use");
ev.Print("USE_FINE_GRAIN", useFineGrain, "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
printf("\n");
}
// Validate env vars
if (localIdx >= numDetectedGpus || remoteIdx >= numDetectedGpus) {
printf("[ERROR] Cannot execute schmoo test with local GPU device %d, remote GPU device %d\n", localIdx, remoteIdx);
exit(1);
}
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
TransferBench::TestResults results;
char memChar = useFineGrain ? 'F' : 'G';
printf("Bytes to transfer: %lu Local GPU: %d Remote GPU: %d\n", numBytesPerTransfer, localIdx, remoteIdx);
printf(" | Local Read | Local Write | Local Copy | Remote Read | Remote Write| Remote Copy |\n");
printf(" #CUs |%c%02d->G%02d->N00|N00->G%02d->%c%02d|%c%02d->G%02d->%c%02d|%c%02d->G%02d->N00|N00->G%02d->%c%02d|%c%02d->G%02d->%c%02d|\n",
memChar, localIdx, localIdx,
localIdx, memChar, localIdx,
memChar, localIdx, localIdx, memChar, localIdx,
memChar, remoteIdx, localIdx,
localIdx, memChar, remoteIdx,
memChar, localIdx, localIdx, memChar, remoteIdx);
printf("|------|-------------|-------------|-------------|-------------|-------------|-------------|\n");
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeDevice = {EXE_GPU_GFX, localIdx};
t.exeSubIndex = -1;
t.numBytes = numBytesPerTransfer;
MemType memType = (useFineGrain ? MEM_GPU_FINE : MEM_GPU);
for (int numCUs = sweepMin; numCUs <= sweepMax; numCUs++) {
t.numSubExecs = numCUs;
// Local Read
t.srcs = {{memType, localIdx}};
t.dsts = {};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
}
double const localRead = results.tfrResults[0].avgBandwidthGbPerSec;
// Local Write
t.srcs = {};
t.dsts = {{memType, localIdx}};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
}
double const localWrite = results.tfrResults[0].avgBandwidthGbPerSec;
// Local Copy
t.srcs = {{memType, localIdx}};
t.dsts = {{memType, localIdx}};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
}
double const localCopy = results.tfrResults[0].avgBandwidthGbPerSec;
// Remote Read
t.srcs = {{memType, remoteIdx}};
t.dsts = {};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
}
double const remoteRead = results.tfrResults[0].avgBandwidthGbPerSec;
// Remote Write
t.srcs = {};
t.dsts = {{memType, remoteIdx}};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
}
double const remoteWrite = results.tfrResults[0].avgBandwidthGbPerSec;
// Remote Copy
t.srcs = {{memType, localIdx}};
t.dsts = {{memType, remoteIdx}};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
}
double const remoteCopy = results.tfrResults[0].avgBandwidthGbPerSec;
printf(" %3d %11.3f %11.3f %11.3f %11.3f %11.3f %11.3f \n",
numCUs, localRead, localWrite, localCopy, remoteRead, remoteWrite, remoteCopy);
}
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers)
{
fprintf(fp, "# Test %d\n", testNum);
fprintf(fp, "%d", -1 * (int)transfers.size());
for (auto const& transfer : transfers)
{
fprintf(fp, " (%s->%c%d->%s %d %lu)",
MemDevicesToStr(transfer.srcs).c_str(),
ExeTypeStr[transfer.exeDevice.exeType], transfer.exeDevice.exeIndex,
MemDevicesToStr(transfer.dsts).c_str(),
transfer.numSubExecs,
transfer.numBytes);
}
fprintf(fp, "\n");
fflush(fp);
}
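// Example invocation of the sweep preset (illustrative sketch: the environment variables
// are the ones read below, while the exact client command line is an assumption):
//   SWEEP_EXE=G SWEEP_SRC=CG SWEEP_DST=CG SWEEP_TEST_LIMIT=100 ./TransferBench sweep
// Use "rsweep" instead of "sweep" to randomly sample subsets of Transfers rather than
// enumerating them in order. Every test that runs is also logged to lastSweep.cfg via
// LogTransfers above.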
void SweepPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
bool const isRandom = (presetName == "rsweep");
int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
// Collect env vars and set defaults
int continueOnErr = EnvVars::GetEnvVar("CONTINUE_ON_ERROR" , 0);
int numCpuDevices = EnvVars::GetEnvVar("NUM_CPU_DEVICES" , numDetectedCpus);
int numCpuSubExecs = EnvVars::GetEnvVar("NUM_CPU_SE" , 4);
int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus);
int numGpuSubExecs = EnvVars::GetEnvVar("NUM_GPU_SE" , 4);
std::string sweepDst = EnvVars::GetEnvVar("SWEEP_DST" , "CG");
std::string sweepExe = EnvVars::GetEnvVar("SWEEP_EXE" , "CDG");
int sweepMax = EnvVars::GetEnvVar("SWEEP_MAX" , 24);
int sweepMin = EnvVars::GetEnvVar("SWEEP_MIN" , 1);
int sweepRandBytes = EnvVars::GetEnvVar("SWEEP_RAND_BYTES" , 0);
int sweepSeed = EnvVars::GetEnvVar("SWEEP_SEED" , time(NULL));
std::string sweepSrc = EnvVars::GetEnvVar("SWEEP_SRC" , "CG");
int sweepTestLimit = EnvVars::GetEnvVar("SWEEP_TEST_LIMIT" , 0);
int sweepTimeLimit = EnvVars::GetEnvVar("SWEEP_TIME_LIMIT" , 0);
int sweepXgmiMin = EnvVars::GetEnvVar("SWEEP_XGMI_MIN" , 0);
int sweepXgmiMax = EnvVars::GetEnvVar("SWEEP_XGMI_MAX" , -1);
auto generator = new std::default_random_engine(sweepSeed);
// Display env var settings
ev.DisplayEnvVars();
if (!ev.hideEnv) {
int outputToCsv = ev.outputToCsv;
if (!outputToCsv) printf("[Sweep Related]\n");
ev.Print("CONTINUE_ON_ERROR", continueOnErr, continueOnErr ? "Continue on mismatch error" : "Stop after first error");
ev.Print("NUM_CPU_DEVICES", numCpuDevices, "Using %d CPUs", numCpuDevices);
ev.Print("NUM_CPU_SE", numCpuSubExecs, "Using %d CPU threads per CPU executed Transfer", numCpuSubExecs);
ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices);
ev.Print("NUM_GPU_SE", numGpuSubExecs, "Using %d subExecutors/CUs per GPU executed Transfer", numGpuSubExecs);
ev.Print("SWEEP_DST", sweepDst.c_str(), "Destination Memory Types to sweep");
ev.Print("SWEEP_EXE", sweepExe.c_str(), "Executor Types to sweep");
ev.Print("SWEEP_MAX", sweepMax, "Max simultaneous transfers (0 = no limit)");
ev.Print("SWEEP_MIN", sweepMin, "Min simultaenous transfers");
ev.Print("SWEEP_RAND_BYTES", sweepRandBytes, "Using %s number of bytes per Transfer", (sweepRandBytes ? "random" : "constant"));
ev.Print("SWEEP_SEED", sweepSeed, "Random seed set to %d", sweepSeed);
ev.Print("SWEEP_SRC", sweepSrc.c_str(), "Source Memory Types to sweep");
ev.Print("SWEEP_TEST_LIMIT", sweepTestLimit, "Max number of tests to run during sweep (0 = no limit)");
ev.Print("SWEEP_TIME_LIMIT", sweepTimeLimit, "Max number of seconds to run sweep for (0 = no limit)");
ev.Print("SWEEP_XGMI_MAX", sweepXgmiMax, "Max number of XGMI hops for Transfers (-1 = no limit)");
ev.Print("SWEEP_XGMI_MIN", sweepXgmiMin, "Min number of XGMI hops for Transfers");
printf("\n");
}
// Validate env vars
for (auto ch : sweepSrc) {
if (!strchr(MemTypeStr, ch)) {
printf("[ERROR] Unrecognized memory type '%c' specified for sweep source\n", ch);
exit(1);
}
if (strchr(sweepSrc.c_str(), ch) != strrchr(sweepSrc.c_str(), ch)) {
printf("[ERROR] Duplicate memory type '%c' specified for sweep source\n", ch);
exit(1);
}
}
for (auto ch : sweepDst) {
if (!strchr(MemTypeStr, ch)) {
printf("[ERROR] Unrecognized memory type '%c' specified for sweep destination\n", ch);
exit(1);
}
if (strchr(sweepDst.c_str(), ch) != strrchr(sweepDst.c_str(), ch)) {
printf("[ERROR] Duplicate memory type '%c' specified for sweep destination\n", ch);
exit(1);
}
}
for (auto ch : sweepExe) {
if (!strchr(ExeTypeStr, ch)) {
printf("[ERROR] Unrecognized executor type '%c' specified for sweep executor\n", ch);
exit(1);
}
if (strchr(sweepExe.c_str(), ch) != strrchr(sweepExe.c_str(), ch)) {
printf("[ERROR] Duplicate executor type '%c' specified for sweep executor\n", ch);
exit(1);
}
}
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
TransferBench::TestResults results;
// Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets)
std::vector<ExeDevice> exeList;
for (auto exe : sweepExe) {
ExeType exeType;
CharToExeType(exe, exeType);
if (IsGpuExeType(exeType)) {
for (int exeIndex = 0; exeIndex < numGpuDevices; ++exeIndex)
exeList.push_back({exeType, exeIndex});
}
else if (IsCpuExeType(exeType)) {
for (int exeIndex = 0; exeIndex < numCpuDevices; ++exeIndex) {
// Skip NUMA nodes that have no CPUs (e.g. CXL)
if (TransferBench::GetNumSubExecutors({EXE_CPU, exeIndex}) == 0) continue;
exeList.push_back({exeType, exeIndex});
}
}
}
int numExes = exeList.size();
std::vector<MemDevice> srcList;
for (auto src : sweepSrc) {
MemType srcType;
CharToMemType(src, srcType);
int const numDevices = (srcType == MEM_NULL) ? 1 : IsGpuMemType(srcType) ? numGpuDevices : numCpuDevices;
for (int srcIndex = 0; srcIndex < numDevices; ++srcIndex)
srcList.push_back({srcType, srcIndex});
}
int numSrcs = srcList.size();
std::vector<MemDevice> dstList;
for (auto dst : sweepDst) {
MemType dstType;
CharToMemType(dst, dstType);
int const numDevices = (dstType == MEM_NULL) ? 1 : IsGpuMemType(dstType) ? numGpuDevices : numCpuDevices;
for (int dstIndex = 0; dstIndex < numDevices; ++dstIndex)
dstList.push_back({dstType, dstIndex});
}
int numDsts = dstList.size();
// Build array of possibilities, respecting any additional restrictions (e.g. XGMI hop count)
struct TransferInfo
{
MemDevice srcMem;
ExeDevice exeDevice;
MemDevice dstMem;
};
// If either XGMI minimum is non-zero, or XGMI maximum is specified and non-zero then both links must be XGMI
bool const useXgmiOnly = (sweepXgmiMin > 0 || sweepXgmiMax > 0);
std::vector<TransferInfo> possibleTransfers;
TransferInfo tinfo;
for (int i = 0; i < numExes; ++i) {
// Skip CPU executors if XGMI link must be used
if (useXgmiOnly && !IsGpuExeType(exeList[i].exeType)) continue;
tinfo.exeDevice = exeList[i];
bool isXgmiSrc = false;
int numHopsSrc = 0;
for (int j = 0; j < numSrcs; ++j) {
if (IsGpuExeType(exeList[i].exeType) && IsGpuMemType(srcList[j].memType)) {
if (exeList[i].exeIndex != srcList[j].memIndex) {
#if defined(__NVCC__)
isXgmiSrc = false;
#else
uint32_t exeToSrcLinkType, exeToSrcHopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(exeList[i].exeIndex,
srcList[j].memIndex,
&exeToSrcLinkType,
&exeToSrcHopCount));
isXgmiSrc = (exeToSrcLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
if (isXgmiSrc) numHopsSrc = exeToSrcHopCount;
#endif
} else {
isXgmiSrc = true;
numHopsSrc = 0;
}
// Skip this SRC if it is not XGMI but only XGMI links may be used
if (useXgmiOnly && !isXgmiSrc) continue;
// Skip this SRC if XGMI distance is already past limit
if (sweepXgmiMax >= 0 && isXgmiSrc && numHopsSrc > sweepXgmiMax) continue;
} else if (srcList[j].memType != MEM_NULL && useXgmiOnly) continue;
tinfo.srcMem = srcList[j];
bool isXgmiDst = false;
int numHopsDst = 0;
for (int k = 0; k < numDsts; ++k) {
if (IsGpuExeType(exeList[i].exeType) && IsGpuMemType(dstList[k].memType)) {
if (exeList[i].exeIndex != dstList[k].memIndex) {
#if defined(__NVCC__)
isXgmiDst = false;
#else
uint32_t exeToDstLinkType, exeToDstHopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(exeList[i].exeIndex,
dstList[k].memIndex,
&exeToDstLinkType,
&exeToDstHopCount));
isXgmiDst = (exeToDstLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
if (isXgmiDst) numHopsDst = exeToDstHopCount;
#endif
} else {
isXgmiDst = true;
numHopsDst = 0;
}
}
// Skip this DST if it is not XGMI but only XGMI links may be used
if (dstList[k].memType != MEM_NULL && useXgmiOnly && !isXgmiDst) continue;
// Skip this DST if total XGMI distance (SRC + DST) is less than min limit
if (sweepXgmiMin > 0 && (numHopsSrc + numHopsDst < sweepXgmiMin)) continue;
// Skip this DST if total XGMI distance (SRC + DST) is greater than max limit
if (sweepXgmiMax >= 0 && (numHopsSrc + numHopsDst) > sweepXgmiMax) continue;
#if defined(__NVCC__)
// Skip CPU executors on GPU memory on NVIDIA platform
if (IsCpuExeType(exeList[i].exeType) && (IsGpuMemType(srcList[j].memType) || IsGpuMemType(dstList[k].memType)))
continue;
#endif
tinfo.dstMem = dstList[k];
// Skip if there is neither a src nor a dst
if (tinfo.srcMem.memType == MEM_NULL && tinfo.dstMem.memType == MEM_NULL) continue;
possibleTransfers.push_back(tinfo);
}
}
}
int const numPossible = (int)possibleTransfers.size();
int maxParallelTransfers = (sweepMax == 0 ? numPossible : sweepMax);
if (sweepMin > numPossible) {
printf("No valid test configurations exist\n");
return;
}
if (ev.outputToCsv) {
printf("\nTest#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),"
"ExeToSrcLinkType,ExeToDstLinkType,SrcAddr,DstAddr\n");
}
int numTestsRun = 0;
int M = sweepMin;
std::uniform_int_distribution<int> randSize(1, numBytesPerTransfer / sizeof(float));
std::uniform_int_distribution<int> distribution(sweepMin, maxParallelTransfers);
// Log sweep to configuration file
FILE *fp = fopen("lastSweep.cfg", "w");
if (!fp) {
printf("[ERROR] Unable to open lastSweep.cfg. Check permissions\n");
exit(1);
}
// Create bitmask of numPossible triplets, of which M will be chosen
std::string bitmask(M, 1); bitmask.resize(numPossible, 0);
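// NOTE: Combinations are enumerated by permuting this bitmask with std::prev_permutation:
// the M leading 1s select which of the numPossible (SRC,EXE,DST) triplets run together.
// For example, with numPossible = 4 and M = 2 the mask steps through
// 1100 -> 1010 -> 1001 -> 0110 -> 0101 -> 0011 (all C(4,2) = 6 subsets), after which
// prev_permutation returns false and M is incremented below (non-random sweep only).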
auto cpuStart = std::chrono::high_resolution_clock::now();
while (1) {
if (isRandom) {
// Pick random number of simultaneous transfers to execute
// NOTE: This currently skews distribution due to some #s having more possibilities than others
M = distribution(*generator);
// Generate a random bitmask
for (int i = 0; i < numPossible; i++)
bitmask[i] = (i < M) ? 1 : 0;
std::shuffle(bitmask.begin(), bitmask.end(), *generator);
}
// Convert bitmask to list of Transfers
std::vector<Transfer> transfers;
for (int value = 0; value < numPossible; ++value) {
if (bitmask[value]) {
// Convert integer value to (SRC->EXE->DST) triplet
Transfer transfer;
if (possibleTransfers[value].srcMem.memType != MEM_NULL)
transfer.srcs.push_back(possibleTransfers[value].srcMem);
transfer.exeDevice = possibleTransfers[value].exeDevice;
if (possibleTransfers[value].dstMem.memType != MEM_NULL)
transfer.dsts.push_back(possibleTransfers[value].dstMem);
transfer.exeSubIndex = -1;
transfer.numSubExecs = IsGpuExeType(transfer.exeDevice.exeType) ? numGpuSubExecs : numCpuSubExecs;
transfer.numBytes = sweepRandBytes ? randSize(*generator) * sizeof(float) : numBytesPerTransfer;
transfers.push_back(transfer);
}
}
LogTransfers(fp, ++numTestsRun, transfers);
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
if (!continueOnErr) exit(1);
} else {
PrintResults(ev, numTestsRun, transfers, results);
}
// Check for test limit
if (numTestsRun == sweepTestLimit) {
printf("Test limit reached\n");
break;
}
// Check for time limit
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double totalCpuTime = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
if (sweepTimeLimit && totalCpuTime > sweepTimeLimit) {
printf("Time limit exceeded\n");
break;
}
// Increment bitmask if not random sweep
if (!isRandom && !std::prev_permutation(bitmask.begin(), bitmask.end())) {
M++;
// Check for completion
if (M > maxParallelTransfers) {
printf("Sweep complete\n");
break;
}
for (int i = 0; i < numPossible; i++)
bitmask[i] = (i < M) ? 1 : 0;
}
}
fclose(fp);
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include "TransferBench.hpp"
static int RemappedCpuIndex(int origIdx)
{
static std::vector<int> remappingCpu;
// Build CPU remapping on first use
// Skip numa nodes that are not configured
if (remappingCpu.empty()) {
for (int node = 0; node <= numa_max_node(); node++)
if (numa_bitmask_isbitset(numa_get_mems_allowed(), node))
remappingCpu.push_back(node);
}
return remappingCpu[origIdx];
}
void DisplayTopology(bool outputToCsv)
{
int numCpus = TransferBench::GetNumExecutors(EXE_CPU);
int numGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
char sep = (outputToCsv ? ',' : '|');
if (outputToCsv) {
printf("NumCpus,%d\n", numCpus);
printf("NumGpus,%d\n", numGpus);
} else {
printf("\nDetected Topology:\n");
printf("==================\n");
printf(" %d configured CPU NUMA node(s) [%d total]\n", numCpus, numa_max_node() + 1);
printf(" %d GPU device(s)\n", numGpus);
}
// Print out detected CPU topology
printf("\n %c", sep);
for (int j = 0; j < numCpus; j++)
printf("NUMA %02d%c", j, sep);
printf(" #Cpus %c Closest GPU(s)\n", sep);
if (!outputToCsv) {
printf("------------+");
for (int j = 0; j <= numCpus; j++)
printf("-------+");
printf("---------------\n");
}
for (int i = 0; i < numCpus; i++) {
int nodeI = RemappedCpuIndex(i);
printf("NUMA %02d (%02d)%c", i, nodeI, sep);
for (int j = 0; j < numCpus; j++) {
int nodeJ = RemappedCpuIndex(j);
int numaDist = numa_distance(nodeI, nodeJ);
printf(" %5d %c", numaDist, sep);
}
int numCpuCores = 0;
for (int j = 0; j < numa_num_configured_cpus(); j++)
if (numa_node_of_cpu(j) == nodeI) numCpuCores++;
printf(" %5d %c", numCpuCores, sep);
for (int j = 0; j < numGpus; j++) {
if (TransferBench::GetClosestCpuNumaToGpu(j) == nodeI) {
printf(" %d", j);
}
}
printf("\n");
}
printf("\n");
// Print out detected GPU topology
#if defined(__NVCC__)
for (int i = 0; i < numGpus; i++) {
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, i));
printf(" GPU %02d | %s\n", i, prop.name);
}
// No further topology detection done for NVIDIA platforms
return;
#else
// Print headers
if (!outputToCsv) {
printf(" |");
for (int j = 0; j < numGpus; j++) {
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, j));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
printf(" %6s |", archName.c_str());
}
printf("\n");
}
printf(" %c", sep);
for (int j = 0; j < numGpus; j++)
printf(" GPU %02d %c", j, sep);
printf(" PCIe Bus ID %c #CUs %c NUMA %c #DMA %c #XCC\n", sep, sep, sep, sep);
if (!outputToCsv) {
for (int j = 0; j <= numGpus; j++)
printf("--------+");
printf("--------------+------+------+------+------\n");
}
// Loop over each GPU device
for (int i = 0; i < numGpus; i++) {
printf(" GPU %02d %c", i, sep);
// Print off link information
for (int j = 0; j < numGpus; j++) {
if (i == j) {
printf(" N/A %c", sep);
} else {
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
printf(" %s-%d %c",
linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? " HT" :
linkType == HSA_AMD_LINK_INFO_TYPE_QPI ? " QPI" :
linkType == HSA_AMD_LINK_INFO_TYPE_PCIE ? "PCIE" :
linkType == HSA_AMD_LINK_INFO_TYPE_INFINBAND ? "INFB" :
linkType == HSA_AMD_LINK_INFO_TYPE_XGMI ? "XGMI" : "????",
hopCount, sep);
}
}
char pciBusId[20];
HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i));
printf(" %11s %c %4d %c %4d %c %4d %c %4d\n",
pciBusId, sep,
TransferBench::GetNumSubExecutors({EXE_GPU_GFX, i}), sep,
TransferBench::GetClosestCpuNumaToGpu(i), sep,
TransferBench::GetNumExecutorSubIndices({EXE_GPU_DMA, i}), sep,
TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, i}));
}
#endif
}
/*
Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <cstring>
#include <future>
#include <map>
#include <numa.h> // If not found, try installing libnuma-dev (e.g. apt-get install libnuma-dev)
#include <numaif.h>
#include <set>
#include <sstream>
#include <stdarg.h>
#include <thread>
#include <vector>
#if defined(__NVCC__)
#include <cuda_runtime.h>
#else
#include <hip/hip_ext.h>
#include <hip/hip_runtime.h>
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#endif
namespace TransferBench
{
using std::map;
using std::pair;
using std::set;
using std::vector;
constexpr char VERSION[] = "1.54";
/**
* Enumeration of supported Executor types
*
* @note The Executor is the device used to perform a Transfer
* @note The IBVerbs executor is not yet implemented
*/
enum ExeType
{
EXE_CPU = 0, ///< CPU executor (subExecutor = CPU thread)
EXE_GPU_GFX = 1, ///< GPU kernel-based executor (subExecutor = threadblock/CU)
EXE_GPU_DMA = 2, ///< GPU SDMA executor (subExecutor = not supported)
EXE_IBV = 3, ///< IBVerbs executor (subExecutor = queue pair)
};
char const ExeTypeStr[5] = "CGDI";
inline bool IsCpuExeType(ExeType e){ return e == EXE_CPU; }
inline bool IsGpuExeType(ExeType e){ return e == EXE_GPU_GFX || e == EXE_GPU_DMA; }
/**
* An ExeDevice defines a specific Executor
*/
struct ExeDevice
{
ExeType exeType; ///< Executor type
int32_t exeIndex; ///< Executor index
// Default comparison operator
auto operator<=>(const ExeDevice&) const = default;
};
/**
* Enumeration of supported memory types
*
* @note These are possible types of memory to be used as sources/destinations
*/
enum MemType
{
MEM_CPU = 0, ///< Coarse-grained pinned CPU memory
MEM_GPU = 1, ///< Coarse-grained global GPU memory
MEM_CPU_FINE = 2, ///< Fine-grained pinned CPU memory
MEM_GPU_FINE = 3, ///< Fine-grained global GPU memory
MEM_CPU_UNPINNED = 4, ///< Unpinned CPU memory
MEM_NULL = 5, ///< NULL memory - used for empty (non-existent) sources/destinations
MEM_MANAGED = 6 ///< Managed memory
};
char const MemTypeStr[8] = "CGBFUNM";
inline bool IsCpuMemType(MemType m) { return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED); }
inline bool IsGpuMemType(MemType m) { return (m == MEM_GPU || m == MEM_GPU_FINE || m == MEM_MANAGED); }
/**
* A MemDevice indicates a memory type on a specific device
*/
struct MemDevice
{
MemType memType; ///< Memory type
int32_t memIndex; ///< Device index
auto operator<=>(const MemDevice&) const = default;
};
/**
* A Transfer adds together data from zero or more sources, then writes the sum to zero or more destinations
*/
struct Transfer
{
size_t numBytes = (1<<26); ///< # of bytes to Transfer
vector<MemDevice> srcs = {}; ///< List of source memory devices
vector<MemDevice> dsts = {}; ///< List of destination memory devices
ExeDevice exeDevice = {}; ///< Executor to use
int32_t exeDstIndex = -1; ///< Destination executor index (for RDMA executor only)
int32_t exeSubIndex = -1; ///< Executor subindex
int numSubExecs = 0; ///< Number of subExecutors to use for this Transfer
};
/**
* General options
*/
struct GeneralOptions
{
int numIterations = 10; ///< # of timed iterations to perform. If negative, run for -numIterations seconds instead
int numSubIterations = 1; ///< # of sub-iterations per iteration
int numWarmups = 3; ///< Number of un-timed warmup iterations to perform
int recordPerIteration = 0; ///< Record per-iteration timing information
int useInteractive = 0; ///< Pause for user-input before starting transfer loop
};
/**
* Data options
*/
struct DataOptions
{
int alwaysValidate = 0; ///< Validate after each iteration instead of once at end
int blockBytes = 256; ///< Each subexecutor works on a multiple of this many bytes
int byteOffset = 0; ///< Byte-offset for memory allocations
vector<float> fillPattern = {}; ///< Pattern of floats used to fill source data
int validateDirect = 0; ///< Validate GPU results directly instead of copying to host
int validateSource = 0; ///< Validate src GPU memory immediately after preparation
};
/**
* DMA Executor options
*/
struct DmaOptions
{
int useHipEvents = 1; ///< Use HIP events for timing DMA Executor
int useHsaCopy = 0; ///< Use HSA copy instead of HIP copy to perform DMA
};
/**
* GFX Executor options
*/
struct GfxOptions
{
int blockSize = 256; ///< Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask = {}; ///< Bit-vector representing the CU mask
vector<vector<int>> prefXccTable = {}; ///< 2D table with preferred XCD to use for a specific [src][dst] GPU device
int unrollFactor = 4; ///< GFX-kernel unroll factor
int useHipEvents = 1; ///< Use HIP events for timing GFX Executor
int useMultiStream = 0; ///< Use multiple streams for GFX
int useSingleTeam = 1; ///< Team all subExecutors across the data array
int waveOrder = 0; ///< GFX-kernel wavefront ordering
};
/**
* Configuration options for performing Transfers
*/
struct ConfigOptions
{
GeneralOptions general; ///< General options
DataOptions data; ///< Data options
GfxOptions gfx; ///< GFX executor options
DmaOptions dma; ///< DMA executor options
};
/**
* Enumeration of possible error types
*/
enum ErrType
{
ERR_NONE = 0, ///< No errors
ERR_WARN = 1, ///< Warning - results may not be accurate
ERR_FATAL = 2, ///< Fatal error - results are invalid
};
/**
* ErrResult consists of error type and error message
*/
struct ErrResult
{
ErrType errType; ///< Error type
std::string errMsg; ///< Error details
ErrResult() = default;
#if defined(__NVCC__)
ErrResult(cudaError_t err);
#else
ErrResult(hipError_t err);
ErrResult(hsa_status_t err);
#endif
ErrResult(ErrType err);
ErrResult(ErrType errType, const char* format, ...);
};
/**
* Results for a single Executor
*/
struct ExeResult
{
size_t numBytes; ///< Total bytes transferred by this Executor
double avgDurationMsec; ///< Averaged duration for all the Transfers for this Executor
double avgBandwidthGbPerSec; ///< Average bandwidth for this Executor
double sumBandwidthGbPerSec; ///< Naive sum of individual Transfer average bandwidths
vector<int> transferIdx; ///< Indices of the Transfers this Executor executed
};
/**
* Results for a single Transfer
*/
struct TransferResult
{
size_t numBytes; ///< Number of bytes transferred by this Transfer
double avgDurationMsec; ///< Duration for this Transfer, averaged over all timed iterations
double avgBandwidthGbPerSec; ///< Bandwidth for this Transfer based on averaged duration
// Only filled in if recordPerIteration = 1
vector<double> perIterMsec; ///< Duration for each individual iteration
vector<set<pair<int,int>>> perIterCUs; ///< GFX-Executor only. XCC:CU used per iteration
};
/**
* TestResults contain timing results for a set of Transfers as a group as well as per Executor and per Transfer
* timing information
*/
struct TestResults
{
int numTimedIterations; ///< Number of iterations executed
size_t totalBytesTransferred; ///< Total bytes transferred per iteration
double avgTotalDurationMsec; ///< Wall-time (msec) to finish all Transfers (averaged across all timed iterations)
double avgTotalBandwidthGbPerSec; ///< Bandwidth based on all Transfers and average wall time
double overheadMsec; ///< Difference between total wall time and slowest executor
map<ExeDevice, ExeResult> exeResults; ///< Per Executor results
vector<TransferResult> tfrResults; ///< Per Transfer results
vector<ErrResult> errResults; ///< List of any errors/warnings that occurred
};
/**
* Run a set of Transfers
*
* @param[in] config Configuration options
* @param[in] transfers Set of Transfers to execute
* @param[out] results Timing results
* @returns true if and only if Transfers were run successfully without any fatal errors
*/
bool RunTransfers(ConfigOptions const& config,
vector<Transfer> const& transfers,
TestResults& results);
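/**
 * Example usage (illustrative sketch only; not compiled as part of the library):
 *
 * @code
 * using namespace TransferBench;
 *
 * // Copy 64MB from CPU NUMA node 0 to GPU 1, executed by GPU 0's GFX kernel with 4 CUs
 * Transfer t;
 * t.numBytes    = (1 << 26);
 * t.srcs        = {{MEM_CPU, 0}};
 * t.dsts        = {{MEM_GPU, 1}};
 * t.exeDevice   = {EXE_GPU_GFX, 0};
 * t.numSubExecs = 4;
 *
 * ConfigOptions cfg;   // Default configuration
 * TestResults results;
 * if (RunTransfers(cfg, {t}, results))
 *   printf("Average bandwidth: %.3f GB/s\n", results.tfrResults[0].avgBandwidthGbPerSec);
 * @endcode
 */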
/**
* Enumeration of implementation attributes
*/
enum IntAttribute
{
ATR_GFX_MAX_BLOCKSIZE, ///< Maximum blocksize for GFX executor
ATR_GFX_MAX_UNROLL, ///< Maximum unroll factor for GFX executor
};
enum StrAttribute
{
ATR_SRC_PREP_DESCRIPTION ///< Description of how source memory is prepared
};
/**
* Query attributes (integer)
*
* @note This allows querying of implementation information such as limits
*
* @param[in] attribute Attribute to query
* @returns Value of the attribute
*/
int GetIntAttribute(IntAttribute attribute);
/**
* Query attributes (string)
*
* @note This allows querying of implementation details such as limits
*
* @param[in] attribute Attribute to query
* @returns Value of the attribute
*/
std::string GetStrAttribute(StrAttribute attribute);
/**
* Returns the number of available Executors
*
* @param[in] exeType Executor type to query
* @returns Number of detected Executors of exeType
*/
int GetNumExecutors(ExeType exeType);
/**
* Returns the number of possible Executor subindices
*
* @note For CPU, this is 0
* @note For GFX, this refers to the number of XCDs
* @note For DMA, this refers to the number of DMA engines
*
* @param[in] exeDevice The specific Executor to query
* @returns Number of detected executor subindices
*/
int GetNumExecutorSubIndices(ExeDevice exeDevice);
/**
* Returns number of subExecutors for a given ExeDevice
*
* @param[in] exeDevice The specific Executor to query
* @returns Number of detected subExecutors for the given ExeDevice
*/
int GetNumSubExecutors(ExeDevice exeDevice);
/**
* Returns the index of the NUMA node closest to the given GPU
*
* @param[in] gpuIndex Index of the GPU to query
* @returns NUMA node index closest to GPU gpuIndex, or -1 if unable to detect
*/
int GetClosestCpuNumaToGpu(int gpuIndex);
/**
* Helper function to parse a line containing Transfers into a vector of Transfers
*
* @param[in] str String containing description of Transfers
* @param[out] transfers List of Transfers described by 'str'
* @returns Information about any error that may have occurred
*/
ErrResult ParseTransfers(std::string str,
std::vector<Transfer>& transfers);
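/**
 * Example (illustrative): the accepted string mirrors the format that the client's
 * LogTransfers emits, e.g. "-1 (C0->G0->G1 4 67108864)" for one Transfer that reads
 * from CPU NUMA node 0, is executed by GPU 0 with 4 CUs, and writes to GPU 1.
 * Memory tokens use the letters of MemTypeStr followed by a device index, and executor
 * tokens use the letters of ExeTypeStr with an optional subindex (e.g. D0.1 for DMA
 * engine 1 on GPU 0). The leading transfer count and per-Transfer byte count shown
 * above are inferred from the LogTransfers output and are an assumption, not a
 * guarantee of the full grammar.
 */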
};
//==========================================================================================
// End of TransferBench API
//==========================================================================================
// Redefinitions for CUDA compatibility
//==========================================================================================
#if defined(__NVCC__)
// ROCm specific
#define wall_clock64 clock64
#define gcnArchName name
// Datatypes
#define hipDeviceProp_t cudaDeviceProp
#define hipError_t cudaError_t
#define hipEvent_t cudaEvent_t
#define hipStream_t cudaStream_t
// Enumerations
#define hipDeviceAttributeClockRate cudaDevAttrClockRate
#define hipDeviceAttributeMultiprocessorCount cudaDevAttrMultiProcessorCount
#define hipErrorPeerAccessAlreadyEnabled cudaErrorPeerAccessAlreadyEnabled
#define hipFuncCachePreferShared cudaFuncCachePreferShared
#define hipMemcpyDefault cudaMemcpyDefault
#define hipMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define hipMemcpyHostToDevice cudaMemcpyHostToDevice
#define hipSuccess cudaSuccess
// Functions
#define hipDeviceCanAccessPeer cudaDeviceCanAccessPeer
#define hipDeviceEnablePeerAccess cudaDeviceEnablePeerAccess
#define hipDeviceGetAttribute cudaDeviceGetAttribute
#define hipDeviceGetPCIBusId cudaDeviceGetPCIBusId
#define hipDeviceSetCacheConfig cudaDeviceSetCacheConfig
#define hipDeviceSynchronize cudaDeviceSynchronize
#define hipEventCreate cudaEventCreate
#define hipEventDestroy cudaEventDestroy
#define hipEventElapsedTime cudaEventElapsedTime
#define hipEventRecord cudaEventRecord
#define hipFree cudaFree
#define hipGetDeviceCount cudaGetDeviceCount
#define hipGetDeviceProperties cudaGetDeviceProperties
#define hipGetErrorString cudaGetErrorString
#define hipHostFree cudaFreeHost
#define hipHostMalloc cudaMallocHost
#define hipMalloc cudaMalloc
#define hipMallocManaged cudaMallocManaged
#define hipMemcpy cudaMemcpy
#define hipMemcpyAsync cudaMemcpyAsync
#define hipMemset cudaMemset
#define hipMemsetAsync cudaMemsetAsync
#define hipSetDevice cudaSetDevice
#define hipStreamCreate cudaStreamCreate
#define hipStreamDestroy cudaStreamDestroy
#define hipStreamSynchronize cudaStreamSynchronize
// Define float4 addition operator for NVIDIA platform
__device__ inline float4& operator +=(float4& a, const float4& b)
{
a.x += b.x;
a.y += b.y;
a.z += b.z;
a.w += b.w;
return a;
}
#endif
// Helper macro functions
//==========================================================================================
// Macro for collecting CU/SM GFX kernel is running on
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__)
#define GetHwId(hwId) hwId = 0
#elif defined(__NVCC__)
#define GetHwId(hwId) asm("mov.u32 %0, %smid;" : "=r"(hwId))
#else
#define GetHwId(hwId) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (hwId));
#endif
// Macro for collecting XCC GFX kernel is running on
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#define GetXccId(val) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val));
#else
#define GetXccId(val) val = 0
#endif
// Error check macro (NOTE: This will return even for ERR_WARN)
#define ERR_CHECK(cmd) \
do { \
ErrResult err = (cmd); \
if (err.errType != ERR_NONE) \
return err; \
} while (0)
// Appends warn/fatal errors to a list, return false if fatal
#define ERR_APPEND(cmd, list) \
do { \
ErrResult err = (cmd); \
if (err.errType != ERR_NONE) \
list.push_back(err); \
if (err.errType == ERR_FATAL) \
return false; \
} while (0)
namespace TransferBench
{
// Helper functions ('hidden' in anonymous namespace)
//========================================================================================
namespace {
// Constants
//========================================================================================
int constexpr MAX_BLOCKSIZE = 512; // Max threadblock size
int constexpr MAX_WAVEGROUPS = MAX_BLOCKSIZE / 64; // Max wavegroups/warps
int constexpr MAX_UNROLL = 8; // Max unroll factor
int constexpr MAX_SRCS = 8; // Max # srcs per Transfer
int constexpr MAX_DSTS = 8; // Max # dsts per Transfer
int constexpr MEMSET_CHAR = 75; // Value to memset (char)
float constexpr MEMSET_VAL = 13323083.0f; // Value to memset (float)
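// NOTE: These two constants describe the same fill value: memset-ing a float array with
// the byte 75 (0x4B) puts the bit pattern 0x4B4B4B4B in every element, which is the
// IEEE-754 single-precision value 13323083.0f (2^23 + 0x4B4B4B).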
// Parsing-related functions
//========================================================================================
static ErrResult CharToMemType(char const c, MemType& memType)
{
char const* val = strchr(MemTypeStr, toupper(c));
if (val) {
memType = (MemType)(val - MemTypeStr);
return ERR_NONE;
}
return {ERR_FATAL, "Unexpected memory type (%c)", c};
}
static ErrResult CharToExeType(char const c, ExeType& exeType)
{
char const* val = strchr(ExeTypeStr, toupper(c));
if (val) {
exeType = (ExeType)(val - ExeTypeStr);
return ERR_NONE;
}
return {ERR_FATAL, "Unexpected executor type (%c)", c};
}
static ErrResult ParseMemType(std::string const& token,
std::vector<MemDevice>& memDevices)
{
char memTypeChar;
int offset = 0, memIndex, inc;
MemType memType;
bool found = false;
memDevices.clear();
while (sscanf(token.c_str() + offset, " %c %d%n", &memTypeChar, &memIndex, &inc) == 2) {
offset += inc;
ErrResult err = CharToMemType(memTypeChar, memType);
if (err.errType != ERR_NONE) return err;
if (memType != MEM_NULL)
memDevices.push_back({memType, memIndex});
found = true;
}
if (found) return ERR_NONE;
return {ERR_FATAL,
"Unable to parse memory type token %s. Expected one of %s followed by an index",
token.c_str(), MemTypeStr};
}
static ErrResult ParseExeType(std::string const& token,
ExeDevice& exeDevice,
int& exeSubIndex)
{
char exeTypeChar;
exeSubIndex = -1;
int numTokensParsed = sscanf(token.c_str(),
" %c%d.%d", &exeTypeChar, &exeDevice.exeIndex, &exeSubIndex);
if (numTokensParsed < 2) {
return {ERR_FATAL,
"Unable to parse valid executor token (%s)."
"Expected one of %s followed by an index",
token.c_str(), ExeTypeStr};
}
return CharToExeType(exeTypeChar, exeDevice.exeType);
}
// Memory-related functions
//========================================================================================
// Enable peer access between two GPUs
static ErrResult EnablePeerAccess(int const deviceId, int const peerDeviceId)
{
int canAccess;
ERR_CHECK(hipDeviceCanAccessPeer(&canAccess, deviceId, peerDeviceId));
if (!canAccess)
return {ERR_FATAL,
"Unable to enable peer access from GPU devices %d to %d", peerDeviceId, deviceId};
ERR_CHECK(hipSetDevice(deviceId));
hipError_t error = hipDeviceEnablePeerAccess(peerDeviceId, 0);
if (error != hipSuccess && error != hipErrorPeerAccessAlreadyEnabled) {
return {ERR_FATAL,
"Unable to enable peer to peer access from %d to %d (%s)",
deviceId, peerDeviceId, hipGetErrorString(error)};
}
return ERR_NONE;
}
// Check that CPU memory array of numBytes has been allocated on targetId NUMA node
static ErrResult CheckPages(char* array, size_t numBytes, int targetId)
{
size_t const pageSize = getpagesize();
size_t const numPages = (numBytes + pageSize - 1) / pageSize;
std::vector<void *> pages(numPages);
std::vector<int> status(numPages);
pages[0] = array;
for (int i = 1; i < numPages; i++) {
pages[i] = (char*)pages[i-1] + pageSize;
}
long const retCode = move_pages(0, numPages, pages.data(), NULL, status.data(), 0);
if (retCode)
return {ERR_FATAL,
"Unable to collect page table information for allocated memory. "
"Ensure NUMA library is installed properly"};
size_t mistakeCount = 0;
for (size_t i = 0; i < numPages; i++) {
if (status[i] < 0)
return {ERR_FATAL,
"Unexpected page status (%d) for page %llu", status[i], i};
if (status[i] != targetId) mistakeCount++;
}
if (mistakeCount > 0) {
return {ERR_FATAL,
"%lu out of %lu pages for memory allocation were not on NUMA node %d."
" This could be due to hardware memory issues",
mistakeCount, numPages, targetId};
}
return ERR_NONE;
}
// Allocate memory
static ErrResult AllocateMemory(MemDevice memDevice, size_t numBytes, void** memPtr)
{
if (numBytes == 0) {
return {ERR_FATAL, "Unable to allocate 0 bytes"};
}
*memPtr = nullptr;
MemType const& memType = memDevice.memType;
if (IsCpuMemType(memType)) {
// Set numa policy prior to call to hipHostMalloc
numa_set_preferred(memDevice.memIndex);
// Allocate host-pinned memory (should respect NUMA mem policy)
if (memType == MEM_CPU_FINE) {
#if defined (__NVCC__)
return {ERR_FATAL, "Fine-grained CPU memory not supported on NVIDIA platform"};
#else
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser));
#endif
} else if (memType == MEM_CPU) {
#if defined (__NVCC__)
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, 0));
#else
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent));
#endif
} else if (memType == MEM_CPU_UNPINNED) {
*memPtr = numa_alloc_onnode(numBytes, memDevice.memIndex);
}
// Check that the allocated pages are actually on the correct NUMA node
memset(*memPtr, 0, numBytes);
ERR_CHECK(CheckPages((char*)*memPtr, numBytes, memDevice.memIndex));
// Reset to default numa mem policy
numa_set_preferred(-1);
} else if (IsGpuMemType(memType)) {
// Switch to the appropriate GPU
ERR_CHECK(hipSetDevice(memDevice.memIndex));
if (memType == MEM_GPU) {
// Allocate GPU memory on appropriate device
ERR_CHECK(hipMalloc((void**)memPtr, numBytes));
} else if (memType == MEM_GPU_FINE) {
#if defined (__NVCC__)
return {ERR_FATAL, "Fine-grained GPU memory not supported on NVIDIA platform"};
#else
int flag = hipDeviceMallocUncached;
ERR_CHECK(hipExtMallocWithFlags((void**)memPtr, numBytes, flag));
#endif
} else if (memType == MEM_MANAGED) {
ERR_CHECK(hipMallocManaged((void**)memPtr, numBytes));
}
// Clear the memory
ERR_CHECK(hipMemset(*memPtr, 0, numBytes));
ERR_CHECK(hipDeviceSynchronize());
} else {
return {ERR_FATAL, "Unsupported memory type (%d)", memType};
}
return ERR_NONE;
}
// Deallocate memory
static ErrResult DeallocateMemory(MemType memType, void *memPtr, size_t const bytes)
{
// Avoid deallocating nullptr
if (memPtr == nullptr)
return {ERR_FATAL, "Attempted to free null pointer for %lu bytes", bytes};
switch (memType) {
case MEM_CPU: case MEM_CPU_FINE:
{
ERR_CHECK(hipHostFree(memPtr));
break;
}
case MEM_CPU_UNPINNED:
{
numa_free(memPtr, bytes);
break;
}
case MEM_GPU : case MEM_GPU_FINE: case MEM_MANAGED:
{
ERR_CHECK(hipFree(memPtr));
break;
}
default:
return {ERR_FATAL, "Attempting to deallocate unrecognized memory type (%d)", memType};
}
return ERR_NONE;
}
// HSA-related functions
//========================================================================================
#if !defined(__NVCC__)
// Get the hsa_agent_t associated with an ExeDevice
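// NOTE: Rather than relying on a HIP API to map device indices to HSA agents, this helper
// allocates a small scratch buffer on each device, asks hsa_amd_pointer_info() which agent
// owns it, and caches the answers in static lookup tables on first use.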
static ErrResult GetHsaAgent(ExeDevice const& exeDevice, hsa_agent_t& agent)
{
static bool isInitialized = false;
static std::vector<hsa_agent_t> cpuAgents;
static std::vector<hsa_agent_t> gpuAgents;
int const& exeIndex = exeDevice.exeIndex;
int const numCpus = GetNumExecutors(EXE_CPU);
int const numGpus = GetNumExecutors(EXE_GPU_GFX);
// Initialize results on first use
if (!isInitialized) {
hsa_amd_pointer_info_t info;
info.size = sizeof(info);
ErrResult err;
int32_t* tempBuffer;
// Index CPU agents
cpuAgents.clear();
for (int i = 0; i < numCpus; i++) {
ERR_CHECK(AllocateMemory({MEM_CPU, i}, 1024, (void**)&tempBuffer));
ERR_CHECK(hsa_amd_pointer_info(tempBuffer, &info, NULL, NULL, NULL));
cpuAgents.push_back(info.agentOwner);
ERR_CHECK(DeallocateMemory(MEM_CPU, tempBuffer, 1024));
}
// Index GPU agents
gpuAgents.clear();
for (int i = 0; i < numGpus; i++) {
ERR_CHECK(AllocateMemory({MEM_GPU, i}, 1024, (void**)&tempBuffer));
ERR_CHECK(hsa_amd_pointer_info(tempBuffer, &info, NULL, NULL, NULL));
gpuAgents.push_back(info.agentOwner);
ERR_CHECK(DeallocateMemory(MEM_GPU, tempBuffer, 1024));
}
isInitialized = true;
}
switch (exeDevice.exeType) {
case EXE_CPU:
if (exeIndex < 0 || exeIndex >= numCpus)
return {ERR_FATAL, "CPU index must be between 0 and %d inclusively", numCpus - 1};
agent = cpuAgents[exeDevice.exeIndex];
break;
case EXE_GPU_GFX: case EXE_GPU_DMA:
if (exeIndex < 0 || exeIndex >= numGpus)
return {ERR_FATAL, "GPU index must be between 0 and %d inclusively", numGpus - 1};
agent = gpuAgents[exeIndex];
break;
default:
return {ERR_FATAL,
"Attempting to get HSA agent of unknown or unsupported executor type (%d)",
exeDevice.exeType};
}
return ERR_NONE;
}
// Get the hsa_agent_t associated with a MemDevice
static ErrResult GetHsaAgent(MemDevice const& memDevice, hsa_agent_t& agent)
{
if (IsCpuMemType(memDevice.memType)) return GetHsaAgent({EXE_CPU, memDevice.memIndex}, agent);
if (IsGpuMemType(memDevice.memType)) return GetHsaAgent({EXE_GPU_GFX, memDevice.memIndex}, agent);
return {ERR_FATAL,
"Unable to get HSA agent for memDevice (%d,%d)",
memDevice.memType, memDevice.memIndex};
}
#endif
// Setup validation-related functions
//========================================================================================
// Validate that MemDevice exists
static ErrResult CheckMemDevice(MemDevice const& memDevice)
{
if (memDevice.memType == MEM_NULL)
return ERR_NONE;
if (IsCpuMemType(memDevice.memType)) {
int numCpus = GetNumExecutors(EXE_CPU);
if (memDevice.memIndex < 0 || memDevice.memIndex >= numCpus)
return {ERR_FATAL,
"CPU index must be between 0 and %d (instead of %d)", numCpus - 1, memDevice.memIndex};
return ERR_NONE;
}
if (IsGpuMemType(memDevice.memType)) {
int numGpus = GetNumExecutors(EXE_GPU_GFX);
if (memDevice.memIndex < 0 || memDevice.memIndex >= numGpus)
return {ERR_FATAL,
"GPU index must be between 0 and %d (instead of %d)", numGpus - 1, memDevice.memIndex};
return ERR_NONE;
}
return {ERR_FATAL, "Unsupported memory type (%d)", memDevice.memType};
}
// Validate configuration options - returns true if and only if a fatal error is detected
static bool ConfigOptionsHaveErrors(ConfigOptions const& cfg,
std::vector<ErrResult>& errors)
{
// Check general options
if (cfg.general.numWarmups < 0)
errors.push_back({ERR_FATAL, "[general.numWarmups] must be a non-negative number"});
// Check data options
if (cfg.data.blockBytes == 0 || cfg.data.blockBytes % 4)
errors.push_back({ERR_FATAL, "[data.blockBytes] must be positive multiple of %lu", sizeof(float)});
if (cfg.data.byteOffset < 0 || cfg.data.byteOffset % sizeof(float))
errors.push_back({ERR_FATAL, "[data.byteOffset] must be positive multiple of %lu", sizeof(float)});
// Check GFX options
int gfxMaxBlockSize = GetIntAttribute(ATR_GFX_MAX_BLOCKSIZE);
if (cfg.gfx.blockSize < 0 || cfg.gfx.blockSize % 64 || cfg.gfx.blockSize > gfxMaxBlockSize)
errors.push_back({ERR_FATAL,
"[gfx.blockSize] must be positive multiple of 64 less than or equal to %d",
gfxMaxBlockSize});
int gfxMaxUnroll = GetIntAttribute(ATR_GFX_MAX_UNROLL);
if (cfg.gfx.unrollFactor < 0 || cfg.gfx.unrollFactor > gfxMaxUnroll)
errors.push_back({ERR_FATAL,
"[gfx.unrollFactor] must be non-negative and less than or equal to %d",
gfxMaxUnroll});
if (cfg.gfx.waveOrder < 0 || cfg.gfx.waveOrder >= 6)
errors.push_back({ERR_FATAL,
"[gfx.waveOrder] must be non-negative and less than 6"});
int numGpus = GetNumExecutors(EXE_GPU_GFX);
int numXccs = GetNumExecutorSubIndices({EXE_GPU_GFX, 0});
vector<vector<int>> const& table = cfg.gfx.prefXccTable;
if (!table.empty()) {
if (table.size() != numGpus) {
errors.push_back({ERR_FATAL, "[gfx.prefXccTable] must be have size %dx%d", numGpus, numGpus});
} else {
for (int i = 0; i < table.size(); i++) {
if (table[i].size() != numGpus) {
errors.push_back({ERR_FATAL, "[gfx.prefXccTable] must be have size %dx%d", numGpus, numGpus});
break;
} else {
for (auto x : table[i]) {
if (x < 0 || x >= numXccs) {
errors.push_back({ERR_FATAL, "[gfx.prefXccTable] must contain values between 0 and %d",
numXccs - 1});
break;
}
}
}
}
}
}
// NVIDIA specific
#if defined(__NVCC__)
if (cfg.data.validateDirect)
errors.push_back({ERR_FATAL, "[data.validateDirect] is not supported on NVIDIA hardware"});
#else
// AMD specific
// Check for largeBar enablement on GPUs
for (int i = 0; i < numGpus; i++) {
int isLargeBar = 0;
hipError_t err = hipDeviceGetAttribute(&isLargeBar, hipDeviceAttributeIsLargeBar, i);
if (err != hipSuccess) {
errors.push_back({ERR_FATAL, "Unable to query if GPU %d has largeBAR enabled", i});
} else if (!isLargeBar) {
errors.push_back({ERR_WARN,
"Large BAR is not enabled for GPU %d in BIOS. "
"Large BAR is required to enable multi-gpu data access", i});
}
}
#endif
// Check for fatal errors
for (auto const& err : errors)
if (err.errType == ERR_FATAL) return true;
return false;
}
// Validate Transfers to execute - returns true if and only if fatal error detected
static bool TransfersHaveErrors(ConfigOptions const& cfg,
std::vector<Transfer> const& transfers,
std::vector<ErrResult>& errors)
{
int numCpus = GetNumExecutors(EXE_CPU);
int numGpus = GetNumExecutors(EXE_GPU_GFX);
std::set<ExeDevice> executors;
std::map<ExeDevice, int> transferCount;
std::map<ExeDevice, int> useSubIndexCount;
std::map<ExeDevice, int> totalSubExecs;
// Per-Transfer checks
for (size_t i = 0; i < transfers.size(); i++) {
Transfer const& t = transfers[i];
if (t.numBytes == 0)
errors.push_back({ERR_FATAL, "Transfer %d: Cannot perform 0-byte transfers", i});
if (t.exeDevice.exeType == EXE_GPU_GFX || t.exeDevice.exeType == EXE_CPU) {
size_t const N = t.numBytes / sizeof(float);
int const targetMultiple = cfg.data.blockBytes / sizeof(float);
int const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple,
(size_t)t.numSubExecs);
if (maxSubExecToUse < t.numSubExecs)
errors.push_back({ERR_WARN,
"Transfer %d data size is too small - will only use %d of %d subexecutors",
i, maxSubExecToUse, t.numSubExecs});
}
// Check sources and destinations
if (t.srcs.empty() && t.dsts.empty())
errors.push_back({ERR_FATAL, "Transfer %d: Must have at least one source or destination", i});
for (int j = 0; j < t.srcs.size(); j++) {
ErrResult err = CheckMemDevice(t.srcs[j]);
if (err.errType != ERR_NONE)
errors.push_back({ERR_FATAL, "Transfer %d: SRC %d: %s", i, j, err.errMsg.c_str()});
}
for (int j = 0; j < t.dsts.size(); j++) {
ErrResult err = CheckMemDevice(t.dsts[j]);
if (err.errType != ERR_NONE)
errors.push_back({ERR_FATAL, "Transfer %d: DST %d: %s", i, j, err.errMsg.c_str()});
}
// Check executor
executors.insert(t.exeDevice);
transferCount[t.exeDevice]++;
switch (t.exeDevice.exeType) {
case EXE_CPU:
if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numCpus)
errors.push_back({ERR_FATAL,
"Transfer %d: CPU index must be between 0 and %d (instead of %d)",
i, numCpus - 1, t.exeDevice.exeIndex});
break;
case EXE_GPU_GFX:
if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numGpus) {
errors.push_back({ERR_FATAL,
"Transfer %d: GFX index must be between 0 and %d (instead of %d)",
i, numGpus - 1, t.exeDevice.exeIndex});
} else {
if (t.exeSubIndex != -1) {
#if defined(__NVCC__)
errors.push_back({ERR_FATAL,
"Transfer %d: GFX executor subindex not supported on NVIDIA hardware", i});
#else
useSubIndexCount[t.exeDevice]++;
int numSubIndices = GetNumExecutorSubIndices(t.exeDevice);
if (t.exeSubIndex >= numSubIndices)
errors.push_back({ERR_FATAL,
"Transfer %d: GFX subIndex (XCC) must be between 0 and %d", i, numSubIndices - 1});
#endif
}
}
break;
case EXE_GPU_DMA:
if (t.srcs.size() != 1 || t.dsts.size() != 1) {
errors.push_back({ERR_FATAL,
"Transfer %d: DMA executor must have exactly 1 source and 1 destination", i});
}
if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numGpus) {
errors.push_back({ERR_FATAL,
"Transfer %d: DMA index must be between 0 and %d (instead of %d)",
i, numGpus - 1, t.exeDevice.exeIndex});
// Cannot proceed with any further checks
continue;
}
if (t.exeSubIndex != -1) {
#if defined(__NVCC__)
errors.push_back({ERR_FATAL,
"Transfer %d: DMA executor subindex not supported on NVIDIA hardware", i});
#else
useSubIndexCount[t.exeDevice]++;
int numSubIndices = GetNumExecutorSubIndices(t.exeDevice);
if (t.exeSubIndex >= numSubIndices)
errors.push_back({ERR_FATAL,
"Transfer %d: DMA subIndex (engine) must be between 0 and %d",
i, numSubIndices - 1});
// Check that engine Id exists between agents
hsa_agent_t srcAgent, dstAgent;
ErrResult err;
err = GetHsaAgent(t.srcs[0], srcAgent);
if (err.errType != ERR_NONE) {
errors.push_back(err);
if (err.errType == ERR_FATAL) break;
}
err = GetHsaAgent(t.dsts[0], dstAgent);
if (err.errType != ERR_NONE) {
errors.push_back(err);
if (err.errType == ERR_FATAL) break;
}
uint32_t engineIdMask = 0;
err = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &engineIdMask);
if (err.errType != ERR_NONE) {
errors.push_back(err);
if (err.errType == ERR_FATAL) break;
}
hsa_amd_sdma_engine_id_t sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << t.exeSubIndex);
if (!(sdmaEngineId & engineIdMask)) {
errors.push_back({ERR_FATAL,
"Transfer %d: DMA %d.%d does not exist or cannot copy between src/dst",
i, t.exeDevice.exeIndex, t.exeSubIndex});
}
#endif
}
if (!IsGpuMemType(t.srcs[0].memType) || !IsGpuMemType(t.dsts[0].memType)) {
errors.push_back({ERR_WARN,
"Transfer %d: No GPU memory for source or destination. Copy might not execute on DMA %d",
i, t.exeDevice.exeIndex});
} else {
// Currently HIP will use src agent if source memory is GPU, otherwise dst agent
if (IsGpuMemType(t.srcs[0].memType)) {
if (t.srcs[0].memIndex != t.exeDevice.exeIndex) {
errors.push_back({ERR_WARN,
"Transfer %d: DMA executor will automatically switch to using the source memory device (%d) not (%d)",
i, t.srcs[0].memIndex, t.exeDevice.exeIndex});
}
} else if (t.dsts[0].memIndex != t.exeDevice.exeIndex) {
errors.push_back({ERR_WARN,
"Transfer %d: DMA executor will automatically switch to using the destination memory device (%d) not (%d)",
i, t.dsts[0].memIndex, t.exeDevice.exeIndex});
}
}
break;
case EXE_IBV:
errors.push_back({ERR_FATAL, "Transfer %d: IBV executor currently not supported", i});
break;
}
// Check subexecutors
if (t.numSubExecs <= 0)
errors.push_back({ERR_FATAL, "Transfer %d: # of subexecutors must be positive", i});
else
totalSubExecs[t.exeDevice] += t.numSubExecs;
}
int gpuMaxHwQueues = 4;
if (getenv("GPU_MAX_HW_QUEUES"))
gpuMaxHwQueues = atoi(getenv("GPU_MAX_HW_QUEUES"));
// Aggregate checks
for (auto const& exeDevice : executors) {
switch (exeDevice.exeType) {
case EXE_CPU:
{
// Check total number of subexecutors requested
int numCpuSubExec = GetNumSubExecutors(exeDevice);
if (totalSubExecs[exeDevice] > numCpuSubExec)
errors.push_back({ERR_WARN,
"CPU %d requests %d total cores however only %d available. "
"Serialization will occur",
exeDevice.exeIndex, totalSubExecs[exeDevice], numCpuSubExec});
break;
}
case EXE_GPU_GFX:
{
// Check total number of subexecutors requested
int numGpuSubExec = GetNumSubExecutors(exeDevice);
if (totalSubExecs[exeDevice] > numGpuSubExec)
errors.push_back({ERR_WARN,
"GPU %d requests %d total CUs however only %d available. "
"Serialization will occur",
exeDevice.exeIndex, totalSubExecs[exeDevice], numGpuSubExec});
// Check that if executor subindices are used, all Transfers specify executor subindices
if (useSubIndexCount[exeDevice] > 0 && useSubIndexCount[exeDevice] != transferCount[exeDevice]) {
errors.push_back({ERR_FATAL,
"GPU %d specifies XCC on only %d of %d Transfers. "
"Must either specific none or all",
exeDevice.exeIndex, useSubIndexCount[exeDevice], transferCount[exeDevice]});
}
if (cfg.gfx.useMultiStream && transferCount[exeDevice] > gpuMaxHwQueues) {
errors.push_back({ERR_WARN,
"GPU %d attempting %d parallel transfers, however GPU_MAX_HW_QUEUES only set to %d",
exeDevice.exeIndex, transferCount[exeDevice], gpuMaxHwQueues});
}
break;
}
case EXE_GPU_DMA:
{
// Check that if executor subindices are used, all Transfers specify executor subindices
if (useSubIndexCount[exeDevice] > 0 && useSubIndexCount[exeDevice] != transferCount[exeDevice]) {
errors.push_back({ERR_FATAL,
"DMA %d specifies engine on only %d of %d Transfers. "
"Must either specific none or all",
exeDevice.exeIndex, useSubIndexCount[exeDevice], transferCount[exeDevice]});
}
if (transferCount[exeDevice] > gpuMaxHwQueues) {
errors.push_back({ERR_WARN,
"DMA %d attempting %d parallel transfers, however GPU_MAX_HW_QUEUES only set to %d",
exeDevice.exeIndex, transferCount[exeDevice], gpuMaxHwQueues});
}
char* enableSdma = getenv("HSA_ENABLE_SDMA");
if (enableSdma && !strcmp(enableSdma, "0"))
errors.push_back({ERR_WARN,
"DMA functionality disabled due to environment variable HSA_ENABLE_SDMA=0. "
"DMA %d copies will fallback to blit (GFX) kernels", exeDevice.exeIndex});
break;
}
default:
break;
}
}
// Check for fatal errors
for (auto const& err : errors)
if (err.errType == ERR_FATAL) return true;
return false;
}
// Internal data structures
//========================================================================================
// Parameters for each SubExecutor
struct SubExecParam
{
// Inputs
size_t N; ///< Number of floats this subExecutor works on
int numSrcs; ///< Number of source arrays
int numDsts; ///< Number of destination arrays
float* src[MAX_SRCS]; ///< Source array pointers
float* dst[MAX_DSTS]; ///< Destination array pointers
int32_t preferredXccId; ///< XCC ID to execute on (GFX only)
// Prepared
int teamSize; ///< Size of the team this subExecutor is part of
int teamIdx; ///< Index of this subExecutor within its team
// Outputs
long long startCycle; ///< Start timestamp for in-kernel timing (GPU-GFX executor)
long long stopCycle; ///< Stop timestamp for in-kernel timing (GPU-GFX executor)
uint32_t hwId; ///< Hardware ID
uint32_t xccId; ///< XCC ID
};
// Internal resources allocated per Transfer
struct TransferResources
{
int transferIdx; ///< The associated Transfer
size_t numBytes; ///< Number of bytes to Transfer
vector<float*> srcMem; ///< Source memory
vector<float*> dstMem; ///< Destination memory
vector<SubExecParam> subExecParamCpu; ///< Defines subarrays for each subexecutor
vector<int> subExecIdx; ///< Indices into subExecParamGpu
// For GFX executor
SubExecParam* subExecParamGpuPtr;
// For targeted-SDMA
#if !defined(__NVCC__)
hsa_agent_t dstAgent; ///< DMA destination memory agent
hsa_agent_t srcAgent; ///< DMA source memory agent
hsa_signal_t signal; ///< HSA signal for completion
hsa_amd_sdma_engine_id_t sdmaEngineId; ///< DMA engine ID
#endif
// Counters
double totalDurationMsec; ///< Total duration for all iterations for this Transfer
vector<double> perIterMsec; ///< Duration for each individual iteration
vector<set<pair<int,int>>> perIterCUs; ///< GFX-Executor only. XCC:CU used per iteration
};
// Internal resources allocated per Executor
struct ExeInfo
{
size_t totalBytes; ///< Total bytes this executor transfers
double totalDurationMsec; ///< Total duration for all iterations for this Executor
int totalSubExecs; ///< Total number of subExecutors to use
  bool                      useSubIndices;     ///< Use subexecutor indices
int numSubIndices; ///< Number of subindices this ExeDevice has
  int                       wallClockRate;     ///< (GFX-only) Device wall clock rate, in kHz
vector<SubExecParam> subExecParamCpu; ///< Subexecutor parameters for this executor
vector<TransferResources> resources; ///< Per-Transfer resources
// For GPU-Executors
SubExecParam* subExecParamGpu; ///< GPU copy of subExecutor parameters
vector<hipStream_t> streams; ///< HIP streams to launch on
vector<hipEvent_t> startEvents; ///< HIP start timing event
vector<hipEvent_t> stopEvents; ///< HIP stop timing event
};
// Data validation-related functions
//========================================================================================
// Pseudo-random formula for each element in array
static __host__ float PrepSrcValue(int srcBufferIdx, size_t idx)
{
return (((idx % 383) * 517) % 383 + 31) * (srcBufferIdx + 1);
}
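// e.g. for srcBufferIdx 0 the pattern starts 31, 165, 299, ...; for srcBufferIdx 1 the same
// sequence scaled by 2: 62, 330, 598, ...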
// Fills a pre-sized buffer with the source pattern for the given source buffer index
// Note: A negative bufferIdx generates the expected destination buffer for (-bufferIdx - 1) sources:
//       the element-wise sum of those source patterns, or the memset pattern when there are no sources
static void PrepareReference(ConfigOptions const& cfg, std::vector<float>& cpuBuffer, int bufferIdx)
{
size_t N = cpuBuffer.size();
// Source buffer
if (bufferIdx >= 0) {
// Use fill pattern if specified
size_t patternLen = cfg.data.fillPattern.size();
if (patternLen > 0) {
size_t copies = N / patternLen;
size_t leftOver = N % patternLen;
float* cpuBufferPtr = cpuBuffer.data();
for (int i = 0; i < copies; i++) {
memcpy(cpuBufferPtr, cfg.data.fillPattern.data(), patternLen * sizeof(float));
cpuBufferPtr += patternLen;
}
if (leftOver)
memcpy(cpuBufferPtr, cfg.data.fillPattern.data(), leftOver * sizeof(float));
} else {
for (size_t i = 0; i < N; ++i)
cpuBuffer[i] = PrepSrcValue(bufferIdx, i);
}
} else { // Destination buffer
int numSrcs = -bufferIdx - 1;
if (numSrcs == 0) {
      // Note: memset with MEMSET_CHAR fills each float with MEMSET_VAL (bit pattern 0x4B4B4B4B == 13323083.0f)
memset(cpuBuffer.data(), MEMSET_CHAR, N * sizeof(float));
} else {
PrepareReference(cfg, cpuBuffer, 0);
if (numSrcs > 1) {
std::vector<float> temp(N);
for (int i = 1; i < numSrcs; i++) {
PrepareReference(cfg, temp, i);
        for (size_t j = 0; j < N; j++) {
          cpuBuffer[j] += temp[j];
        }
}
}
}
}
}
// Checks that destination buffers match expected values
static ErrResult ValidateAllTransfers(ConfigOptions const& cfg,
vector<Transfer> const& transfers,
vector<TransferResources*> const& transferResources,
vector<vector<float>> const& dstReference,
vector<float>& outputBuffer)
{
float* output;
size_t initOffset = cfg.data.byteOffset / sizeof(float);
for (auto resource : transferResources) {
int transferIdx = resource->transferIdx;
Transfer const& t = transfers[transferIdx];
size_t N = t.numBytes / sizeof(float);
float const* expected = dstReference[t.srcs.size()].data();
for (int dstIdx = 0; dstIdx < resource->dstMem.size(); dstIdx++) {
if (IsCpuMemType(t.dsts[dstIdx].memType) || cfg.data.validateDirect) {
output = (resource->dstMem[dstIdx]) + initOffset;
} else {
ERR_CHECK(hipMemcpy(outputBuffer.data(), (resource->dstMem[dstIdx]) + initOffset, t.numBytes, hipMemcpyDefault));
ERR_CHECK(hipDeviceSynchronize());
output = outputBuffer.data();
}
if (memcmp(output, expected, t.numBytes)) {
// Difference found - find first error
for (size_t i = 0; i < N; i++) {
if (output[i] != expected[i]) {
return {ERR_FATAL, "Transfer %d: Unexpected mismatch at index %lu of destination %d: Expected %10.5f Actual: %10.5f",
transferIdx, i, dstIdx, expected[i], output[i]};
}
}
return {ERR_FATAL, "Transfer %d: Unexpected output mismatch for destination %d", transferIdx, dstIdx};
}
}
}
return ERR_NONE;
}
// Preparation-related functions
//========================================================================================
// Prepares input parameters for each subexecutor
// Determines how sub-executors will split up the work
// Initializes counters
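// Illustrative example (with assumed values): for blockBytes = 256 (targetMultiple = 64 floats),
// a 4000-byte Transfer (N = 1000 floats) split across 4 subexecutors yields sub-array sizes of
// 256, 256, 256 and 232 floats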
static ErrResult PrepareSubExecParams(ConfigOptions const& cfg,
Transfer const& transfer,
TransferResources& resources)
{
// Each subExecutor needs to know src/dst pointers and how many elements to transfer
// Figure out the sub-array each subExecutor works on for this Transfer
// - Partition N as evenly as possible, but try to keep subarray sizes as multiples of data.blockBytes
// except the very last one, for alignment reasons
size_t const N = transfer.numBytes / sizeof(float);
int const initOffset = cfg.data.byteOffset / sizeof(float);
int const targetMultiple = cfg.data.blockBytes / sizeof(float);
  // In some cases, there may not be enough data for all subExecutors
int const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple,
(size_t)transfer.numSubExecs);
vector<SubExecParam>& subExecParam = resources.subExecParamCpu;
subExecParam.clear();
subExecParam.resize(transfer.numSubExecs);
size_t assigned = 0;
for (int i = 0; i < transfer.numSubExecs; ++i) {
SubExecParam& p = subExecParam[i];
p.numSrcs = resources.srcMem.size();
p.numDsts = resources.dstMem.size();
p.startCycle = 0;
p.stopCycle = 0;
p.hwId = 0;
p.xccId = 0;
// In single team mode, subexecutors stripe across the entire array
if (cfg.gfx.useSingleTeam && transfer.exeDevice.exeType == EXE_GPU_GFX) {
p.N = N;
p.teamSize = transfer.numSubExecs;
p.teamIdx = i;
for (int iSrc = 0; iSrc < p.numSrcs; ++iSrc) p.src[iSrc] = resources.srcMem[iSrc] + initOffset;
for (int iDst = 0; iDst < p.numDsts; ++iDst) p.dst[iDst] = resources.dstMem[iDst] + initOffset;
} else {
// Otherwise, each subexecutor works on separate subarrays
int const subExecLeft = std::max(0, maxSubExecToUse - i);
size_t const leftover = N - assigned;
size_t const roundedN = (leftover + targetMultiple - 1) / targetMultiple;
p.N = subExecLeft ? std::min(leftover, ((roundedN / subExecLeft) * targetMultiple)) : 0;
p.teamSize = 1;
p.teamIdx = 0;
for (int iSrc = 0; iSrc < p.numSrcs; ++iSrc) p.src[iSrc] = resources.srcMem[iSrc] + initOffset + assigned;
for (int iDst = 0; iDst < p.numDsts; ++iDst) p.dst[iDst] = resources.dstMem[iDst] + initOffset + assigned;
assigned += p.N;
}
p.preferredXccId = transfer.exeSubIndex;
// Override if XCC table has been specified
vector<vector<int>> const& table = cfg.gfx.prefXccTable;
if (transfer.exeDevice.exeType == EXE_GPU_GFX && transfer.exeSubIndex == -1 && !table.empty() &&
transfer.dsts.size() == 1 && IsGpuMemType(transfer.dsts[0].memType)) {
if (table.size() <= transfer.exeDevice.exeIndex ||
table[transfer.exeDevice.exeIndex].size() <= transfer.dsts[0].memIndex) {
return {ERR_FATAL, "[gfx.xccPrefTable] is too small"};
}
p.preferredXccId = table[transfer.exeDevice.exeIndex][transfer.dsts[0].memIndex];
if (p.preferredXccId < 0 || p.preferredXccId >= GetNumExecutorSubIndices(transfer.exeDevice)) {
return {ERR_FATAL, "[gfx.xccPrefTable] defines out-of-bound XCC index %d", p.preferredXccId};
}
}
}
// Clear counters
resources.totalDurationMsec = 0.0;
return ERR_NONE;
}
// Prepare each executor
// Allocates memory for src/dst, prepares subexecutors, executor-specific data structures
static ErrResult PrepareExecutor(ConfigOptions const& cfg,
vector<Transfer> const& transfers,
ExeDevice const& exeDevice,
ExeInfo& exeInfo)
{
exeInfo.totalDurationMsec = 0.0;
// Loop over each transfer this executor is involved in
for (auto& resources : exeInfo.resources) {
Transfer const& t = transfers[resources.transferIdx];
resources.numBytes = t.numBytes;
// Allocate source memory
resources.srcMem.resize(t.srcs.size());
for (int iSrc = 0; iSrc < t.srcs.size(); ++iSrc) {
MemDevice const& srcMemDevice = t.srcs[iSrc];
// Ensure executing GPU can access source memory
if (exeDevice.exeType == EXE_GPU_GFX && IsGpuMemType(srcMemDevice.memType) &&
srcMemDevice.memIndex != exeDevice.exeIndex) {
ERR_CHECK(EnablePeerAccess(exeDevice.exeIndex, srcMemDevice.memIndex));
}
ERR_CHECK(AllocateMemory(srcMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&resources.srcMem[iSrc]));
}
// Allocate destination memory
resources.dstMem.resize(t.dsts.size());
for (int iDst = 0; iDst < t.dsts.size(); ++iDst) {
MemDevice const& dstMemDevice = t.dsts[iDst];
// Ensure executing GPU can access destination memory
if (exeDevice.exeType == EXE_GPU_GFX && IsGpuMemType(dstMemDevice.memType) &&
dstMemDevice.memIndex != exeDevice.exeIndex) {
ERR_CHECK(EnablePeerAccess(exeDevice.exeIndex, dstMemDevice.memIndex));
}
ERR_CHECK(AllocateMemory(dstMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&resources.dstMem[iDst]));
}
if (exeDevice.exeType == EXE_GPU_DMA && (t.exeSubIndex != -1 || cfg.dma.useHsaCopy)) {
#if !defined(__NVCC__)
// Collect HSA agent information
hsa_amd_pointer_info_t info;
info.size = sizeof(info);
ERR_CHECK(hsa_amd_pointer_info(resources.dstMem[0], &info, NULL, NULL, NULL));
resources.dstAgent = info.agentOwner;
ERR_CHECK(hsa_amd_pointer_info(resources.srcMem[0], &info, NULL, NULL, NULL));
resources.srcAgent = info.agentOwner;
// Create HSA completion signal
ERR_CHECK(hsa_signal_create(1, 0, NULL, &resources.signal));
#endif
}
// Prepare subexecutor parameters
ERR_CHECK(PrepareSubExecParams(cfg, t, resources));
}
// Prepare additional requirements for GPU-based executors
if (exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA) {
ERR_CHECK(hipSetDevice(exeDevice.exeIndex));
// Determine how many streams to use
int const numStreamsToUse = (exeDevice.exeType == EXE_GPU_DMA ||
(exeDevice.exeType == EXE_GPU_GFX && cfg.gfx.useMultiStream))
? exeInfo.resources.size() : 1;
exeInfo.streams.resize(numStreamsToUse);
// Create streams
for (int i = 0; i < numStreamsToUse; ++i) {
if (cfg.gfx.cuMask.size()) {
#if !defined(__NVCC__)
ERR_CHECK(hipExtStreamCreateWithCUMask(&exeInfo.streams[i], cfg.gfx.cuMask.size(),
cfg.gfx.cuMask.data()));
#else
return {ERR_FATAL, "CU Masking in not supported on NVIDIA hardware"};
#endif
} else {
ERR_CHECK(hipStreamCreate(&exeInfo.streams[i]));
}
}
if (cfg.gfx.useHipEvents || cfg.dma.useHipEvents) {
exeInfo.startEvents.resize(numStreamsToUse);
exeInfo.stopEvents.resize(numStreamsToUse);
for (int i = 0; i < numStreamsToUse; ++i) {
ERR_CHECK(hipEventCreate(&exeInfo.startEvents[i]));
ERR_CHECK(hipEventCreate(&exeInfo.stopEvents[i]));
}
}
}
// Prepare for GPU GFX executor
if (exeDevice.exeType == EXE_GPU_GFX) {
// Allocate one contiguous chunk of GPU memory for threadblock parameters
// This allows support for executing one transfer per stream, or all transfers in a single stream
#if !defined(__NVCC__)
MemType memType = MEM_GPU; // AMD hardware can directly access GPU memory from host
#else
MemType memType = MEM_MANAGED; // NVIDIA hardware requires managed memory to access from host
#endif
ERR_CHECK(AllocateMemory({memType, exeDevice.exeIndex}, exeInfo.totalSubExecs * sizeof(SubExecParam),
(void**)&exeInfo.subExecParamGpu));
// Create subexecutor parameter array for entire executor
exeInfo.subExecParamCpu.clear();
exeInfo.numSubIndices = GetNumExecutorSubIndices(exeDevice);
#if defined(__NVCC__)
exeInfo.wallClockRate = 1000000;
#else
ERR_CHECK(hipDeviceGetAttribute(&exeInfo.wallClockRate, hipDeviceAttributeWallClockRate,
exeDevice.exeIndex));
#endif
int transferOffset = 0;
for (auto& resources : exeInfo.resources) {
Transfer const& t = transfers[resources.transferIdx];
resources.subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset;
for (auto p : resources.subExecParamCpu) {
resources.subExecIdx.push_back(exeInfo.subExecParamCpu.size());
exeInfo.subExecParamCpu.push_back(p);
transferOffset++;
}
}
// Copy sub executor parameters to GPU
ERR_CHECK(hipSetDevice(exeDevice.exeIndex));
ERR_CHECK(hipMemcpy(exeInfo.subExecParamGpu,
exeInfo.subExecParamCpu.data(),
exeInfo.totalSubExecs * sizeof(SubExecParam),
hipMemcpyHostToDevice));
ERR_CHECK(hipDeviceSynchronize());
}
return ERR_NONE;
}
// Teardown-related functions
//========================================================================================
// Clean up all resources
static ErrResult TeardownExecutor(ConfigOptions const& cfg,
ExeDevice const& exeDevice,
vector<Transfer> const& transfers,
ExeInfo& exeInfo)
{
// Loop over each transfer this executor is involved in
for (auto& resources : exeInfo.resources) {
Transfer const& t = transfers[resources.transferIdx];
// Deallocate source memory
for (int iSrc = 0; iSrc < t.srcs.size(); ++iSrc) {
ERR_CHECK(DeallocateMemory(t.srcs[iSrc].memType, resources.srcMem[iSrc], t.numBytes + cfg.data.byteOffset));
}
// Deallocate destination memory
for (int iDst = 0; iDst < t.dsts.size(); ++iDst) {
ERR_CHECK(DeallocateMemory(t.dsts[iDst].memType, resources.dstMem[iDst], t.numBytes + cfg.data.byteOffset));
}
// Destroy HSA signal for DMA executor
#if !defined(__NVCC__)
if (exeDevice.exeType == EXE_GPU_DMA && (t.exeSubIndex != -1 || cfg.dma.useHsaCopy)) {
ERR_CHECK(hsa_signal_destroy(resources.signal));
}
#endif
}
// Teardown additional requirements for GPU-based executors
if (exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA) {
for (auto stream : exeInfo.streams)
ERR_CHECK(hipStreamDestroy(stream));
if (cfg.gfx.useHipEvents || cfg.dma.useHipEvents) {
for (auto event : exeInfo.startEvents)
ERR_CHECK(hipEventDestroy(event));
for (auto event : exeInfo.stopEvents)
ERR_CHECK(hipEventDestroy(event));
}
}
if (exeDevice.exeType == EXE_GPU_GFX) {
#if !defined(__NVCC__)
MemType memType = MEM_GPU;
#else
MemType memType = MEM_MANAGED;
#endif
ERR_CHECK(DeallocateMemory(memType, exeInfo.subExecParamGpu, exeInfo.totalSubExecs * sizeof(SubExecParam)));
}
return ERR_NONE;
}
// CPU Executor-related functions
//========================================================================================
// Kernel for CPU execution (run by a single subexecutor)
static void CpuReduceKernel(SubExecParam const& p)
{
if (p.N == 0) return;
int const& numSrcs = p.numSrcs;
int const& numDsts = p.numDsts;
if (numSrcs == 0) {
for (int i = 0; i < numDsts; ++i) {
memset(p.dst[i], MEMSET_CHAR, p.N * sizeof(float));
//for (int j = 0; j < p.N; j++) p.dst[i][j] = MEMSET_VAL;
}
} else if (numSrcs == 1) {
float const* __restrict__ src = p.src[0];
if (numDsts == 0) {
float sum = 0.0;
for (int j = 0; j < p.N; j++)
sum += p.src[0][j];
// Add a dummy check to ensure the read is not optimized out
if (sum != sum) {
printf("[ERROR] Nan detected\n");
}
} else {
for (int i = 0; i < numDsts; ++i)
memcpy(p.dst[i], src, p.N * sizeof(float));
}
} else {
float sum = 0.0f;
for (int j = 0; j < p.N; j++) {
sum = p.src[0][j];
for (int i = 1; i < numSrcs; i++) sum += p.src[i][j];
for (int i = 0; i < numDsts; i++) p.dst[i][j] = sum;
}
}
}
// Executes a single CPU Transfer
static void ExecuteCpuTransfer(int const iteration,
ConfigOptions const& cfg,
int const exeIndex,
TransferResources& resources)
{
auto cpuStart = std::chrono::high_resolution_clock::now();
vector<std::thread> childThreads;
int subIteration = 0;
do {
for (auto const& subExecParam : resources.subExecParamCpu)
childThreads.emplace_back(std::thread(CpuReduceKernel, std::cref(subExecParam)));
for (auto& subExecThread : childThreads)
subExecThread.join();
childThreads.clear();
} while (++subIteration != cfg.general.numSubIterations);
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0) {
resources.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration)
resources.perIterMsec.push_back(deltaMsec);
}
}
// Execution of a single CPU executor
static ErrResult RunCpuExecutor(int const iteration,
ConfigOptions const& cfg,
int const exeIndex,
ExeInfo& exeInfo)
{
numa_run_on_node(exeIndex);
auto cpuStart = std::chrono::high_resolution_clock::now();
vector<std::thread> asyncTransfers;
for (auto& resource : exeInfo.resources) {
asyncTransfers.emplace_back(std::thread(ExecuteCpuTransfer,
iteration,
std::cref(cfg),
exeIndex,
std::ref(resource)));
}
for (auto& asyncTransfer : asyncTransfers)
asyncTransfer.join();
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0)
exeInfo.totalDurationMsec += deltaMsec;
return ERR_NONE;
}
// GFX Executor-related functions
//========================================================================================
// Converts register value to a CU/SM index
static uint32_t GetId(uint32_t hwId)
{
#if defined(__NVCC__)
return hwId;
#else
// Based on instinct-mi200-cdna2-instruction-set-architecture.pdf
int const shId = (hwId >> 12) & 1;
int const cuId = (hwId >> 8) & 15;
int const seId = (hwId >> 13) & 3;
return (shId << 5) + (cuId << 2) + seId;
#endif
}
// Device level timestamp function
__device__ int64_t GetTimestamp()
{
#if defined(__NVCC__)
int64_t result;
asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(result));
return result;
#else
return wall_clock64();
#endif
}
// Helper function for memset
template <typename T> __device__ __forceinline__ T MemsetVal();
template <> __device__ __forceinline__ float MemsetVal(){ return MEMSET_VAL; };
template <> __device__ __forceinline__ float4 MemsetVal(){ return make_float4(MEMSET_VAL,
MEMSET_VAL,
MEMSET_VAL,
MEMSET_VAL); }
// Kernel for GFX execution
template <int BLOCKSIZE, int UNROLL>
__global__ void __launch_bounds__(BLOCKSIZE)
GpuReduceKernel(SubExecParam* params, int waveOrder, int numSubIterations)
{
int64_t startCycle;
if (threadIdx.x == 0) startCycle = GetTimestamp();
SubExecParam& p = params[blockIdx.y];
// Filter by XCC
#if !defined(__NVCC__)
int32_t xccId;
GetXccId(xccId);
if (p.preferredXccId != -1 && xccId != p.preferredXccId) return;
#endif
// Collect data information
int32_t const numSrcs = p.numSrcs;
int32_t const numDsts = p.numDsts;
float4 const* __restrict__ srcFloat4[MAX_SRCS];
float4* __restrict__ dstFloat4[MAX_DSTS];
for (int i = 0; i < numSrcs; i++) srcFloat4[i] = (float4*)p.src[i];
for (int i = 0; i < numDsts; i++) dstFloat4[i] = (float4*)p.dst[i];
// Operate on wavefront granularity
int32_t const nTeams = p.teamSize; // Number of threadblocks working together on this subarray
int32_t const teamIdx = p.teamIdx; // Index of this threadblock within the team
int32_t const nWaves = BLOCKSIZE / warpSize; // Number of wavefronts within this threadblock
int32_t const waveIdx = threadIdx.x / warpSize; // Index of this wavefront within the threadblock
int32_t const tIdx = threadIdx.x % warpSize; // Thread index within wavefront
size_t const numFloat4 = p.N / 4;
int32_t teamStride, waveStride, unrlStride, teamStride2, waveStride2;
switch (waveOrder) {
case 0: /* U,W,C */ unrlStride = 1; waveStride = UNROLL; teamStride = UNROLL * nWaves; teamStride2 = nWaves; waveStride2 = 1 ; break;
case 1: /* U,C,W */ unrlStride = 1; teamStride = UNROLL; waveStride = UNROLL * nTeams; teamStride2 = 1; waveStride2 = nTeams; break;
case 2: /* W,U,C */ waveStride = 1; unrlStride = nWaves; teamStride = nWaves * UNROLL; teamStride2 = nWaves; waveStride2 = 1 ; break;
case 3: /* W,C,U */ waveStride = 1; teamStride = nWaves; unrlStride = nWaves * nTeams; teamStride2 = nWaves; waveStride2 = 1 ; break;
case 4: /* C,U,W */ teamStride = 1; unrlStride = nTeams; waveStride = nTeams * UNROLL; teamStride2 = 1; waveStride2 = nTeams; break;
case 5: /* C,W,U */ teamStride = 1; waveStride = nTeams; unrlStride = nTeams * nWaves; teamStride2 = 1; waveStride2 = nTeams; break;
}
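  // The first letter in each label above (Unroll / Wavefront / team-of-threadblocks) is the index
  // that varies fastest across consecutive warpSize-wide float4 chunks; the last letter varies slowest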
int subIterations = 0;
while (1) {
// First loop: Each wavefront in the team works on UNROLL float4s per thread
size_t const loop1Stride = nTeams * nWaves * UNROLL * warpSize;
size_t const loop1Limit = numFloat4 / loop1Stride * loop1Stride;
{
float4 val[UNROLL];
if (numSrcs == 0) {
#pragma unroll
for (int u = 0; u < UNROLL; u++)
val[u] = MemsetVal<float4>();
}
for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx; idx < loop1Limit; idx += loop1Stride) {
// Read sources into memory and accumulate in registers
if (numSrcs) {
for (int u = 0; u < UNROLL; u++)
val[u] = srcFloat4[0][idx + u * unrlStride * warpSize];
for (int s = 1; s < numSrcs; s++)
for (int u = 0; u < UNROLL; u++)
val[u] += srcFloat4[s][idx + u * unrlStride * warpSize];
}
// Write accumulation to all outputs
for (int d = 0; d < numDsts; d++) {
#pragma unroll
for (int u = 0; u < UNROLL; u++)
dstFloat4[d][idx + u * unrlStride * warpSize] = val[u];
}
}
}
// Second loop: Deal with remaining float4s
{
if (loop1Limit < numFloat4) {
float4 val;
if (numSrcs == 0) val = MemsetVal<float4>();
size_t const loop2Stride = nTeams * nWaves * warpSize;
for (size_t idx = loop1Limit + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx;
idx < numFloat4; idx += loop2Stride) {
if (numSrcs) {
val = srcFloat4[0][idx];
for (int s = 1; s < numSrcs; s++)
val += srcFloat4[s][idx];
}
for (int d = 0; d < numDsts; d++)
dstFloat4[d][idx] = val;
}
}
}
    // Third loop: Deal with remaining floats
{
if (numFloat4 * 4 < p.N) {
float val;
if (numSrcs == 0) val = MemsetVal<float>();
size_t const loop3Stride = nTeams * nWaves * warpSize;
for( size_t idx = numFloat4 * 4 + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride) {
if (numSrcs) {
val = p.src[0][idx];
for (int s = 1; s < numSrcs; s++)
val += p.src[s][idx];
}
for (int d = 0; d < numDsts; d++)
p.dst[d][idx] = val;
}
}
}
if (++subIterations == numSubIterations) break;
}
// Wait for all threads to finish
__syncthreads();
if (threadIdx.x == 0) {
__threadfence_system();
p.stopCycle = GetTimestamp();
p.startCycle = startCycle;
GetHwId(p.hwId);
GetXccId(p.xccId);
}
}
#define GPU_KERNEL_UNROLL_DECL(BLOCKSIZE) \
{GpuReduceKernel<BLOCKSIZE, 1>, \
GpuReduceKernel<BLOCKSIZE, 2>, \
GpuReduceKernel<BLOCKSIZE, 3>, \
GpuReduceKernel<BLOCKSIZE, 4>, \
GpuReduceKernel<BLOCKSIZE, 5>, \
GpuReduceKernel<BLOCKSIZE, 6>, \
GpuReduceKernel<BLOCKSIZE, 7>, \
GpuReduceKernel<BLOCKSIZE, 8>}
// Table of all GPU Reduction kernel functions (templated blocksize / unroll)
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int);
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL] =
{
GPU_KERNEL_UNROLL_DECL(64),
GPU_KERNEL_UNROLL_DECL(128),
GPU_KERNEL_UNROLL_DECL(192),
GPU_KERNEL_UNROLL_DECL(256),
GPU_KERNEL_UNROLL_DECL(320),
GPU_KERNEL_UNROLL_DECL(384),
GPU_KERNEL_UNROLL_DECL(448),
GPU_KERNEL_UNROLL_DECL(512)
};
#undef GPU_KERNEL_UNROLL_DECL
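// Kernel launches index this table as GpuKernelTable[blockSize/64 - 1][unrollFactor - 1],
// i.e. block sizes are multiples of 64 from 64 to 512 and unroll factors range from 1 to 8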
// Execute a single GPU Transfer (when using 1 stream per Transfer)
static ErrResult ExecuteGpuTransfer(int const iteration,
hipStream_t const stream,
int const xccDim,
ConfigOptions const& cfg,
TransferResources& resources)
{
auto cpuStart = std::chrono::high_resolution_clock::now();
int numSubExecs = resources.subExecParamCpu.size();
dim3 const gridSize(xccDim, numSubExecs, 1);
dim3 const blockSize(cfg.gfx.blockSize, 1);
#if defined(__NVCC__)
GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1]
<<<gridSize, blockSize, 0, stream>>>
(resources.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations);
#else
hipExtLaunchKernelGGL(GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1],
gridSize, blockSize, 0, stream,
NULL, NULL,
0, resources.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations);
#endif
ERR_CHECK(hipStreamSynchronize(stream));
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0) {
resources.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) {
resources.perIterMsec.push_back(deltaMsec);
#if !defined(__NVCC__)
std::set<std::pair<int,int>> CUs;
for (int i = 0; i < numSubExecs; i++) {
CUs.insert(std::make_pair(resources.subExecParamGpuPtr[i].xccId,
GetId(resources.subExecParamGpuPtr[i].hwId)));
}
resources.perIterCUs.push_back(CUs);
#endif
}
}
return ERR_NONE;
}
// Execute a single GPU executor
static ErrResult RunGpuExecutor(int const iteration,
ConfigOptions const& cfg,
int const exeIndex,
ExeInfo& exeInfo)
{
auto cpuStart = std::chrono::high_resolution_clock::now();
ERR_CHECK(hipSetDevice(exeIndex));
int xccDim = exeInfo.useSubIndices ? exeInfo.numSubIndices : 1;
if (cfg.gfx.useMultiStream) {
// Launch each Transfer separately in its own stream
vector<std::future<ErrResult>> asyncTransfers;
for (int i = 0; i < exeInfo.streams.size(); i++) {
asyncTransfers.emplace_back(std::async(std::launch::async,
ExecuteGpuTransfer,
iteration,
exeInfo.streams[i],
xccDim,
std::cref(cfg),
std::ref(exeInfo.resources[i])));
}
for (auto& asyncTransfer : asyncTransfers)
ERR_CHECK(asyncTransfer.get());
} else {
// Combine all the Transfers into a single kernel launch
int numSubExecs = exeInfo.totalSubExecs;
dim3 const gridSize(xccDim, numSubExecs, 1);
dim3 const blockSize(cfg.gfx.blockSize, 1);
hipStream_t stream = exeInfo.streams[0];
#if defined(__NVCC__)
if (cfg.gfx.useHipEvents)
ERR_CHECK(hipEventRecord(exeInfo.startEvents[0], stream));
GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1]
<<<gridSize, blockSize, 0 , stream>>>
(exeInfo.subExecParamGpu, cfg.gfx.waveOrder, cfg.general.numSubIterations);
if (cfg.gfx.useHipEvents)
ERR_CHECK(hipEventRecord(exeInfo.stopEvents[0], stream));
#else
hipExtLaunchKernelGGL(GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1],
gridSize, blockSize, 0, stream,
cfg.gfx.useHipEvents ? exeInfo.startEvents[0] : NULL,
cfg.gfx.useHipEvents ? exeInfo.stopEvents[0] : NULL, 0,
exeInfo.subExecParamGpu, cfg.gfx.waveOrder, cfg.general.numSubIterations);
#endif
ERR_CHECK(hipStreamSynchronize(stream));
}
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0) {
if (cfg.gfx.useHipEvents) {
float gpuDeltaMsec;
ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, exeInfo.startEvents[0], exeInfo.stopEvents[0]));
exeInfo.totalDurationMsec += gpuDeltaMsec;
} else {
exeInfo.totalDurationMsec += cpuDeltaMsec;
}
// Determine timing for each of the individual transfers that were part of this launch
for (int i = 0; i < exeInfo.resources.size(); i++) {
TransferResources& resources = exeInfo.resources[i];
long long minStartCycle = std::numeric_limits<long long>::max();
long long maxStopCycle = std::numeric_limits<long long>::min();
std::set<std::pair<int, int>> CUs;
for (auto subExecIdx : resources.subExecIdx) {
minStartCycle = std::min(minStartCycle, exeInfo.subExecParamGpu[subExecIdx].startCycle);
maxStopCycle = std::max(maxStopCycle, exeInfo.subExecParamGpu[subExecIdx].stopCycle);
if (cfg.general.recordPerIteration) {
#if !defined(__NVCC__)
CUs.insert(std::make_pair(exeInfo.subExecParamGpu[subExecIdx].xccId,
GetId(exeInfo.subExecParamGpu[subExecIdx].hwId)));
#endif
}
}
double deltaMsec = (maxStopCycle - minStartCycle) / (double)(exeInfo.wallClockRate);
resources.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) {
resources.perIterMsec.push_back(deltaMsec);
#if !defined(__NVCC__)
resources.perIterCUs.push_back(CUs);
#endif
}
}
}
return ERR_NONE;
}
// DMA Executor-related functions
//========================================================================================
// Execute a single DMA Transfer
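// Copies with hipMemcpyAsync when no engine subindex is requested and HSA copy is disabled;
// otherwise uses hsa_amd_memory_async_copy(_on_engine) with an HSA completion signal that is
// initialized to 1 and waited on until the runtime drops it below 1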
static ErrResult ExecuteDmaTransfer(int const iteration,
bool const useSubIndices,
hipStream_t const stream,
hipEvent_t const startEvent,
hipEvent_t const stopEvent,
ConfigOptions const& cfg,
TransferResources& resources)
{
auto cpuStart = std::chrono::high_resolution_clock::now();
int subIterations = 0;
if (!useSubIndices && !cfg.dma.useHsaCopy) {
if (cfg.dma.useHipEvents)
ERR_CHECK(hipEventRecord(startEvent, stream));
// Use hipMemcpy
do {
ERR_CHECK(hipMemcpyAsync(resources.dstMem[0], resources.srcMem[0], resources.numBytes,
hipMemcpyDefault, stream));
} while (++subIterations != cfg.general.numSubIterations);
if (cfg.dma.useHipEvents)
ERR_CHECK(hipEventRecord(stopEvent, stream));
ERR_CHECK(hipStreamSynchronize(stream));
} else {
#if defined(__NVCC__)
return {ERR_FATAL, "HSA copy not supported on NVIDIA hardware"};
#else
// Use HSA async copy
do {
hsa_signal_store_screlease(resources.signal, 1);
if (cfg.dma.useHsaCopy) {
ERR_CHECK(hsa_amd_memory_async_copy(resources.dstMem[0], resources.dstAgent,
resources.srcMem[0], resources.srcAgent,
resources.numBytes, 0, NULL,
resources.signal));
} else {
HSA_CALL(hsa_amd_memory_async_copy_on_engine(resources.dstMem[0], resources.dstAgent,
resources.srcMem[0], resources.srcAgent,
resources.numBytes, 0, NULL,
resources.signal,
resources.sdmaEngineId, true));
}
// Wait for SDMA transfer to complete
while(hsa_signal_wait_scacquire(resources.signal,
HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
HSA_WAIT_STATE_ACTIVE) >= 1);
} while (++subIterations != cfg.general.numSubIterations);
#endif
}
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0) {
double deltaMsec = cpuDeltaMsec;
if (!useSubIndices && !cfg.dma.useHsaCopy && cfg.dma.useHipEvents) {
float gpuDeltaMsec;
ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
deltaMsec = gpuDeltaMsec;
}
resources.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration)
resources.perIterMsec.push_back(deltaMsec);
}
return ERR_NONE;
}
// Execute a single DMA executor
static ErrResult RunDmaExecutor(int const iteration,
ConfigOptions const& cfg,
int const exeIndex,
ExeInfo& exeInfo)
{
auto cpuStart = std::chrono::high_resolution_clock::now();
ERR_CHECK(hipSetDevice(exeIndex));
vector<std::future<ErrResult>> asyncTransfers;
for (int i = 0; i < exeInfo.resources.size(); i++) {
asyncTransfers.emplace_back(std::async(std::launch::async,
ExecuteDmaTransfer,
iteration,
exeInfo.useSubIndices,
exeInfo.streams[i],
cfg.dma.useHipEvents ? exeInfo.startEvents[i] : NULL,
cfg.dma.useHipEvents ? exeInfo.stopEvents[i] : NULL,
std::cref(cfg),
std::ref(exeInfo.resources[i])));
}
for (auto& asyncTransfer : asyncTransfers)
ERR_CHECK(asyncTransfer.get());
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0)
exeInfo.totalDurationMsec += deltaMsec;
return ERR_NONE;
}
// Executor-related functions
//========================================================================================
static ErrResult RunExecutor(int const iteration,
ConfigOptions const& cfg,
ExeDevice const& exeDevice,
ExeInfo& exeInfo)
{
switch (exeDevice.exeType) {
case EXE_CPU: return RunCpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
case EXE_GPU_GFX: return RunGpuExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
case EXE_GPU_DMA: return RunDmaExecutor(iteration, cfg, exeDevice.exeIndex, exeInfo);
default: return {ERR_FATAL, "Unsupported executor (%d)", exeDevice.exeType};
}
}
} // End of anonymous namespace
//========================================================================================
ErrResult::ErrResult(ErrType err) : errType(err), errMsg("") {};
ErrResult::ErrResult(hipError_t err)
{
if (err == hipSuccess) {
this->errType = ERR_NONE;
this->errMsg = "";
} else {
this->errType = ERR_FATAL;
this->errMsg = std::string("HIP Error: ") + hipGetErrorString(err);
}
}
#if !defined(__NVCC__)
ErrResult::ErrResult(hsa_status_t err)
{
if (err == HSA_STATUS_SUCCESS) {
this->errType = ERR_NONE;
this->errMsg = "";
} else {
const char *errString = NULL;
hsa_status_string(err, &errString);
this->errType = ERR_FATAL;
this->errMsg = std::string("HSA Error: ") + errString;
}
}
#endif
ErrResult::ErrResult(ErrType errType, const char* format, ...)
{
this->errType = errType;
va_list args, args_temp;
va_start(args, format);
va_copy(args_temp, args);
int len = vsnprintf(nullptr, 0, format, args);
if (len < 0) {
va_end(args_temp);
va_end(args);
} else {
this->errMsg.resize(len);
vsnprintf(this->errMsg.data(), len+1, format, args_temp);
}
va_end(args_temp);
va_end(args);
}
bool RunTransfers(ConfigOptions const& cfg,
std::vector<Transfer> const& transfers,
TestResults& results)
{
// Clear all errors;
auto& errResults = results.errResults;
errResults.clear();
// Check for valid configuration
if (ConfigOptionsHaveErrors(cfg, errResults)) return false;
// Check for valid transfers
if (TransfersHaveErrors(cfg, transfers, errResults)) return false;
// Collect up transfers by executor
int minNumSrcs = MAX_SRCS + 1;
int maxNumSrcs = 0;
size_t maxNumBytes = 0;
std::map<ExeDevice, ExeInfo> executorMap;
for (int i = 0; i < transfers.size(); i++) {
Transfer const& t = transfers[i];
ExeInfo& exeInfo = executorMap[t.exeDevice];
exeInfo.totalBytes += t.numBytes;
exeInfo.totalSubExecs += t.numSubExecs;
exeInfo.useSubIndices |= (t.exeSubIndex != -1);
TransferResources resource = {};
resource.transferIdx = i;
exeInfo.resources.push_back(resource);
minNumSrcs = std::min(minNumSrcs, (int)t.srcs.size());
maxNumSrcs = std::max(maxNumSrcs, (int)t.srcs.size());
maxNumBytes = std::max(maxNumBytes, t.numBytes);
}
// Loop over each executor and prepare
// - Allocates memory for each Transfer
// - Set up work for subexecutors
vector<TransferResources*> transferResources;
for (auto& exeInfoPair : executorMap) {
ExeDevice const& exeDevice = exeInfoPair.first;
ExeInfo& exeInfo = exeInfoPair.second;
ERR_APPEND(PrepareExecutor(cfg, transfers, exeDevice, exeInfo), errResults);
for (auto& resource : exeInfo.resources) {
transferResources.push_back(&resource);
}
}
// Prepare reference src/dst arrays - only once for largest size
size_t maxN = maxNumBytes / sizeof(float);
vector<float> outputBuffer(maxN);
vector<vector<float>> dstReference(maxNumSrcs + 1, vector<float>(maxN));
{
vector<vector<float>> srcReference(maxNumSrcs, vector<float>(maxN));
memset(dstReference[0].data(), MEMSET_CHAR, maxNumBytes);
for (int numSrcs = 0; numSrcs < maxNumSrcs; numSrcs++) {
PrepareReference(cfg, srcReference[numSrcs], numSrcs);
for (int i = 0; i < maxN; i++) {
dstReference[numSrcs+1][i] = (numSrcs == 0 ? 0 : dstReference[numSrcs][i]) + srcReference[numSrcs][i];
}
}
// Release un-used partial sums
for (int numSrcs = 0; numSrcs < minNumSrcs; numSrcs++)
dstReference[numSrcs].clear();
// Initialize all src memory buffers
for (auto resource : transferResources) {
for (int srcIdx = 0; srcIdx < resource->srcMem.size(); srcIdx++) {
ERR_APPEND(hipMemcpy(resource->srcMem[srcIdx], srcReference[srcIdx].data(), resource->numBytes,
hipMemcpyDefault), errResults);
}
}
}
// Perform iterations
size_t numTimedIterations = 0;
double totalCpuTimeSec = 0.0;
for (int iteration = -cfg.general.numWarmups; ; iteration++) {
// Stop if number of iterations/seconds has reached limit
if (cfg.general.numIterations > 0 && iteration >= cfg.general.numIterations) break;
if (cfg.general.numIterations < 0 && totalCpuTimeSec > -cfg.general.numIterations) break;
    // Pause before starting the first timed iteration in interactive mode
if (cfg.general.useInteractive && iteration == 0) {
printf("Memory prepared:\n");
for (int i = 0; i < transfers.size(); i++) {
ExeInfo const& exeInfo = executorMap[transfers[i].exeDevice];
printf("Transfer %03d:\n", i);
for (int iSrc = 0; iSrc < transfers[i].srcs.size(); ++iSrc)
printf(" SRC %0d: %p\n", iSrc, exeInfo.resources[i].srcMem[iSrc]);
for (int iDst = 0; iDst < transfers[i].dsts.size(); ++iDst)
printf(" DST %0d: %p\n", iDst, exeInfo.resources[i].dstMem[iDst]);
}
printf("Hit <Enter> to continue: ");
if (scanf("%*c") != 0) {
printf("[ERROR] Unexpected input\n");
exit(1);
}
printf("\n");
}
// Start CPU timing for this iteration
auto cpuStart = std::chrono::high_resolution_clock::now();
// Execute all Transfers in parallel
std::vector<std::future<ErrResult>> asyncExecutors;
for (auto& exeInfoPair : executorMap) {
asyncExecutors.emplace_back(std::async(std::launch::async, RunExecutor,
iteration,
std::cref(cfg),
std::cref(exeInfoPair.first),
std::ref(exeInfoPair.second)));
}
// Wait for all threads to finish
for (auto& asyncExecutor : asyncExecutors) {
ERR_APPEND(asyncExecutor.get(), errResults);
}
// Stop CPU timing for this iteration
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
if (cfg.data.alwaysValidate) {
ERR_APPEND(ValidateAllTransfers(cfg, transfers, transferResources, dstReference, outputBuffer),
errResults);
}
if (iteration >= 0) {
++numTimedIterations;
totalCpuTimeSec += deltaSec;
}
}
// Pause for interactive mode
if (cfg.general.useInteractive) {
printf("Transfers complete. Hit <Enter> to continue: ");
if (scanf("%*c") != 0) {
printf("[ERROR] Unexpected input\n");
exit(1);
}
printf("\n");
}
// Validate results
if (!cfg.data.alwaysValidate) {
ERR_APPEND(ValidateAllTransfers(cfg, transfers, transferResources, dstReference, outputBuffer),
errResults);
}
// Prepare results
results.exeResults.clear();
results.tfrResults.clear();
results.tfrResults.resize(transfers.size());
results.numTimedIterations = numTimedIterations;
results.totalBytesTransferred = 0;
results.avgTotalDurationMsec = (totalCpuTimeSec * 1000.0) / numTimedIterations;
results.overheadMsec = 0.0;
for (auto& exeInfoPair : executorMap) {
ExeDevice const& exeDevice = exeInfoPair.first;
ExeInfo& exeInfo = exeInfoPair.second;
// Copy over executor results
ExeResult& exeResult = results.exeResults[exeDevice];
exeResult.numBytes = exeInfo.totalBytes;
exeResult.avgDurationMsec = exeInfo.totalDurationMsec / numTimedIterations;
exeResult.avgBandwidthGbPerSec = (exeResult.numBytes / 1.0e6) / exeResult.avgDurationMsec;
exeResult.sumBandwidthGbPerSec = 0.0;
exeResult.transferIdx.clear();
results.totalBytesTransferred += exeInfo.totalBytes;
results.overheadMsec = std::max(results.overheadMsec, (results.avgTotalDurationMsec -
exeResult.avgDurationMsec));
// Copy over transfer results
for (auto const& resources : exeInfo.resources) {
int const transferIdx = resources.transferIdx;
TransferResult& tfrResult = results.tfrResults[transferIdx];
exeResult.transferIdx.push_back(transferIdx);
tfrResult.numBytes = resources.numBytes;
tfrResult.avgDurationMsec = resources.totalDurationMsec / numTimedIterations;
tfrResult.avgBandwidthGbPerSec = (resources.numBytes / 1.0e6) / tfrResult.avgDurationMsec;
if (cfg.general.recordPerIteration) {
tfrResult.perIterMsec = resources.perIterMsec;
tfrResult.perIterCUs = resources.perIterCUs;
}
exeResult.sumBandwidthGbPerSec += tfrResult.avgBandwidthGbPerSec;
}
}
results.avgTotalBandwidthGbPerSec = (results.totalBytesTransferred / 1.0e6) / results.avgTotalDurationMsec;
// Teardown executors
for (auto& exeInfoPair : executorMap) {
ExeDevice const& exeDevice = exeInfoPair.first;
ExeInfo& exeInfo = exeInfoPair.second;
ERR_APPEND(TeardownExecutor(cfg, exeDevice, transfers, exeInfo), errResults);
}
return true;
}
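// Minimal usage sketch of the header-only API (illustrative only; memory/executor token spelling
// follows ParseMemType/ParseExeType, and any enclosing namespace is omitted here):
//
//   ConfigOptions cfg = {};                                          // default configuration
//   std::vector<Transfer> transfers;
//   ErrResult err = ParseTransfers("1 4 (C0->G0->G1)", transfers);   // 1 Transfer, 4 subexecutors
//   TestResults results;
//   if (err.errType == ERR_NONE && RunTransfers(cfg, transfers, results)) {
//     // e.g. results.tfrResults[0].avgBandwidthGbPerSec holds the per-Transfer bandwidth
//   }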
int GetIntAttribute(IntAttribute attribute)
{
switch (attribute) {
case ATR_GFX_MAX_BLOCKSIZE: return MAX_BLOCKSIZE;
case ATR_GFX_MAX_UNROLL: return MAX_UNROLL;
default: return -1;
}
}
std::string GetStrAttribute(StrAttribute attribute)
{
switch (attribute) {
case ATR_SRC_PREP_DESCRIPTION:
return "Element i = ((i * 517) modulo 383 + 31) * (srcBufferIdx + 1)";
default:
return "";
}
}
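// Transfer line formats accepted by ParseTransfers (see below); brackets and "->" are treated as
// whitespace, and byte counts may carry a K/M/G suffix:
//   "<#Transfers> <#SubExecs> (src->exe->dst) (src->exe->dst) ..."        (simple mode)
//   "-<#Transfers> (src->exe->dst <#SubExecs> <#Bytes>) ..."              (advanced mode)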
ErrResult ParseTransfers(std::string line,
std::vector<Transfer>& transfers)
{
// Replace any round brackets or '->' with spaces,
for (int i = 1; line[i]; i++)
if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == '>' ) line[i] = ' ';
transfers.clear();
// Read in number of transfers
int numTransfers = 0;
std::istringstream iss(line);
iss >> numTransfers;
if (iss.fail()) return ERR_NONE;
// If numTransfers < 0, read 5-tuple (srcMem, exeMem, dstMem, #CUs, #Bytes)
// otherwise read triples (srcMem, exeMem, dstMem)
bool const advancedMode = (numTransfers < 0);
numTransfers = abs(numTransfers);
int numSubExecs;
std::string srcStr, exeStr, dstStr, numBytesToken;
if (!advancedMode) {
iss >> numSubExecs;
if (numSubExecs < 0 || iss.fail()) {
return {ERR_FATAL,
"Parsing error: Number of blocks to use (%d) must be non-negative", numSubExecs};
}
}
for (int i = 0; i < numTransfers; i++) {
Transfer transfer;
if (!advancedMode) {
iss >> srcStr >> exeStr >> dstStr;
transfer.numSubExecs = numSubExecs;
if (iss.fail()) {
return {ERR_FATAL,
"Parsing error: Unable to read valid Transfer %d (SRC EXE DST) triplet", i+1};
}
} else {
iss >> srcStr >> exeStr >> dstStr >> transfer.numSubExecs >> numBytesToken;
if (iss.fail()) {
return {ERR_FATAL,
"Parsing error: Unable to read valid Transfer %d (SRC EXE DST $CU #Bytes) tuple", i+1};
}
if (sscanf(numBytesToken.c_str(), "%lu", &transfer.numBytes) != 1) {
return {ERR_FATAL,
"Parsing error: Unable to read valid Transfer %d (SRC EXE DST #CU #Bytes) tuple", i+1};
}
char units = numBytesToken.back();
switch (toupper(units)) {
      case 'G': transfer.numBytes *= 1024; // fall through
      case 'M': transfer.numBytes *= 1024; // fall through
      case 'K': transfer.numBytes *= 1024;
}
}
ERR_CHECK(ParseMemType(srcStr, transfer.srcs));
ERR_CHECK(ParseMemType(dstStr, transfer.dsts));
ERR_CHECK(ParseExeType(exeStr, transfer.exeDevice, transfer.exeSubIndex));
transfers.push_back(transfer);
}
return ERR_NONE;
}
int GetNumExecutors(ExeType exeType)
{
switch (exeType) {
case EXE_CPU:
return numa_num_configured_nodes();
case EXE_GPU_GFX: case EXE_GPU_DMA:
{
int numDetectedGpus = 0;
hipError_t status = hipGetDeviceCount(&numDetectedGpus);
if (status != hipSuccess) numDetectedGpus = 0;
return numDetectedGpus;
}
default:
return 0;
}
}
int GetNumSubExecutors(ExeDevice exeDevice)
{
int const& exeIndex = exeDevice.exeIndex;
switch(exeDevice.exeType) {
case EXE_CPU:
{
int numCores = 0;
for (int i = 0; i < numa_num_configured_cpus(); i++)
if (numa_node_of_cpu(i) == exeIndex) numCores++;
return numCores;
}
case EXE_GPU_GFX:
{
int numGpus = GetNumExecutors(EXE_GPU_GFX);
if (exeIndex < 0 || numGpus <= exeIndex) return 0;
int numDeviceCUs = 0;
hipError_t status = hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, exeIndex);
if (status != hipSuccess) numDeviceCUs = 0;
return numDeviceCUs;
}
case EXE_GPU_DMA:
{
return 1;
}
default:
return 0;
}
}
int GetNumExecutorSubIndices(ExeDevice exeDevice)
{
// Executor subindices are not supported on NVIDIA hardware
#if defined(__NVCC__)
return 0;
#else
int const& exeIndex = exeDevice.exeIndex;
switch(exeDevice.exeType) {
case EXE_CPU: return 0;
case EXE_GPU_GFX:
{
hsa_agent_t agent;
ErrResult err = GetHsaAgent(exeDevice, agent);
if (err.errType != ERR_NONE) return 0;
int numXccs = 1;
if (hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_XCC, &numXccs) != HSA_STATUS_SUCCESS)
return 1;
return numXccs;
}
case EXE_GPU_DMA:
{
std::set<int> engineIds;
ErrResult err;
// Get HSA agent for this GPU
hsa_agent_t agent;
err = GetHsaAgent(exeDevice, agent);
if (err.errType != ERR_NONE) return 0;
int numTotalEngines = 0, numEnginesA = 0, numEnginesB = 0;
if (hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SDMA_ENG, &numEnginesA)
== HSA_STATUS_SUCCESS)
numTotalEngines += numEnginesA;
if (hsa_agent_get_info(agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SDMA_XGMI_ENG, &numEnginesB)
== HSA_STATUS_SUCCESS)
numTotalEngines += numEnginesB;
return numTotalEngines;
}
default:
return 0;
}
#endif
}
int GetClosestCpuNumaToGpu(int gpuIndex)
{
// Closest NUMA is not supported on NVIDIA hardware at this time
#if defined(__NVCC__)
return -1;
#else
hsa_agent_t gpuAgent;
ErrResult err = GetHsaAgent({EXE_GPU_GFX, gpuIndex}, gpuAgent);
if (err.errType != ERR_NONE) return -1;
hsa_agent_t closestCpuAgent;
if (hsa_agent_get_info(gpuAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NEAREST_CPU, &closestCpuAgent)
== HSA_STATUS_SUCCESS) {
int numCpus = GetNumExecutors(EXE_CPU);
for (int i = 0; i < numCpus; i++) {
hsa_agent_t cpuAgent;
err = GetHsaAgent({EXE_CPU, i}, cpuAgent);
if (err.errType != ERR_NONE) return -1;
if (cpuAgent.handle == closestCpuAgent.handle) return i;
}
}
return -1;
#endif
}
// Undefine CUDA compatibility macros
#if defined(__NVCC__)
// ROCm specific
#undef wall_clock64
#undef gcnArchName
// Datatypes
#undef hipDeviceProp_t
#undef hipError_t
#undef hipEvent_t
#undef hipStream_t
// Enumerations
#undef hipDeviceAttributeClockRate
#undef hipDeviceAttributeMaxSharedMemoryPerMultiprocessor
#undef hipDeviceAttributeMultiprocessorCount
#undef hipErrorPeerAccessAlreadyEnabled
#undef hipFuncCachePreferShared
#undef hipMemcpyDefault
#undef hipMemcpyDeviceToHost
#undef hipMemcpyHostToDevice
#undef hipSuccess
// Functions
#undef hipDeviceCanAccessPeer
#undef hipDeviceEnablePeerAccess
#undef hipDeviceGetAttribute
#undef hipDeviceGetPCIBusId
#undef hipDeviceSetCacheConfig
#undef hipDeviceSynchronize
#undef hipEventCreate
#undef hipEventDestroy
#undef hipEventElapsedTime
#undef hipEventRecord
#undef hipFree
#undef hipGetDeviceCount
#undef hipGetDeviceProperties
#undef hipGetErrorString
#undef hipHostFree
#undef hipHostMalloc
#undef hipMalloc
#undef hipMallocManaged
#undef hipMemcpy
#undef hipMemcpyAsync
#undef hipMemset
#undef hipMemsetAsync
#undef hipSetDevice
#undef hipStreamCreate
#undef hipStreamDestroy
#undef hipStreamSynchronize
#endif
// Kernel macros
#undef GetHwId
#undef GetXccId
// Undefine helper macros
#undef ERR_CHECK
#undef ERR_APPEND
}