Unverified Commit 513ce1e3 authored by Swati Rawat, committed by GitHub

Merge branch 'ROCm:develop' into swraw/docs

parents ebad0b36 fb713d03
......@@ -6,3 +6,4 @@ _static/
_templates/
_toc.yml
docBin/
TransferBench
......@@ -3,6 +3,59 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.58.00
### Fixed
- Fixed broken copies when targeting a specific DMA engine
## v1.57.01
### Added
- Re-added "scaling" GPU GFX preset benchmark, which tests copies from GPU to other devices using varying
number of CUs.
## v1.57.00
### Modified
- Removed use of the defaulted spaceship operator / C++20 requirement to enable compilation on more OSes
- Changed how the version is reported. The client version is now just the last two digits; it increments only when
no changes are made to the backend header-only library file, and resets to 0 when the header is updated
- GFX_SINGLE_TEAM=0 is set by default
## v1.56
### Fixed
- Fixed a bug when using interactive mode. Interactive mode now starts prior to all warmup iterations
## v1.55
### Fixed
- Fixed missing header error when compiling on CentOS
- Fixed issues when using multi-stream mode for GFX executor
## v1.54
### Modified
- Refactored TransferBench into a header-only library combined with a thin client to facilitate the
use of TransferBench as the backend for other applications
- Optimized how data validation is handled - this should speed up Tests with many parallel transfers as data is only
generated once
- Preset benchmarks no longer take any extra command-line arguments. Preset settings are accessed only via
environment variables. Details for each preset are printed
- The a2a preset benchmark now defaults to using fine-grained memory and GFX unroll of 2
- Refactored how Transfers are launched in parallel, which reduces some CPU-side overhead
- CPU and DMA executor timing now uses CPU wall-clock timing instead of the slowest Transfer time
### Added
- New one2all preset, which sweeps over all subsets of parallel transfers from one GPU to the others
- Added new warnings for DMA execution relating to how HIP defaults to using agents from the source memory
### Removed
- CU scaling preset has been removed. Similar functionality already exists in the schmoo preset benchmark
- Preparation of source data via GFX kernel has been removed (USE_PREP_KERNEL)
- Removed GFX block-reordering (BLOCK_ORDER)
- Removed NUM_CPU_DEVICES and NUM_GPU_DEVICES from the common env vars and moved them into only the presets they apply to
- Removed SHARED_MEM_BYTES option for GFX executor
- Removed USE_PCIE_INDEX and SHARED_MEM_BYTES
### Fixed
- Fixed a potential timing-reporting issue when DMA-executed Transfers end up getting serialized.
## v1.53
### Added
- Added ability to specify NULL for sweep preset as source or destination memory type
## v1.52
### Added
- Added USE_HSA_DMA env var to switch to using hsa_amd_memory_async_copy instead of hipMemcpyAsync for DMA execution
......
# Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
if (DEFINED ENV{ROCM_PATH})
set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE STRING "ROCm install directory")
else()
......@@ -6,13 +7,59 @@ else()
endif()
cmake_minimum_required(VERSION 3.5)
project(TransferBench VERSION 1.51.0 LANGUAGES CXX)
project(TransferBench VERSION 1.58.00 LANGUAGES CXX)
# Default GPU architectures to build
#==================================================================================================
set(DEFAULT_GPUS
gfx906
gfx908
gfx90a
gfx942
gfx1030
gfx1100
gfx1101
gfx1102
gfx1200
gfx1201)
# Build only for local GPU architecture
if (BUILD_LOCAL_GPU_TARGET_ONLY)
message(STATUS "Building only for local GPU target")
if (COMMAND rocm_local_targets)
rocm_local_targets(DEFAULT_GPUS)
else()
message(WARNING "Unable to determine local GPU targets. Falling back to default GPUs.")
endif()
endif()
# Determine which GPU architectures to build for
set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.")
# Check if clang compiler can offload to GPU_TARGETS
if (COMMAND rocm_check_target_ids)
message(STATUS "Checking for ROCm support for GPU targets: " "${GPU_TARGETS}")
rocm_check_target_ids(SUPPORTED_GPUS TARGETS ${GPU_TARGETS})
else()
message(WARNING "Unable to check for supported GPU targets. Falling back to default GPUs.")
set(SUPPORTED_GPUS ${DEFAULT_GPUS})
endif()
set(COMPILING_TARGETS "${SUPPORTED_GPUS}" CACHE STRING "GPU targets to compile for.")
message(STATUS "Compiling for ${COMPILING_TARGETS}")
foreach(target ${COMPILING_TARGETS})
list(APPEND static_link_flags --offload-arch=${target})
endforeach()
list(JOIN static_link_flags " " flags_str)
set( CMAKE_CXX_FLAGS "${flags_str} ${CMAKE_CXX_FLAGS}")
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib")
include_directories(${ROCM_PATH}/include)
link_libraries(numa hsa-runtime64 pthread)
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ..)
add_executable(TransferBench src/TransferBench.cpp)
target_include_directories(TransferBench PRIVATE src/include)
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY .)
add_executable(TransferBench src/client/Client.cpp)
target_include_directories(TransferBench PRIVATE src/header src/client src/client/Presets)
find_package(ROCM 0.8 REQUIRED PATHS ${ROCM_PATH})
include(ROCMInstallTargets)
......
#
# Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
#
all:
cd src ; make
# Configuration options
ROCM_PATH ?= /opt/rocm
CUDA_PATH ?= /usr/local/cuda
HIPCC=$(ROCM_PATH)/bin/hipcc
NVCC=$(CUDA_PATH)/bin/nvcc
# Compile TransferBenchCuda if nvcc detected
ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
EXE=TransferBenchCuda
else
EXE=TransferBench
endif
CXXFLAGS = -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
NVFLAGS = -x cu -lnuma -arch=native
COMMON_FLAGS = -O3 -I./src/header -I./src/client -I./src/client/Presets
LDFLAGS += -lpthread
all: $(EXE)
TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
$(HIPCC) $(CXXFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)
TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
$(NVCC) $(NVFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)
clean:
cd src ; make clean
rm -f *.o ./TransferBench ./TransferBenchCuda
......@@ -8,8 +8,8 @@ import re
from rocm_docs import ROCmDocs
with open('../src/include/EnvVars.hpp', encoding='utf-8') as f:
match = re.search(r'#define TB_VERSION "([0-9.]+)[^0-9.]+', f.read())
with open('../src/header/TransferBench.hpp', encoding='utf-8') as f:
match = re.search(r'constexpr char VERSION\[\] = "([0-9.]+)[^0-9.]+', f.read())
if not match:
raise ValueError("VERSION not found!")
version_number = match[1]
......@@ -18,7 +18,7 @@ left_nav_title = f"TransferBench {version_number} Documentation"
# for PDF output on Read the Docs
project = "TransferBench Documentation"
author = "Advanced Micro Devices, Inc."
copyright = "Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved."
copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved."
version = version_number
release = version_number
......
......@@ -47,7 +47,7 @@ To build documentation locally, use:
.. code-block:: bash
cd docs
pip3 install -r .sphinx/requirements.txt
pip3 install -r ./sphinx/requirements.txt
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
NVIDIA platform support
......
rocm-docs-core==1.8.3
rocm-docs-core==1.9.2
......@@ -92,7 +92,7 @@ requests==2.32.2
# via
# pygithub
# sphinx
rocm-docs-core==1.8.3
rocm-docs-core==1.9.2
# via -r requirements.in
smmap==5.0.1
# via gitdb
......
# Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
ROCM_PATH ?= /opt/rocm
CUDA_PATH ?= /usr/local/cuda
HIPCC=$(ROCM_PATH)/bin/hipcc
NVCC=$(CUDA_PATH)/bin/nvcc
# Compile TransferBenchCuda if nvcc detected
ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
EXE=../TransferBenchCuda
else
EXE=../TransferBench
endif
CXXFLAGS = -O3 -Iinclude -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
NVFLAGS = -O3 -Iinclude -x cu -lnuma -arch=native
LDFLAGS += -lpthread
all: $(EXE)
../TransferBench: TransferBench.cpp $(shell find -regex ".*\.\hpp")
$(HIPCC) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
../TransferBenchCuda: TransferBench.cpp $(shell find -regex ".*\.\hpp")
$(NVCC) $(NVFLAGS) $< -o $@ $(LDFLAGS)
clean:
rm -f *.o ../TransferBench ../TransferBenchCuda
/*
Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// This program measures simultaneous copy performance across multiple GPUs
// on the same node
#include <numa.h> // If not found, try installing libnuma-dev (e.g. apt-get install libnuma-dev)
#include <cmath> // If not found, try installing g++-12 (e.g. apt-get install g++-12)
#include <numaif.h>
#include <random>
#include <stack>
#include <thread>
#include "TransferBench.hpp"
#include "GetClosestNumaNode.hpp"
int main(int argc, char **argv)
{
// Check for NUMA library support
if (numa_available() == -1)
{
printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
exit(1);
}
// Display usage instructions and detected topology
if (argc <= 1)
{
int const outputToCsv = EnvVars::GetEnvVar("OUTPUT_TO_CSV", 0);
if (!outputToCsv) DisplayUsage(argv[0]);
DisplayTopology(outputToCsv);
exit(0);
}
// Collect environment variables / display current run configuration
EnvVars ev;
// Determine number of bytes to run per Transfer
size_t numBytesPerTransfer = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_TRANSFER;
if (argc > 2)
{
// Adjust bytes if unit specified
char units = argv[2][strlen(argv[2])-1];
switch (units)
{
case 'K': case 'k': numBytesPerTransfer *= 1024; break;
case 'M': case 'm': numBytesPerTransfer *= 1024*1024; break;
case 'G': case 'g': numBytesPerTransfer *= 1024*1024*1024; break;
}
}
if (numBytesPerTransfer % 4)
{
printf("[ERROR] numBytesPerTransfer (%lu) must be a multiple of 4\n", numBytesPerTransfer);
exit(1);
}
// Check for preset tests
// - Tests that sweep across possible sets of Transfers
if (!strcmp(argv[1], "sweep") || !strcmp(argv[1], "rsweep"))
{
int numGpuSubExecs = (argc > 3 ? atoi(argv[3]) : 4);
int numCpuSubExecs = (argc > 4 ? atoi(argv[4]) : 4);
ev.configMode = CFG_SWEEP;
RunSweepPreset(ev, numBytesPerTransfer, numGpuSubExecs, numCpuSubExecs, !strcmp(argv[1], "rsweep"));
exit(0);
}
// - Tests that benchmark peer-to-peer performance
else if (!strcmp(argv[1], "p2p"))
{
ev.configMode = CFG_P2P;
RunPeerToPeerBenchmarks(ev, numBytesPerTransfer / sizeof(float));
exit(0);
}
// - Test SubExecutor scaling
else if (!strcmp(argv[1], "scaling"))
{
int maxSubExecs = (argc > 3 ? atoi(argv[3]) : 32);
int exeIndex = (argc > 4 ? atoi(argv[4]) : 0);
if (exeIndex >= ev.numGpuDevices)
{
printf("[ERROR] Cannot execute scaling test with GPU device %d\n", exeIndex);
exit(1);
}
ev.configMode = CFG_SCALE;
RunScalingBenchmark(ev, numBytesPerTransfer / sizeof(float), exeIndex, maxSubExecs);
exit(0);
}
// - Test all2all benchmark
else if (!strcmp(argv[1], "a2a"))
{
int numSubExecs = (argc > 3 ? atoi(argv[3]) : 4);
// Force single-stream mode for all-to-all benchmark
ev.useSingleStream = 1;
ev.configMode = CFG_A2A;
RunAllToAllBenchmark(ev, numBytesPerTransfer, numSubExecs);
exit(0);
}
// Health check
else if (!strcmp(argv[1], "healthcheck")) {
RunHealthCheck(ev);
exit(0);
}
// - Test schmoo benchmark
else if (!strcmp(argv[1], "schmoo"))
{
if (ev.numGpuDevices < 2)
{
printf("[ERROR] Schmoo benchmark requires at least 2 GPUs\n");
exit(1);
}
ev.configMode = CFG_SCHMOO;
int localIdx = (argc > 3 ? atoi(argv[3]) : 0);
int remoteIdx = (argc > 4 ? atoi(argv[4]) : 1);
int maxSubExecs = (argc > 5 ? atoi(argv[5]) : 32);
if (localIdx >= ev.numGpuDevices || remoteIdx >= ev.numGpuDevices)
{
printf("[ERROR] Cannot execute schmoo test with local GPU device %d, remote GPU device %d\n", localIdx, remoteIdx);
exit(1);
}
ev.DisplaySchmooEnvVars();
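// When no byte count was given, sweep sizes from 256 floats up to 2^27 floats,
// sampling roughly ev.samplingFactor points per power of two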
for (int N = 256; N <= (1<<27); N *= 2)
{
int delta = std::max(1, N / ev.samplingFactor);
int curr = (numBytesPerTransfer == 0) ? N : numBytesPerTransfer / sizeof(float);
do
{
RunSchmooBenchmark(ev, curr * sizeof(float), localIdx, remoteIdx, maxSubExecs);
if (numBytesPerTransfer != 0) exit(0);
curr += delta;
} while (curr < N * 2);
}
}
else if (!strcmp(argv[1], "rwrite"))
{
if (ev.numGpuDevices < 2)
{
printf("[ERROR] Remote write benchmark requires at least 2 GPUs\n");
exit(1);
}
ev.DisplayRemoteWriteEnvVars();
int numSubExecs = (argc > 3 ? atoi(argv[3]) : 4);
int srcIdx = (argc > 4 ? atoi(argv[4]) : 0);
int minGpus = (argc > 5 ? atoi(argv[5]) : 1);
int maxGpus = (argc > 6 ? atoi(argv[6]) : ev.numGpuDevices - 1);
for (int N = 256; N <= (1<<27); N *= 2)
{
int delta = std::max(1, N / ev.samplingFactor);
int curr = (numBytesPerTransfer == 0) ? N : numBytesPerTransfer / sizeof(float);
do
{
RunRemoteWriteBenchmark(ev, curr * sizeof(float), numSubExecs, srcIdx, minGpus, maxGpus);
if (numBytesPerTransfer != 0) exit(0);
curr += delta;
} while (curr < N * 2);
}
}
else if (!strcmp(argv[1], "pcopy"))
{
if (ev.numGpuDevices < 2)
{
printf("[ERROR] Parallel copy benchmark requires at least 2 GPUs\n");
exit(1);
}
ev.DisplayParallelCopyEnvVars();
int numSubExecs = (argc > 3 ? atoi(argv[3]) : 8);
int srcIdx = (argc > 4 ? atoi(argv[4]) : 0);
int minGpus = (argc > 5 ? atoi(argv[5]) : 1);
int maxGpus = (argc > 6 ? atoi(argv[6]) : ev.numGpuDevices - 1);
if (maxGpus > ev.gpuMaxHwQueues && ev.useDmaCopy)
{
printf("[ERROR] DMA executor %d attempting %d parallel transfers, however GPU_MAX_HW_QUEUES only set to %d\n",
srcIdx, maxGpus, ev.gpuMaxHwQueues);
printf("[ERROR] Aborting to avoid misleading results due to potential serialization of Transfers\n");
exit(1);
}
for (int N = 256; N <= (1<<27); N *= 2)
{
int delta = std::max(1, N / ev.samplingFactor);
int curr = (numBytesPerTransfer == 0) ? N : numBytesPerTransfer / sizeof(float);
do
{
RunParallelCopyBenchmark(ev, curr * sizeof(float), numSubExecs, srcIdx, minGpus, maxGpus);
if (numBytesPerTransfer != 0) exit(0);
curr += delta;
} while (curr < N * 2);
}
}
else if (!strcmp(argv[1], "cmdline"))
{
// Print environment variables
ev.DisplayEnvVars();
// Read Transfer from command line
std::string cmdlineTransfer;
for (int i = 3; i < argc; i++)
cmdlineTransfer += std::string(argv[i]) + " ";
char line[MAX_LINE_LEN];
sprintf(line, "%s", cmdlineTransfer.c_str());
std::vector<Transfer> transfers;
ParseTransfers(ev, line, transfers);
if (transfers.empty()) exit(0);
// If the number of bytes is specified, use it
if (numBytesPerTransfer != 0)
{
size_t N = numBytesPerTransfer / sizeof(float);
ExecuteTransfers(ev, 1, N, transfers);
}
else
{
// Otherwise generate a range of values
for (int N = 256; N <= (1<<27); N *= 2)
{
int delta = std::max(1, N / ev.samplingFactor);
int curr = N;
while (curr < N * 2)
{
ExecuteTransfers(ev, 1, curr, transfers);
curr += delta;
}
}
}
exit(0);
}
// Check that Transfer configuration file can be opened
ev.configMode = CFG_FILE;
FILE* fp = fopen(argv[1], "r");
if (!fp)
{
printf("[ERROR] Unable to open transfer configuration file: [%s]\n", argv[1]);
exit(1);
}
// Print environment variables and CSV header
ev.DisplayEnvVars();
int testNum = 0;
char line[MAX_LINE_LEN];
while(fgets(line, MAX_LINE_LEN, fp))
{
// Check if line is a comment to be echoed to output (starts with ##)
if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') printf("%s", line);
// Parse set of parallel Transfers to execute
std::vector<Transfer> transfers;
ParseTransfers(ev, line, transfers);
if (transfers.empty()) continue;
// If the number of bytes is specified, use it
if (numBytesPerTransfer != 0)
{
size_t N = numBytesPerTransfer / sizeof(float);
ExecuteTransfers(ev, ++testNum, N, transfers);
}
else
{
// Otherwise generate a range of values
for (int N = 256; N <= (1<<27); N *= 2)
{
int delta = std::max(1, N / ev.samplingFactor);
int curr = N;
while (curr < N * 2)
{
ExecuteTransfers(ev, ++testNum, curr, transfers);
curr += delta;
}
}
}
}
fclose(fp);
return 0;
}
void ExecuteTransfers(EnvVars const& ev,
int const testNum,
size_t const N,
std::vector<Transfer>& transfers,
bool verbose,
double* totalBandwidthCpu)
{
// Check for any Transfers using variable number of sub-executors
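// (A GFX Transfer that requests 0 sub-executors is treated as "variable": it is re-run below over a
//  range of sub-executor counts and the best-performing count is kept)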
std::vector<int> varTransfers;
std::vector<int> numUsedSubExec(ev.numGpuDevices, 0);
std::vector<int> numVarSubExec(ev.numGpuDevices, 0);
for (int i = 0; i < transfers.size(); i++) {
Transfer& t = transfers[i];
t.transferIndex = i;
t.numBytesActual = (t.numBytes ? t.numBytes : N * sizeof(float));
if (t.exeType == EXE_GPU_GFX) {
if (t.numSubExecs == 0) {
varTransfers.push_back(i);
numVarSubExec[t.exeIndex]++;
} else {
numUsedSubExec[t.exeIndex] += t.numSubExecs;
}
} else if (t.numSubExecs == 0) {
printf("[ERROR] Variable subexecutor count is only supported for GFX executor\n");
exit(1);
}
}
if (verbose && !ev.outputToCsv) printf("Test %d:\n", testNum);
TestResults testResults;
if (varTransfers.size() == 0) {
testResults = ExecuteTransfersImpl(ev, transfers);
} else {
// Determine maximum number of subexecutors
int maxNumSubExec = 0;
if (ev.maxNumVarSubExec) {
maxNumSubExec = ev.maxNumVarSubExec;
} else {
HIP_CALL(hipDeviceGetAttribute(&maxNumSubExec, hipDeviceAttributeMultiprocessorCount, 0));
for (int device = 0; device < ev.numGpuDevices; device++) {
int numSubExec = 0;
HIP_CALL(hipDeviceGetAttribute(&numSubExec, hipDeviceAttributeMultiprocessorCount, device));
int leftOverSubExec = numSubExec - numUsedSubExec[device];
if (leftOverSubExec < numVarSubExec[device])
maxNumSubExec = 1;
else if (numVarSubExec[device] != 0) {
maxNumSubExec = std::min(maxNumSubExec, leftOverSubExec / numVarSubExec[device]);
}
}
}
// Loop over subexecs
std::vector<Transfer> bestTransfers;
for (int numSubExec = ev.minNumVarSubExec; numSubExec <= maxNumSubExec; numSubExec++) {
std::vector<Transfer> currTransfers = transfers;
for (auto idx : varTransfers) {
currTransfers[idx].numSubExecs = numSubExec;
}
TestResults tempResults = ExecuteTransfersImpl(ev, currTransfers);
if (tempResults.totalBandwidthCpu > testResults.totalBandwidthCpu) {
bestTransfers = currTransfers;
testResults = tempResults;
}
}
transfers = bestTransfers;
}
if (totalBandwidthCpu) *totalBandwidthCpu = testResults.totalBandwidthCpu;
if (verbose) {
ReportResults(ev, transfers, testResults);
}
}
TestResults ExecuteTransfersImpl(EnvVars const& ev,
std::vector<Transfer>& transfers)
{
// Map transfers by executor
TransferMap transferMap;
for (int i = 0; i < transfers.size(); i++)
{
Transfer& transfer = transfers[i];
transfer.transferIndex = i;
Executor executor(transfer.exeType, transfer.exeIndex);
ExecutorInfo& executorInfo = transferMap[executor];
executorInfo.transfers.push_back(&transfer);
}
// Loop over each executor and prepare sub-executors
std::map<int, Transfer*> transferList;
for (auto& exeInfoPair : transferMap)
{
Executor const& executor = exeInfoPair.first;
ExecutorInfo& exeInfo = exeInfoPair.second;
ExeType const exeType = executor.first;
int const exeIndex = RemappedIndex(executor.second, IsCpuType(exeType));
exeInfo.totalTime = 0.0;
exeInfo.totalSubExecs = 0;
// Loop over each transfer this executor is involved in
for (Transfer* transfer : exeInfo.transfers)
{
// Allocate source memory
transfer->srcMem.resize(transfer->numSrcs);
for (int iSrc = 0; iSrc < transfer->numSrcs; ++iSrc)
{
MemType const& srcType = transfer->srcType[iSrc];
int const srcIndex = RemappedIndex(transfer->srcIndex[iSrc], IsCpuType(srcType));
// Ensure executing GPU can access source memory
if (IsGpuType(exeType) && IsGpuType(srcType) && srcIndex != exeIndex)
EnablePeerAccess(exeIndex, srcIndex);
AllocateMemory(srcType, srcIndex, transfer->numBytesActual + ev.byteOffset, (void**)&transfer->srcMem[iSrc]);
}
// Allocate destination memory
transfer->dstMem.resize(transfer->numDsts);
for (int iDst = 0; iDst < transfer->numDsts; ++iDst)
{
MemType const& dstType = transfer->dstType[iDst];
int const dstIndex = RemappedIndex(transfer->dstIndex[iDst], IsCpuType(dstType));
// Ensure executing GPU can access destination memory
if (IsGpuType(exeType) && IsGpuType(dstType) && dstIndex != exeIndex)
EnablePeerAccess(exeIndex, dstIndex);
AllocateMemory(dstType, dstIndex, transfer->numBytesActual + ev.byteOffset, (void**)&transfer->dstMem[iDst]);
}
exeInfo.totalSubExecs += transfer->numSubExecs;
transferList[transfer->transferIndex] = transfer;
}
// Prepare additional requirement for GPU-based executors
if (IsGpuType(exeType))
{
HIP_CALL(hipSetDevice(exeIndex));
// Single-stream is only supported for GFX-based executors
int const numStreamsToUse = (exeType == EXE_GPU_DMA || !ev.useSingleStream) ? exeInfo.transfers.size() : 1;
exeInfo.streams.resize(numStreamsToUse);
exeInfo.startEvents.resize(numStreamsToUse);
exeInfo.stopEvents.resize(numStreamsToUse);
for (int i = 0; i < numStreamsToUse; ++i)
{
if (ev.cuMask.size())
{
#if !defined(__NVCC__)
HIP_CALL(hipExtStreamCreateWithCUMask(&exeInfo.streams[i], ev.cuMask.size(), ev.cuMask.data()));
#else
printf("[ERROR] CU Masking in not supported on NVIDIA hardware\n");
exit(-1);
#endif
}
else
{
HIP_CALL(hipStreamCreate(&exeInfo.streams[i]));
}
HIP_CALL(hipEventCreate(&exeInfo.startEvents[i]));
HIP_CALL(hipEventCreate(&exeInfo.stopEvents[i]));
}
if (exeType == EXE_GPU_GFX)
{
// Allocate one contiguous chunk of GPU memory for threadblock parameters
// This allows support for executing one transfer per stream, or all transfers in a single stream
#if !defined(__NVCC__)
AllocateMemory(MEM_GPU, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam),
(void**)&exeInfo.subExecParamGpu);
#else
AllocateMemory(MEM_MANAGED, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam),
(void**)&exeInfo.subExecParamGpu);
#endif
// Check for sufficient subExecutors
int numDeviceCUs = 0;
HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, exeIndex));
if (exeInfo.totalSubExecs > numDeviceCUs)
{
printf("[WARN] GFX executor %d requesting %d total subexecutors, however only has %d. Some Transfers may be serialized\n",
exeIndex, exeInfo.totalSubExecs, numDeviceCUs);
}
}
// Check for targeted DMA
if (exeType == EXE_GPU_DMA)
{
bool useRandomDma = false;
bool useTargetDma = false;
// Check for sufficient hardware queues
#if !defined(__NVCC__)
if (exeInfo.transfers.size() > ev.gpuMaxHwQueues)
{
printf("[WARN] DMA executor %d attempting %lu parallel transfers, however GPU_MAX_HW_QUEUES only set to %d\n",
exeIndex, exeInfo.transfers.size(), ev.gpuMaxHwQueues);
}
#endif
for (Transfer* transfer : exeInfo.transfers)
{
if (transfer->exeSubIndex != -1 || ev.useHsaDma)
{
useTargetDma = (transfer->exeSubIndex != -1);
#if defined(__NVCC__)
printf("[ERROR] DMA executor subindex not supported on NVIDIA hardware\n");
exit(-1);
#else
if (transfer->numSrcs != 1 || transfer->numDsts != 1)
{
printf("[ERROR] DMA Transfer must have at exactly one source and one destination");
exit(1);
}
// Collect HSA agent information
hsa_amd_pointer_info_t info;
info.size = sizeof(info);
HSA_CHECK(hsa_amd_pointer_info(transfer->dstMem[0], &info, NULL, NULL, NULL));
transfer->dstAgent = info.agentOwner;
HSA_CHECK(hsa_amd_pointer_info(transfer->srcMem[0], &info, NULL, NULL, NULL));
transfer->srcAgent = info.agentOwner;
// Create HSA completion signal
HSA_CHECK(hsa_signal_create(1, 0, NULL, &transfer->signal));
// Check for valid engine Id
if (transfer->exeSubIndex < -1 || transfer->exeSubIndex >= 32)
{
printf("[ERROR] DMA executor subindex must be between 0 and 31\n");
exit(1);
}
// Check that engine Id exists between agents
if (useTargetDma) {
uint32_t engineIdMask = 0;
HSA_CHECK(hsa_amd_memory_copy_engine_status(transfer->dstAgent,
transfer->srcAgent,
&engineIdMask));
transfer->sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << transfer->exeSubIndex);
if (!(transfer->sdmaEngineId & engineIdMask))
{
printf("[ERROR] DMA executor %d.%d does not exist or cannot copy between source %s to destination %s\n",
transfer->exeIndex, transfer->exeSubIndex,
transfer->SrcToStr().c_str(),
transfer->DstToStr().c_str());
exit(1);
}
}
#endif
}
else
{
useRandomDma = true;
}
}
if (useRandomDma && useTargetDma)
{
printf("[WARN] Mixing targeted and untargetted DMA execution on GPU %d may result in resource conflicts\n",
exeIndex);
}
}
}
}
// Prepare input memory and block parameters for current N
bool isSrcCorrect = true;
for (auto& exeInfoPair : transferMap)
{
Executor const& executor = exeInfoPair.first;
ExecutorInfo& exeInfo = exeInfoPair.second;
ExeType const exeType = executor.first;
int const exeIndex = RemappedIndex(executor.second, IsCpuType(exeType));
exeInfo.totalBytes = 0;
for (int i = 0; i < exeInfo.transfers.size(); ++i)
{
// Prepare subarrays each threadblock works on and fill src memory with patterned data
Transfer* transfer = exeInfo.transfers[i];
transfer->PrepareSubExecParams(ev);
isSrcCorrect &= transfer->PrepareSrc(ev);
exeInfo.totalBytes += transfer->numBytesActual;
}
// Copy block parameters to GPU for GPU executors
if (exeType == EXE_GPU_GFX)
{
std::vector<SubExecParam> tempSubExecParam;
if (!ev.useSingleStream || (ev.blockOrder == ORDER_SEQUENTIAL))
{
// Assign Transfers to sequential threadblocks
int transferOffset = 0;
for (Transfer* transfer : exeInfo.transfers)
{
transfer->subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset;
transfer->subExecIdx.clear();
for (int subExecIdx = 0; subExecIdx < transfer->subExecParam.size(); subExecIdx++)
{
transfer->subExecIdx.push_back(transferOffset + subExecIdx);
tempSubExecParam.push_back(transfer->subExecParam[subExecIdx]);
}
transferOffset += transfer->numSubExecs;
}
}
else if (ev.blockOrder == ORDER_INTERLEAVED)
{
// Interleave threadblocks of different Transfers
exeInfo.transfers[0]->subExecParamGpuPtr = exeInfo.subExecParamGpu;
for (int subExecIdx = 0; tempSubExecParam.size() < exeInfo.totalSubExecs; ++subExecIdx)
{
for (Transfer* transfer : exeInfo.transfers)
{
if (subExecIdx < transfer->numSubExecs)
{
transfer->subExecIdx.push_back(tempSubExecParam.size());
tempSubExecParam.push_back(transfer->subExecParam[subExecIdx]);
}
}
}
}
else if (ev.blockOrder == ORDER_RANDOM)
{
std::vector<std::pair<int,int>> indices;
exeInfo.transfers[0]->subExecParamGpuPtr = exeInfo.subExecParamGpu;
// Build up a list of (transfer,subExecParam) indices, then randomly sort them
for (int i = 0; i < exeInfo.transfers.size(); i++)
{
Transfer* transfer = exeInfo.transfers[i];
for (int subExecIdx = 0; subExecIdx < transfer->numSubExecs; subExecIdx++)
indices.push_back(std::make_pair(i, subExecIdx));
}
std::shuffle(indices.begin(), indices.end(), *ev.generator);
// Build randomized threadblock list
for (auto p : indices)
{
Transfer* transfer = exeInfo.transfers[p.first];
transfer->subExecIdx.push_back(tempSubExecParam.size());
tempSubExecParam.push_back(transfer->subExecParam[p.second]);
}
}
HIP_CALL(hipSetDevice(exeIndex));
HIP_CALL(hipMemcpy(exeInfo.subExecParamGpu,
tempSubExecParam.data(),
tempSubExecParam.size() * sizeof(SubExecParam),
hipMemcpyDefault));
HIP_CALL(hipDeviceSynchronize());
}
}
// Launch kernels (warmup iterations are not counted)
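// A positive ev.numIterations runs that many timed iterations; a negative value keeps iterating until
// -ev.numIterations seconds of CPU time have accumulated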
double totalCpuTime = 0;
size_t numTimedIterations = 0;
std::stack<std::thread> threads;
for (int iteration = -ev.numWarmups; isSrcCorrect; iteration++)
{
if (ev.numIterations > 0 && iteration >= ev.numIterations) break;
if (ev.numIterations < 0 && totalCpuTime > -ev.numIterations) break;
// Pause before starting first timed iteration in interactive mode
if (ev.useInteractive && iteration == 0)
{
printf("Memory prepared:\n");
for (Transfer& transfer : transfers)
{
printf("Transfer %03d:\n", transfer.transferIndex);
for (int iSrc = 0; iSrc < transfer.numSrcs; ++iSrc)
printf(" SRC %0d: %p\n", iSrc, transfer.srcMem[iSrc]);
for (int iDst = 0; iDst < transfer.numDsts; ++iDst)
printf(" DST %0d: %p\n", iDst, transfer.dstMem[iDst]);
}
printf("Hit <Enter> to continue: ");
if (scanf("%*c") != 0)
{
printf("[ERROR] Unexpected input\n");
exit(1);
}
printf("\n");
}
// Start CPU timing for this iteration
auto cpuStart = std::chrono::high_resolution_clock::now();
// Execute all Transfers in parallel
for (auto& exeInfoPair : transferMap)
{
ExecutorInfo& exeInfo = exeInfoPair.second;
ExeType exeType = exeInfoPair.first.first;
int const numTransfersToRun = (exeType == EXE_GPU_GFX && ev.useSingleStream) ? 1 : exeInfo.transfers.size();
for (int i = 0; i < numTransfersToRun; ++i)
threads.push(std::thread(RunTransfer, std::ref(ev), iteration, std::ref(exeInfo), i));
}
// Wait for all threads to finish
int const numTransfers = threads.size();
for (int i = 0; i < numTransfers; i++)
{
threads.top().join();
threads.pop();
}
// Stop CPU timing for this iteration
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
if (ev.alwaysValidate)
{
for (auto transferPair : transferList)
{
Transfer* transfer = transferPair.second;
transfer->ValidateDst(ev);
}
}
if (iteration >= 0)
{
++numTimedIterations;
totalCpuTime += deltaSec;
}
}
// Pause for interactive mode
if (isSrcCorrect && ev.useInteractive)
{
printf("Transfers complete. Hit <Enter> to continue: ");
if (scanf("%*c") != 0)
{
printf("[ERROR] Unexpected input\n");
exit(1);
}
printf("\n");
}
// Validate that each transfer has transferred correctly
size_t totalBytesTransferred = 0;
int const numTransfers = transferList.size();
for (auto transferPair : transferList)
{
Transfer* transfer = transferPair.second;
transfer->ValidateDst(ev);
totalBytesTransferred += transfer->numBytesActual;
}
// Record results
TestResults testResults;
testResults.numTimedIterations = numTimedIterations;
testResults.totalBytesTransferred = totalBytesTransferred;
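// Average CPU wall-clock time per (sub-)iteration in msec; CPU bandwidth in GB/s = (bytes / 1.0E6) / msec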
testResults.totalDurationMsec = totalCpuTime / (1.0 * numTimedIterations * ev.numSubIterations) * 1000;
testResults.totalBandwidthCpu = (totalBytesTransferred / 1.0E6) / testResults.totalDurationMsec;
double maxExeDurationMsec = 0.0;
if (!isSrcCorrect) goto cleanup;
for (auto& exeInfoPair : transferMap)
{
ExecutorInfo exeInfo = exeInfoPair.second;
ExeType const exeType = exeInfoPair.first.first;
int const exeIndex = exeInfoPair.first.second;
ExeResult& exeResult = testResults.exeResults[std::make_pair(exeType, exeIndex)];
// Compute total time for non GPU executors
if (exeType != EXE_GPU_GFX || ev.useSingleStream == 0)
{
exeInfo.totalTime = 0;
for (auto const& transfer : exeInfo.transfers)
exeInfo.totalTime = std::max(exeInfo.totalTime, transfer->transferTime);
}
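// Executor bandwidth in GB/s = (bytes / 1.0E9) / (durationMsec / 1000)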
exeResult.totalBytes = exeInfo.totalBytes;
exeResult.durationMsec = exeInfo.totalTime / (1.0 * numTimedIterations * ev.numSubIterations);
exeResult.bandwidthGbs = (exeInfo.totalBytes / 1.0E9) / exeResult.durationMsec * 1000.0f;
exeResult.sumBandwidthGbs = 0;
maxExeDurationMsec = std::max(maxExeDurationMsec, exeResult.durationMsec);
for (auto& transfer: exeInfo.transfers)
{
exeResult.transferIdx.push_back(transfer->transferIndex);
transfer->transferTime /= (1.0 * numTimedIterations * ev.numSubIterations);
transfer->transferBandwidth = (transfer->numBytesActual / 1.0E9) / transfer->transferTime * 1000.0f;
transfer->executorBandwidth = exeResult.bandwidthGbs;
exeResult.sumBandwidthGbs += transfer->transferBandwidth;
}
}
testResults.overheadMsec = testResults.totalDurationMsec - maxExeDurationMsec;
// Release GPU memory
cleanup:
for (auto exeInfoPair : transferMap)
{
ExecutorInfo& exeInfo = exeInfoPair.second;
ExeType const exeType = exeInfoPair.first.first;
int const exeIndex = RemappedIndex(exeInfoPair.first.second, IsCpuType(exeType));
for (auto& transfer : exeInfo.transfers)
{
for (int iSrc = 0; iSrc < transfer->numSrcs; ++iSrc)
{
MemType const& srcType = transfer->srcType[iSrc];
DeallocateMemory(srcType, transfer->srcMem[iSrc], transfer->numBytesActual + ev.byteOffset);
}
for (int iDst = 0; iDst < transfer->numDsts; ++iDst)
{
MemType const& dstType = transfer->dstType[iDst];
DeallocateMemory(dstType, transfer->dstMem[iDst], transfer->numBytesActual + ev.byteOffset);
}
transfer->subExecParam.clear();
if (exeType == EXE_GPU_DMA && (transfer->exeSubIndex != -1 || ev.useHsaDma))
{
#if !defined(__NVCC__)
HSA_CHECK(hsa_signal_destroy(transfer->signal));
#endif
}
}
if (IsGpuType(exeType))
{
int const numStreams = (int)exeInfo.streams.size();
for (int i = 0; i < numStreams; ++i)
{
HIP_CALL(hipEventDestroy(exeInfo.startEvents[i]));
HIP_CALL(hipEventDestroy(exeInfo.stopEvents[i]));
HIP_CALL(hipStreamDestroy(exeInfo.streams[i]));
}
if (exeType == EXE_GPU_GFX)
{
#if !defined(__NVCC__)
DeallocateMemory(MEM_GPU, exeInfo.subExecParamGpu);
#else
DeallocateMemory(MEM_MANAGED, exeInfo.subExecParamGpu);
#endif
}
}
}
return testResults;
}
void DisplayUsage(char const* cmdName)
{
printf("TransferBench v%s\n", TB_VERSION);
printf("========================================\n");
if (numa_available() == -1)
{
printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
exit(1);
}
int numGpuDevices;
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
int const numCpuDevices = numa_num_configured_nodes();
printf("Usage: %s config <N>\n", cmdName);
printf(" config: Either:\n");
printf(" - Filename of configFile containing Transfers to execute (see example.cfg for format)\n");
printf(" - Name of preset config:\n");
printf(" a2a - GPU All-To-All benchmark\n");
printf(" - 3rd optional arg: # of SubExecs to use\n");
printf(" cmdline - Read Transfers from command line arguments (after N)\n");
printf(" healthcheck - Simple bandwidth health check (MI300 series only)\n");
printf(" p2p - Peer-to-peer benchmark tests\n");
printf(" rwrite/pcopy - Parallel writes/copies from single GPU to other GPUs\n");
printf(" - 3rd optional arg: # GPU SubExecs per Transfer\n");
printf(" - 4th optional arg: Root GPU index\n");
printf(" - 5th optional arg: Min number of other GPUs to transfer to\n");
printf(" - 6th optional arg: Max number of other GPUs to transfer to\n");
printf(" sweep/rsweep - Sweep/random sweep across possible sets of Transfers\n");
printf(" - 3rd optional arg: # GPU SubExecs per Transfer\n");
printf(" - 4th optional arg: # CPU SubExecs per Transfer\n");
printf(" scaling - GPU GFX SubExec scaling copy test\n");
printf(" - 3th optional arg: Max # of SubExecs to use\n");
printf(" - 4rd optional arg: GPU index to use as executor\n");
printf(" schmoo - Local/RemoteRead/Write/Copy between two GPUs\n");
printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
DEFAULT_BYTES_PER_TRANSFER);
printf(" If 0 is specified, a range of Ns will be benchmarked\n");
printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n");
printf("\n");
EnvVars::DisplayUsage();
}
int RemappedIndex(int const origIdx, bool const isCpuType)
{
static std::vector<int> remappingCpu;
static std::vector<int> remappingGpu;
// Build CPU remapping on first use
// Skip numa nodes that are not configured
if (remappingCpu.empty())
{
for (int node = 0; node <= numa_max_node(); node++)
if (numa_bitmask_isbitset(numa_get_mems_allowed(), node))
remappingCpu.push_back(node);
}
// Build remappingGpu on first use
if (remappingGpu.empty())
{
int numGpuDevices;
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
remappingGpu.resize(numGpuDevices);
int const usePcieIndexing = getenv("USE_PCIE_INDEX") ? atoi(getenv("USE_PCIE_INDEX")) : 0;
if (!usePcieIndexing)
{
// For HIP-based indexing no remappingGpu is necessary
for (int i = 0; i < numGpuDevices; ++i)
remappingGpu[i] = i;
}
else
{
// Collect PCIe address for each GPU
std::vector<std::pair<std::string, int>> mapping;
char pciBusId[20];
for (int i = 0; i < numGpuDevices; ++i)
{
HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i));
mapping.push_back(std::make_pair(pciBusId, i));
}
// Sort GPUs by PCIe address then use that as mapping
std::sort(mapping.begin(), mapping.end());
for (int i = 0; i < numGpuDevices; ++i)
remappingGpu[i] = mapping[i].second;
}
}
return isCpuType ? remappingCpu[origIdx] : remappingGpu[origIdx];
}
void DisplayTopology(bool const outputToCsv)
{
int numCpuDevices = numa_num_configured_nodes();
int numGpuDevices;
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
if (outputToCsv)
{
printf("NumCpus,%d\n", numCpuDevices);
printf("NumGpus,%d\n", numGpuDevices);
}
else
{
printf("\nDetected topology: %d configured CPU NUMA node(s) [%d total] %d GPU device(s)\n",
numa_num_configured_nodes(), numa_max_node() + 1, numGpuDevices);
}
// Print out detected CPU topology
if (outputToCsv)
{
printf("NUMA");
for (int j = 0; j < numCpuDevices; j++)
printf(",NUMA%02d", j);
printf(",# CPUs,ClosestGPUs,ActualNode\n");
}
else
{
printf(" |");
for (int j = 0; j < numCpuDevices; j++)
printf("NUMA %02d|", j);
printf(" #Cpus | Closest GPU(s)\n");
printf("------------+");
for (int j = 0; j <= numCpuDevices; j++)
printf("-------+");
printf("---------------\n");
}
for (int i = 0; i < numCpuDevices; i++)
{
int nodeI = RemappedIndex(i, true);
printf("NUMA %02d (%02d)%s", i, nodeI, outputToCsv ? "," : "|");
for (int j = 0; j < numCpuDevices; j++)
{
int nodeJ = RemappedIndex(j, true);
int numaDist = numa_distance(nodeI, nodeJ);
if (outputToCsv)
printf("%d,", numaDist);
else
printf(" %5d |", numaDist);
}
int numCpus = 0;
for (int j = 0; j < numa_num_configured_cpus(); j++)
if (numa_node_of_cpu(j) == nodeI) numCpus++;
if (outputToCsv)
printf("%d,", numCpus);
else
printf(" %5d | ", numCpus);
#if !defined(__NVCC__)
bool isFirst = true;
for (int j = 0; j < numGpuDevices; j++)
{
if (GetClosestNumaNode(RemappedIndex(j, false)) == i)
{
if (isFirst) isFirst = false;
else printf(",");
printf("%d", j);
}
}
#endif
printf("\n");
}
printf("\n");
#if defined(__NVCC__)
for (int i = 0; i < numGpuDevices; i++)
{
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, i));
printf(" GPU %02d | %s\n", i, prop.name);
}
// No further topology detection done for NVIDIA platforms
return;
#else
// Figure out DMA engines per GPU
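// Each set bit in the mask returned by hsa_amd_memory_copy_engine_status identifies an SDMA engine
// capable of copying from the source GPU agent to the given destination agent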
std::vector<std::set<int>> dmaEngineIdsPerDevice(numGpuDevices);
{
std::vector<hsa_agent_t> gpuAgentList;
std::vector<hsa_agent_t> allAgentList;
hsa_amd_pointer_info_t info;
info.size = sizeof(info);
for (int deviceId = 0; deviceId < numGpuDevices; deviceId++)
{
HIP_CALL(hipSetDevice(deviceId));
int32_t* tempGpuBuffer;
HIP_CALL(hipMalloc((void**)&tempGpuBuffer, 1024));
HSA_CHECK(hsa_amd_pointer_info(tempGpuBuffer, &info, NULL, NULL, NULL));
gpuAgentList.push_back(info.agentOwner);
allAgentList.push_back(info.agentOwner);
HIP_CALL(hipFree(tempGpuBuffer));
}
for (int deviceId = 0; deviceId < numCpuDevices; deviceId++)
{
int32_t* tempCpuBuffer;
AllocateMemory(MEM_CPU, deviceId, 1024, (void**)&tempCpuBuffer);
HSA_CHECK(hsa_amd_pointer_info(tempCpuBuffer, &info, NULL, NULL, NULL));
allAgentList.push_back(info.agentOwner);
DeallocateMemory(MEM_CPU, tempCpuBuffer, 1024);
}
for (int srcDevice = 0; srcDevice < numGpuDevices; srcDevice++)
{
dmaEngineIdsPerDevice[srcDevice].clear();
for (int dstDevice = 0; dstDevice < allAgentList.size(); dstDevice++)
{
if (srcDevice == dstDevice) continue;
uint32_t engineIdMask = 0;
if (hsa_amd_memory_copy_engine_status(allAgentList[dstDevice],
gpuAgentList[srcDevice],
&engineIdMask) != HSA_STATUS_SUCCESS)
continue;
for (int engineId = 0; engineId < 32; engineId++)
{
if (engineIdMask & (1U << engineId))
dmaEngineIdsPerDevice[srcDevice].insert(engineId);
}
}
}
}
// Print out detected GPU topology
if (outputToCsv)
{
printf("GPU");
for (int j = 0; j < numGpuDevices; j++)
printf(",GPU %02d", j);
printf(",PCIe Bus ID,ClosestNUMA,DMA engines\n");
}
else
{
printf(" |");
for (int j = 0; j < numGpuDevices; j++)
{
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, j));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
printf(" %6s |", archName.c_str());
}
printf("\n");
printf(" |");
for (int j = 0; j < numGpuDevices; j++)
printf(" GPU %02d |", j);
printf(" PCIe Bus ID | #CUs | Closest NUMA | DMA engines\n");
for (int j = 0; j <= numGpuDevices; j++)
printf("--------+");
printf("--------------+------+-------------+------------\n");
}
char pciBusId[20];
for (int i = 0; i < numGpuDevices; i++)
{
int const deviceIdx = RemappedIndex(i, false);
printf("%sGPU %02d%s", outputToCsv ? "" : " ", i, outputToCsv ? "," : " |");
for (int j = 0; j < numGpuDevices; j++)
{
if (i == j)
{
if (outputToCsv)
printf("-,");
else
printf(" - |");
}
else
{
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(deviceIdx,
RemappedIndex(j, false),
&linkType, &hopCount));
printf("%s%s-%d%s",
outputToCsv ? "" : " ",
linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? " HT" :
linkType == HSA_AMD_LINK_INFO_TYPE_QPI ? " QPI" :
linkType == HSA_AMD_LINK_INFO_TYPE_PCIE ? "PCIE" :
linkType == HSA_AMD_LINK_INFO_TYPE_INFINBAND ? "INFB" :
linkType == HSA_AMD_LINK_INFO_TYPE_XGMI ? "XGMI" : "????",
hopCount, outputToCsv ? "," : " |");
}
}
HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, deviceIdx));
int numDeviceCUs = 0;
HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, deviceIdx));
if (outputToCsv)
printf("%s,%d,%d,", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx));
else
{
printf(" %11s | %4d | %-12d |", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx));
bool isFirst = true;
for (auto x : dmaEngineIdsPerDevice[deviceIdx])
{
if (isFirst) isFirst = false; else printf(",");
printf("%d", x);
}
printf("\n");
}
}
// Check that large BAR is enabled on all GPUs
for (int i = 0; i < numGpuDevices; i++) {
int const deviceIdx = RemappedIndex(i, false);
int isLargeBar = 0;
HIP_CALL(hipDeviceGetAttribute(&isLargeBar, hipDeviceAttributeIsLargeBar, deviceIdx));
if (!isLargeBar)
printf("[WARN] Large BAR is not enabled for GPU %d in BIOS. This may result in segfaults\n", i);
}
#endif
}
void ParseMemType(EnvVars const& ev, std::string const& token,
std::vector<MemType>& memTypes, std::vector<int>& memIndices)
{
char typeChar;
int offset = 0, devIndex, inc;
bool found = false;
memTypes.clear();
memIndices.clear();
while (sscanf(token.c_str() + offset, " %c %d%n", &typeChar, &devIndex, &inc) == 2)
{
offset += inc;
MemType memType = CharToMemType(typeChar);
if (IsCpuType(memType) && (devIndex < 0 || devIndex >= ev.numCpuDevices))
{
printf("[ERROR] CPU index must be between 0 and %d (instead of %d)\n", ev.numCpuDevices-1, devIndex);
exit(1);
}
if (IsGpuType(memType) && (devIndex < 0 || devIndex >= ev.numGpuDevices))
{
printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", ev.numGpuDevices-1, devIndex);
exit(1);
}
found = true;
if (memType != MEM_NULL)
{
memTypes.push_back(memType);
memIndices.push_back(devIndex);
}
}
if (!found)
{
printf("[ERROR] Unable to parse memory type token %s. Expected one of %s followed by an index\n",
token.c_str(), MemTypeStr);
exit(1);
}
}
void ParseExeType(EnvVars const& ev, std::string const& token,
ExeType &exeType, int& exeIndex, int& exeSubIndex)
{
char typeChar;
exeSubIndex = -1;
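// Executor tokens have the form <type><index>[.<subIndex>], e.g. (illustrative) "D0.1" targets
// DMA engine 1 on GPU 0; the sub-index is optional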
int numTokensParsed = sscanf(token.c_str(), " %c%d.%d", &typeChar, &exeIndex, &exeSubIndex);
if (numTokensParsed < 2)
{
printf("[ERROR] Unable to parse valid executor token (%s). Exepected one of %s followed by an index\n",
token.c_str(), ExeTypeStr);
exit(1);
}
exeType = CharToExeType(typeChar);
if (IsCpuType(exeType) && (exeIndex < 0 || exeIndex >= ev.numCpuDevices))
{
printf("[ERROR] CPU index must be between 0 and %d (instead of %d)\n", ev.numCpuDevices-1, exeIndex);
exit(1);
}
if (IsGpuType(exeType) && (exeIndex < 0 || exeIndex >= ev.numGpuDevices))
{
printf("[ERROR] GPU index must be between 0 and %d (instead of %d)\n", ev.numGpuDevices-1, exeIndex);
exit(1);
}
if (exeType == EXE_GPU_GFX && exeSubIndex != -1)
{
int const idx = RemappedIndex(exeIndex, false);
if (ev.xccIdsPerDevice[idx].count(exeSubIndex) == 0)
{
printf("[ERROR] GPU %d does not have subIndex %d\n", exeIndex, exeSubIndex);
exit(1);
}
}
}
// Helper function to parse a list of Transfer definitions
void ParseTransfers(EnvVars const& ev, char* line, std::vector<Transfer>& transfers)
{
// Replace any round brackets or '->' with spaces,
for (int i = 1; line[i]; i++)
if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == '>' ) line[i] = ' ';
transfers.clear();
int numTransfers = 0;
std::istringstream iss(line);
iss >> numTransfers;
if (iss.fail()) return;
std::string exeMem;
std::string srcMem;
std::string dstMem;
// If numTransfers < 0, read 5-tuple (srcMem, exeMem, dstMem, #CUs, #Bytes)
// otherwise read triples (srcMem, exeMem, dstMem)
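// e.g. (illustrative) "1 4 (G0->G0->G1)" describes one Transfer using 4 sub-executors, while
// "-1 (G0->G0->G1 4 1M)" also specifies the sub-executor count and byte count per Transfer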
bool const advancedMode = (numTransfers < 0);
numTransfers = abs(numTransfers);
int numSubExecs;
if (!advancedMode)
{
iss >> numSubExecs;
if (numSubExecs < 0 || iss.fail())
{
printf("Parsing error: Number of blocks to use (%d) must be non-negative\n", numSubExecs);
exit(1);
}
}
size_t numBytes = 0;
for (int i = 0; i < numTransfers; i++)
{
Transfer transfer;
transfer.numBytes = 0;
transfer.numBytesActual = 0;
if (!advancedMode)
{
iss >> srcMem >> exeMem >> dstMem;
if (iss.fail())
{
printf("Parsing error: Unable to read valid Transfer %d (SRC EXE DST) triplet\n", i+1);
exit(1);
}
}
else
{
std::string numBytesToken;
iss >> srcMem >> exeMem >> dstMem >> numSubExecs >> numBytesToken;
if (iss.fail())
{
printf("Parsing error: Unable to read valid Transfer %d (SRC EXE DST #CU #Bytes) tuple\n", i+1);
exit(1);
}
if (sscanf(numBytesToken.c_str(), "%lu", &numBytes) != 1)
{
printf("Parsing error: '%s' is not a valid expression of numBytes for Transfer %d\n", numBytesToken.c_str(), i+1);
exit(1);
}
char units = numBytesToken.back();
switch (toupper(units))
{
case 'K': numBytes *= 1024; break;
case 'M': numBytes *= 1024*1024; break;
case 'G': numBytes *= 1024*1024*1024; break;
}
}
ParseMemType(ev, srcMem, transfer.srcType, transfer.srcIndex);
ParseMemType(ev, dstMem, transfer.dstType, transfer.dstIndex);
ParseExeType(ev, exeMem, transfer.exeType, transfer.exeIndex, transfer.exeSubIndex);
transfer.numSrcs = (int)transfer.srcType.size();
transfer.numDsts = (int)transfer.dstType.size();
if (transfer.numSrcs == 0 && transfer.numDsts == 0)
{
printf("[ERROR] Transfer must have at least one src or dst\n");
exit(1);
}
if (transfer.exeType == EXE_GPU_DMA && (transfer.numSrcs != 1 || transfer.numDsts != 1))
{
printf("[ERROR] GPU DMA executor can only be used for single source + single dst copies\n");
exit(1);
}
transfer.numSubExecs = numSubExecs;
transfer.numBytes = numBytes;
transfers.push_back(transfer);
}
}
void EnablePeerAccess(int const deviceId, int const peerDeviceId)
{
int canAccess;
HIP_CALL(hipDeviceCanAccessPeer(&canAccess, deviceId, peerDeviceId));
if (!canAccess)
{
printf("[ERROR] Unable to enable peer access from GPU devices %d to %d\n", peerDeviceId, deviceId);
exit(1);
}
HIP_CALL(hipSetDevice(deviceId));
hipError_t error = hipDeviceEnablePeerAccess(peerDeviceId, 0);
if (error != hipSuccess && error != hipErrorPeerAccessAlreadyEnabled)
{
printf("[ERROR] Unable to enable peer to peer access from %d to %d (%s)\n",
deviceId, peerDeviceId, hipGetErrorString(error));
exit(1);
}
}
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr)
{
if (numBytes == 0)
{
printf("[ERROR] Unable to allocate 0 bytes\n");
exit(1);
}
*memPtr = nullptr;
if (IsCpuType(memType))
{
// Set numa policy prior to call to hipHostMalloc
numa_set_preferred(devIndex);
// Allocate host-pinned memory (should respect NUMA mem policy)
if (memType == MEM_CPU_FINE)
{
#if defined (__NVCC__)
printf("[ERROR] Fine-grained CPU memory not supported on NVIDIA platform\n");
exit(1);
#else
HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser));
#endif
}
else if (memType == MEM_CPU)
{
#if defined (__NVCC__)
if (hipHostMalloc((void **)memPtr, numBytes, 0) != hipSuccess)
#else
if (hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent) != hipSuccess)
#endif
{
printf("[ERROR] Unable to allocate non-coherent host memory on NUMA node %d\n", devIndex);
exit(1);
}
}
else if (memType == MEM_CPU_UNPINNED)
{
*memPtr = numa_alloc_onnode(numBytes, devIndex);
}
// Check that the allocated pages are actually on the correct NUMA node
memset(*memPtr, 0, numBytes);
CheckPages((char*)*memPtr, numBytes, devIndex);
// Reset to default numa mem policy
numa_set_preferred(-1);
}
else if (IsGpuType(memType))
{
if (memType == MEM_GPU)
{
// Allocate GPU memory on appropriate device
HIP_CALL(hipSetDevice(devIndex));
HIP_CALL(hipMalloc((void**)memPtr, numBytes));
}
else if (memType == MEM_GPU_FINE)
{
#if defined (__NVCC__)
printf("[ERROR] Fine-grained GPU memory not supported on NVIDIA platform\n");
exit(1);
#else
HIP_CALL(hipSetDevice(devIndex));
int flag = hipDeviceMallocUncached;
HIP_CALL(hipExtMallocWithFlags((void**)memPtr, numBytes, flag));
#endif
}
else if (memType == MEM_MANAGED)
{
HIP_CALL(hipSetDevice(devIndex));
HIP_CALL(hipMallocManaged((void**)memPtr, numBytes));
}
HIP_CALL(hipMemset(*memPtr, 0, numBytes));
HIP_CALL(hipDeviceSynchronize());
}
else
{
printf("[ERROR] Unsupported memory type %d\n", memType);
exit(1);
}
}
void DeallocateMemory(MemType memType, void* memPtr, size_t const bytes)
{
if (memType == MEM_CPU || memType == MEM_CPU_FINE)
{
if (memPtr == nullptr)
{
printf("[ERROR] Attempting to free null CPU pointer for %lu bytes. Skipping hipHostFree\n", bytes);
return;
}
HIP_CALL(hipHostFree(memPtr));
}
else if (memType == MEM_CPU_UNPINNED)
{
if (memPtr == nullptr)
{
printf("[ERROR] Attempting to free null unpinned CPU pointer for %lu bytes. Skipping numa_free\n", bytes);
return;
}
numa_free(memPtr, bytes);
}
else if (memType == MEM_GPU || memType == MEM_GPU_FINE)
{
if (memPtr == nullptr)
{
printf("[ERROR] Attempting to free null GPU pointer for %lu bytes. Skipping hipFree\n", bytes);
return;
}
HIP_CALL(hipFree(memPtr));
}
else if (memType == MEM_MANAGED)
{
if (memPtr == nullptr)
{
printf("[ERROR] Attempting to free null managed pointer for %lu bytes. Skipping hipMFree\n", bytes);
return;
}
HIP_CALL(hipFree(memPtr));
}
}
void CheckPages(char* array, size_t numBytes, int targetId)
{
unsigned long const pageSize = getpagesize();
unsigned long const numPages = (numBytes + pageSize - 1) / pageSize;
std::vector<void *> pages(numPages);
std::vector<int> status(numPages);
pages[0] = array;
for (int i = 1; i < numPages; i++)
{
pages[i] = (char*)pages[i-1] + pageSize;
}
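// move_pages() with a NULL target node list only queries the NUMA node each page currently resides on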
long const retCode = move_pages(0, numPages, pages.data(), NULL, status.data(), 0);
if (retCode)
{
printf("[ERROR] Unable to collect page info\n");
exit(1);
}
size_t mistakeCount = 0;
for (int i = 0; i < numPages; i++)
{
if (status[i] < 0)
{
printf("[ERROR] Unexpected page status %d for page %d\n", status[i], i);
exit(1);
}
if (status[i] != targetId) mistakeCount++;
}
if (mistakeCount > 0)
{
printf("[ERROR] %lu out of %lu pages for memory allocation were not on NUMA node %d\n", mistakeCount, numPages, targetId);
exit(1);
}
}
uint32_t GetId(uint32_t hwId)
{
#if defined(__NVCC__)
return hwId;
#else
// Based on instinct-mi200-cdna2-instruction-set-architecture.pdf
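// Packs the SH/CU/SE fields of the HW_ID register into a single linear id,
// e.g. shId=1, cuId=5, seId=2 -> (1<<5) + (5<<2) + 2 = 54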
int const shId = (hwId >> 12) & 1;
int const cuId = (hwId >> 8) & 15;
int const seId = (hwId >> 13) & 3;
return (shId << 5) + (cuId << 2) + seId;
#endif
}
void RunTransfer(EnvVars const& ev, int const iteration,
ExecutorInfo& exeInfo, int const transferIdx)
{
Transfer* transfer = exeInfo.transfers[transferIdx];
if (transfer->exeType == EXE_GPU_GFX)
{
// Switch to executing GPU
int const exeIndex = RemappedIndex(transfer->exeIndex, false);
HIP_CALL(hipSetDevice(exeIndex));
hipStream_t& stream = exeInfo.streams[transferIdx];
hipEvent_t& startEvent = exeInfo.startEvents[transferIdx];
hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx];
// Figure out how many threadblocks to use.
// In single stream mode, all the threadblocks for this GPU are launched
// Otherwise, just launch the threadblocks associated with this single Transfer
int const numBlocksToRun = ev.useSingleStream ? exeInfo.totalSubExecs : transfer->numSubExecs;
int const numXCCs = (ev.useXccFilter ? ev.xccIdsPerDevice[exeIndex].size() : 1);
dim3 const gridSize(numXCCs, numBlocksToRun, 1);
dim3 const blockSize(ev.gfxBlockSize, 1, 1);
#if defined(__NVCC__)
HIP_CALL(hipEventRecord(startEvent, stream));
GpuKernelTable[ev.gfxBlockSize/64 - 1][ev.gfxUnroll - 1]
<<<gridSize, blockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr, ev.gfxWaveOrder, ev.numSubIterations);
HIP_CALL(hipEventRecord(stopEvent, stream));
#else
hipExtLaunchKernelGGL(GpuKernelTable[ev.gfxBlockSize/64 - 1][ev.gfxUnroll - 1],
gridSize, blockSize,
ev.sharedMemBytes, stream,
startEvent, stopEvent,
0, transfer->subExecParamGpuPtr, ev.gfxWaveOrder, ev.numSubIterations);
#endif
// Synchronize per iteration, unless in single sync mode, in which case
// synchronize during last warmup / last actual iteration
HIP_CALL(hipStreamSynchronize(stream));
if (iteration >= 0)
{
// Record GPU timing
float gpuDeltaMsec;
HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
if (ev.useSingleStream)
{
// Figure out individual timings for Transfers that were all launched together
for (Transfer* currTransfer : exeInfo.transfers)
{
long long minStartCycle = std::numeric_limits<long long>::max();
long long maxStopCycle = std::numeric_limits<long long>::min();
std::set<std::pair<int,int>> CUs;
for (auto subExecIdx : currTransfer->subExecIdx)
{
minStartCycle = std::min(minStartCycle, exeInfo.subExecParamGpu[subExecIdx].startCycle);
maxStopCycle = std::max(maxStopCycle, exeInfo.subExecParamGpu[subExecIdx].stopCycle);
if (ev.showIterations)
CUs.insert(std::make_pair(exeInfo.subExecParamGpu[subExecIdx].xccId,
GetId(exeInfo.subExecParamGpu[subExecIdx].hwId)));
}
int const wallClockRate = ev.wallClockPerDeviceMhz[exeIndex];
double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate);
currTransfer->transferTime += iterationTimeMs;
if (ev.showIterations)
{
currTransfer->perIterationTime.push_back(iterationTimeMs);
currTransfer->perIterationCUs.push_back(CUs);
}
}
exeInfo.totalTime += gpuDeltaMsec;
}
else
{
transfer->transferTime += gpuDeltaMsec;
if (ev.showIterations)
{
transfer->perIterationTime.push_back(gpuDeltaMsec);
std::set<std::pair<int,int>> CUs;
for (int i = 0; i < transfer->numSubExecs; i++)
CUs.insert(std::make_pair(transfer->subExecParamGpuPtr[i].xccId,
GetId(transfer->subExecParamGpuPtr[i].hwId)));
transfer->perIterationCUs.push_back(CUs);
}
}
}
}
else if (transfer->exeType == EXE_GPU_DMA)
{
int const exeIndex = RemappedIndex(transfer->exeIndex, false);
int subIterations = 0;
if (transfer->exeSubIndex == -1 && !ev.useHsaDma) {
// Switch to executing GPU
HIP_CALL(hipSetDevice(exeIndex));
hipStream_t& stream = exeInfo.streams[transferIdx];
hipEvent_t& startEvent = exeInfo.startEvents[transferIdx];
hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx];
HIP_CALL(hipEventRecord(startEvent, stream));
do {
HIP_CALL(hipMemcpyAsync(transfer->dstMem[0], transfer->srcMem[0],
transfer->numBytesActual, hipMemcpyDefault,
stream));
} while (++subIterations != ev.numSubIterations);
HIP_CALL(hipEventRecord(stopEvent, stream));
HIP_CALL(hipStreamSynchronize(stream));
// Record time based on HIP events
if (iteration >= 0) {
float gpuDeltaMsec;
HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
transfer->transferTime += gpuDeltaMsec;
if (ev.showIterations)
transfer->perIterationTime.push_back(gpuDeltaMsec);
}
} else {
#if defined(__NVCC__)
printf("[ERROR] CUDA does not support targeting specific DMA engines\n");
exit(1);
#else
// Use hsa_amd_memory copy (either targeted or untargeted)
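// The completion signal is set to 1 before each copy; the copy engine decrements it to 0 when the
// copy finishes, which is what the hsa_signal_wait_scacquire loop below waits for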
auto cpuStart = std::chrono::high_resolution_clock::now();
do {
// Atomically set signal to 1
HSA_CALL(hsa_signal_store_screlease(transfer->signal, 1));
if (ev.useHsaDma) {
HSA_CALL(hsa_amd_memory_async_copy(transfer->dstMem[0], transfer->dstAgent,
transfer->srcMem[0], transfer->srcAgent,
transfer->numBytesActual, 0, NULL,
transfer->signal));
} else {
HSA_CALL(hsa_amd_memory_async_copy_on_engine(transfer->dstMem[0], transfer->dstAgent,
transfer->srcMem[0], transfer->srcAgent,
transfer->numBytesActual, 0, NULL,
transfer->signal,
transfer->sdmaEngineId, true));
}
// Wait for SDMA transfer to complete
while(hsa_signal_wait_scacquire(transfer->signal,
HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
HSA_WAIT_STATE_ACTIVE) >= 1);
} while (++subIterations < ev.numSubIterations);
if (iteration >= 0) {
// Record GPU timing
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
transfer->transferTime += deltaMsec;
if (ev.showIterations)
transfer->perIterationTime.push_back(deltaMsec);
}
#endif
}
}
else if (transfer->exeType == EXE_CPU) // CPU execution agent
{
// Force this thread and all child threads onto correct NUMA node
int const exeIndex = RemappedIndex(transfer->exeIndex, true);
if (numa_run_on_node(exeIndex))
{
printf("[ERROR] Unable to set CPU to NUMA node %d\n", exeIndex);
exit(1);
}
std::vector<std::thread> childThreads;
int subIteration = 0;
auto cpuStart = std::chrono::high_resolution_clock::now();
do {
// Launch each subExecutor in child-threads to perform memcopies
for (int i = 0; i < transfer->numSubExecs; ++i)
childThreads.push_back(std::thread(CpuReduceKernel, std::ref(transfer->subExecParam[i])));
// Wait for child-threads to finish
for (int i = 0; i < transfer->numSubExecs; ++i)
childThreads[i].join();
childThreads.clear();
} while (++subIteration != ev.numSubIterations);
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
// Record time if not a warmup iteration
if (iteration >= 0)
{
double const delta = (std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0);
transfer->transferTime += delta;
if (ev.showIterations)
transfer->perIterationTime.push_back(delta);
}
}
}
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
{
ev.DisplayP2PBenchmarkEnvVars();
char const separator = ev.outputToCsv ? ',' : ' ';
printf("Bytes Per Direction%c%lu\n", separator, N * sizeof(float));
// Collect the number of available CPUs/GPUs on this machine
int const numCpus = ev.numCpuDevices;
int const numGpus = ev.numGpuDevices;
int const numDevices = numCpus + numGpus;
// Enable peer to peer for each GPU
for (int i = 0; i < numGpus; i++)
for (int j = 0; j < numGpus; j++)
if (i != j) EnablePeerAccess(i, j);
// Perform unidirectional / bidirectional
for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++)
{
if (ev.p2pMode == 1 && isBidirectional == 1 ||
ev.p2pMode == 2 && isBidirectional == 0) continue;
printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n", isBidirectional ? "Bi" : "Uni",
ev.useRemoteRead ? "Remote" : "Local",
ev.useRemoteRead ? "Local" : "Remote",
ev.useDmaCopy ? "DMA" : "GFX");
// Print header
if (isBidirectional)
{
printf("%12s", "SRC\\DST");
}
else
{
if (ev.useRemoteRead)
printf("%12s", "SRC\\EXE+DST");
else
printf("%12s", "SRC+EXE\\DST");
}
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numCpus; i++)
{
printf("%7s %02d", "CPU", i);
if (ev.outputToCsv) printf(",");
}
if (numCpus > 0) printf(" ");
for (int i = 0; i < numGpus; i++)
{
printf("%7s %02d", "GPU", i);
if (ev.outputToCsv) printf(",");
}
printf("\n");
double avgBwSum[2][2] = {};
int avgCount[2][2] = {};
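// Accumulators for the summary table: bandwidth sums and sample counts indexed by
// [source memory type][destination memory type] (CPU vs GPU)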
ExeType const gpuExeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
// Loop over all possible src/dst pairs
for (int src = 0; src < numDevices; src++)
{
MemType const srcType = (src < numCpus ? MEM_CPU : MEM_GPU);
int const srcIndex = (srcType == MEM_CPU ? src : src - numCpus);
MemType const srcTypeActual = ((ev.useFineGrain && srcType == MEM_CPU) ? MEM_CPU_FINE :
(ev.useFineGrain && srcType == MEM_GPU) ? MEM_GPU_FINE :
srcType);
std::vector<std::vector<double>> avgBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> minBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> maxBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> stdDev(isBidirectional + 1);
if (src == numCpus && src != 0) printf("\n");
for (int dst = 0; dst < numDevices; dst++)
{
MemType const dstType = (dst < numCpus ? MEM_CPU : MEM_GPU);
int const dstIndex = (dstType == MEM_CPU ? dst : dst - numCpus);
MemType const dstTypeActual = ((ev.useFineGrain && dstType == MEM_CPU) ? MEM_CPU_FINE :
(ev.useFineGrain && dstType == MEM_GPU) ? MEM_GPU_FINE :
dstType);
// Prepare Transfers
std::vector<Transfer> transfers(isBidirectional + 1);
// SRC -> DST
transfers[0].numBytes = N * sizeof(float);
transfers[0].srcType.push_back(srcTypeActual);
transfers[0].dstType.push_back(dstTypeActual);
transfers[0].srcIndex.push_back(srcIndex);
transfers[0].dstIndex.push_back(dstIndex);
transfers[0].numSrcs = transfers[0].numDsts = 1;
transfers[0].exeType = IsGpuType(ev.useRemoteRead ? dstType : srcType) ? gpuExeType : EXE_CPU;
transfers[0].exeIndex = (ev.useRemoteRead ? dstIndex : srcIndex);
transfers[0].exeSubIndex = -1;
transfers[0].numSubExecs = IsGpuType(transfers[0].exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs;
// DST -> SRC
if (isBidirectional)
{
transfers[1].numBytes = N * sizeof(float);
transfers[1].numSrcs = transfers[1].numDsts = 1;
transfers[1].srcType.push_back(dstTypeActual);
transfers[1].dstType.push_back(srcTypeActual);
transfers[1].srcIndex.push_back(dstIndex);
transfers[1].dstIndex.push_back(srcIndex);
transfers[1].exeType = IsGpuType(ev.useRemoteRead ? srcType : dstType) ? gpuExeType : EXE_CPU;
transfers[1].exeIndex = (ev.useRemoteRead ? srcIndex : dstIndex);
transfers[1].exeSubIndex = -1;
transfers[1].numSubExecs = IsGpuType(transfers[1].exeType) ? ev.numGpuSubExecs : ev.numCpuSubExecs;
}
bool skipTest = false;
// Skip tests that would execute on a NUMA node with no CPUs
for (int i = 0; i <= isBidirectional; i++)
{
if (transfers[i].exeType == EXE_CPU && ev.numCpusPerNuma[transfers[i].exeIndex] == 0)
{
skipTest = true;
break;
}
#if defined(__NVCC__)
// NVIDIA platform cannot access GPU memory directly from CPU executors
if (transfers[i].exeType == EXE_CPU && (IsGpuType(srcType) || IsGpuType(dstType)))
{
skipTest = true;
break;
}
#endif
}
if (isBidirectional && srcType == dstType && srcIndex == dstIndex) skipTest = true;
if (!skipTest)
{
ExecuteTransfers(ev, 0, N, transfers, false);
for (int dir = 0; dir <= isBidirectional; dir++)
{
double const avgTime = transfers[dir].transferTime;
double const avgBw = (transfers[dir].numBytesActual / 1.0E9) / avgTime * 1000.0f;
avgBandwidth[dir].push_back(avgBw);
if (!(srcType == dstType && srcIndex == dstIndex))
{
avgBwSum[srcType][dstType] += avgBw;
avgCount[srcType][dstType]++;
}
if (ev.showIterations)
{
double minTime = transfers[dir].perIterationTime[0];
double maxTime = transfers[dir].perIterationTime[0];
double varSum = 0;
for (int i = 0; i < transfers[dir].perIterationTime.size(); i++)
{
minTime = std::min(minTime, transfers[dir].perIterationTime[i]);
maxTime = std::max(maxTime, transfers[dir].perIterationTime[i]);
double const bw = (transfers[dir].numBytesActual / 1.0E9) / transfers[dir].perIterationTime[i] * 1000.0f;
double const delta = (avgBw - bw);
varSum += delta * delta;
}
double const minBw = (transfers[dir].numBytesActual / 1.0E9) / maxTime * 1000.0f;
double const maxBw = (transfers[dir].numBytesActual / 1.0E9) / minTime * 1000.0f;
double const stdev = sqrt(varSum / transfers[dir].perIterationTime.size());
minBandwidth[dir].push_back(minBw);
maxBandwidth[dir].push_back(maxBw);
stdDev[dir].push_back(stdev);
}
}
}
else
{
for (int dir = 0; dir <= isBidirectional; dir++)
{
avgBandwidth[dir].push_back(0);
minBandwidth[dir].push_back(0);
maxBandwidth[dir].push_back(0);
stdDev[dir].push_back(-1.0);
}
}
}
for (int dir = 0; dir <= isBidirectional; dir++)
{
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, dir ? "<- " : " ->");
if (ev.outputToCsv) printf(",");
for (int dst = 0; dst < numDevices; dst++)
{
if (dst == numCpus && dst != 0) printf(" ");
double const avgBw = avgBandwidth[dir][dst];
if (avgBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", avgBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
if (ev.showIterations)
{
// minBw
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "min");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++)
{
double const minBw = minBandwidth[dir][i];
if (i == numCpus && i != 0) printf(" ");
if (minBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", minBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
// maxBw
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "max");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++)
{
double const maxBw = maxBandwidth[dir][i];
if (i == numCpus && i != 0) printf(" ");
if (maxBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", maxBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
// stddev
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, " sd");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++)
{
double const sd = stdDev[dir][i];
if (i == numCpus && i != 0) printf(" ");
if (sd == -1.0)
printf("%10s", "N/A");
else
printf("%10.2f", sd);
if (ev.outputToCsv) printf(",");
}
printf("\n");
}
fflush(stdout);
}
if (isBidirectional)
{
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "<->");
if (ev.outputToCsv) printf(",");
for (int dst = 0; dst < numDevices; dst++)
{
double const sumBw = avgBandwidth[0][dst] + avgBandwidth[1][dst];
if (dst == numCpus && dst != 0) printf(" ");
if (sumBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", sumBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
if (src < numDevices - 1) printf("\n");
}
}
if (!ev.outputToCsv)
{
printf(" ");
for (int srcType : {MEM_CPU, MEM_GPU})
for (int dstType : {MEM_CPU, MEM_GPU})
printf(" %cPU->%cPU", srcType == MEM_CPU ? 'C' : 'G', dstType == MEM_CPU ? 'C' : 'G');
printf("\n");
printf("Averages (During %s):", isBidirectional ? " BiDir" : "UniDir");
for (int srcType : {MEM_CPU, MEM_GPU})
for (int dstType : {MEM_CPU, MEM_GPU})
{
if (avgCount[srcType][dstType])
printf("%10.2f", avgBwSum[srcType][dstType] / avgCount[srcType][dstType]);
else
printf("%10s", "N/A");
}
printf("\n\n");
}
}
}
void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int const maxSubExecs)
{
ev.DisplayEnvVars();
// Collect the number of available CPUs/GPUs on this machine
int const numCpus = ev.numCpuDevices;
int const numGpus = ev.numGpuDevices;
int const numDevices = numCpus + numGpus;
// Enable peer to peer for each GPU
for (int i = 0; i < numGpus; i++)
for (int j = 0; j < numGpus; j++)
if (i != j) EnablePeerAccess(i, j);
char separator = (ev.outputToCsv ? ',' : ' ');
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.numBytes = N * sizeof(float);
t.numSrcs = 1;
t.numDsts = 1;
t.exeType = EXE_GPU_GFX;
t.exeIndex = exeIndex;
t.exeSubIndex = -1;
t.srcType.resize(1, MEM_GPU);
t.dstType.resize(1, MEM_GPU);
t.srcIndex.resize(1);
t.dstIndex.resize(1);
printf("GPU-GFX Scaling benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes from GPU %d to other devices\n", t.numBytes, exeIndex);
printf("- All numbers reported as GB/sec\n\n");
printf("NumCUs");
for (int i = 0; i < numDevices; i++)
printf("%c %s%02d ", separator, i < numCpus ? "CPU" : "GPU", i < numCpus ? i : i - numCpus);
printf("\n");
std::vector<std::pair<double, int>> bestResult(numDevices);
for (int numSubExec = 1; numSubExec <= maxSubExecs; numSubExec++)
{
t.numSubExecs = numSubExec;
printf("%4d ", numSubExec);
for (int i = 0; i < numDevices; i++)
{
t.dstType[0] = i < numCpus ? MEM_CPU : MEM_GPU;
t.dstIndex[0] = i < numCpus ? i : i - numCpus;
ExecuteTransfers(ev, 0, N, transfers, false);
printf("%c%7.2f ", separator, t.transferBandwidth);
if (t.transferBandwidth > bestResult[i].first)
{
bestResult[i].first = t.transferBandwidth;
bestResult[i].second = numSubExec;
}
}
printf("\n");
}
printf(" Best ");
for (int i = 0; i < numDevices; i++)
{
printf("%c%7.2f(%3d)", separator, bestResult[i].first, bestResult[i].second);
}
printf("\n");
}
void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const numSubExecs)
{
ev.DisplayA2AEnvVars();
// Collect the number of GPU devices to use
int const numGpus = ev.numGpuDevices;
// Enable peer to peer for each GPU
for (int i = 0; i < numGpus; i++)
for (int j = 0; j < numGpus; j++)
if (i != j) EnablePeerAccess(i, j);
char separator = (ev.outputToCsv ? ',' : ' ');
Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
transfer.numSubExecs = numSubExecs;
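// a2aMode selects the access pattern: 1 = read-only (no destinations), 2 = write-only (no sources), otherwise a full copy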
transfer.numSrcs = ev.a2aMode == 2 ? 0 : 1;
transfer.numDsts = ev.a2aMode == 1 ? 0 : 1;
transfer.exeType = (ev.useDmaCopy && transfer.numSrcs == 1 && transfer.numDsts == 1) ? EXE_GPU_DMA : EXE_GPU_GFX;
transfer.exeSubIndex = -1;
transfer.srcType.resize(1, ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
transfer.dstType.resize(1, ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
transfer.srcIndex.resize(1);
transfer.dstIndex.resize(1);
std::vector<Transfer> transfers;
for (int i = 0; i < numGpus; i++)
{
transfer.srcIndex[0] = i;
for (int j = 0; j < numGpus; j++)
{
transfer.dstIndex[0] = j;
transfer.exeIndex = (ev.useRemoteRead ? j : i);
if (ev.a2aDirect)
{
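// a2aDirect restricts the benchmark to GPU pairs connected by a single hop (self-copies are always excluded)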
if (i == j) continue;
#if !defined(__NVCC__)
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(i, false),
RemappedIndex(j, false),
&linkType, &hopCount));
if (hopCount != 1) continue;
#endif
}
transfers.push_back(transfer);
}
}
printf("GPU-GFX All-To-All benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes between %s pairs of GPUs using %d CUs (%lu Transfers)\n",
numBytesPerTransfer, ev.a2aDirect ? "directly connected" : "all", numSubExecs, transfers.size());
if (transfers.size() == 0) return;
double totalBandwidthCpu = 0;
ExecuteTransfers(ev, 0, numBytesPerTransfer / sizeof(float), transfers, !ev.hideEnv, &totalBandwidthCpu);
printf("\nSummary:\n");
printf("==========================================================\n");
printf("SRC\\DST ");
for (int dst = 0; dst < numGpus; dst++)
printf("%cGPU %02d ", separator, dst);
printf(" %cSTotal %cActual\n", separator, separator);
std::map<std::pair<int, int>, int> reIndex;
for (int i = 0; i < transfers.size(); i++)
{
Transfer const& t = transfers[i];
reIndex[std::make_pair(t.srcIndex[0], t.dstIndex[0])] = i;
}
double totalBandwidthGpu = 0.0;
double minExecutorBandwidth = std::numeric_limits<double>::max();
double maxExecutorBandwidth = 0.0;
std::vector<double> colTotalBandwidth(numGpus+1, 0.0);
for (int src = 0; src < numGpus; src++)
{
double rowTotalBandwidth = 0;
double executorBandwidth = 0;
printf("GPU %02d", src);
for (int dst = 0; dst < numGpus; dst++)
{
if (reIndex.count(std::make_pair(src, dst)))
{
Transfer const& transfer = transfers[reIndex[std::make_pair(src,dst)]];
colTotalBandwidth[dst] += transfer.transferBandwidth;
rowTotalBandwidth += transfer.transferBandwidth;
totalBandwidthGpu += transfer.transferBandwidth;
executorBandwidth = std::max(executorBandwidth, transfer.executorBandwidth);
printf("%c%8.3f ", separator, transfer.transferBandwidth);
}
else
{
printf("%c%8s ", separator, "N/A");
}
}
printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator, executorBandwidth);
minExecutorBandwidth = std::min(minExecutorBandwidth, executorBandwidth);
maxExecutorBandwidth = std::max(maxExecutorBandwidth, executorBandwidth);
colTotalBandwidth[numGpus] += rowTotalBandwidth;
}
printf("\nRTotal");
for (int dst = 0; dst < numGpus; dst++)
{
printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
}
printf(" %c%8.3f %c%8.3f %c%8.3f\n", separator, colTotalBandwidth[numGpus],
separator, minExecutorBandwidth, separator, maxExecutorBandwidth);
printf("\n");
printf("Average bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu / transfers.size());
printf("Aggregate bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu);
printf("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", totalBandwidthCpu);
}
void Transfer::PrepareSubExecParams(EnvVars const& ev)
{
// Each subExecutor needs to know src/dst pointers and how many elements to transfer
// Figure out the sub-array each subExecutor works on for this Transfer
// - Partition N as evenly as possible, but try to keep subarray sizes as multiples of BLOCK_BYTES bytes,
// except the very last one, for alignment reasons
size_t const N = this->numBytesActual / sizeof(float);
int const initOffset = ev.byteOffset / sizeof(float);
int const targetMultiple = ev.blockBytes / sizeof(float);
// In some cases, there may not be enough data for all subExecutors
int const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple, (size_t)this->numSubExecs);
this->subExecParam.clear();
this->subExecParam.resize(this->numSubExecs);
size_t assigned = 0;
for (int i = 0; i < this->numSubExecs; ++i)
{
SubExecParam& p = this->subExecParam[i];
p.numSrcs = this->numSrcs;
p.numDsts = this->numDsts;
if (ev.gfxSingleTeam && this->exeType == EXE_GPU_GFX)
{
p.N = N;
p.teamSize = this->numSubExecs;
p.teamIdx = i;
for (int iSrc = 0; iSrc < this->numSrcs; ++iSrc) p.src[iSrc] = this->srcMem[iSrc] + initOffset;
for (int iDst = 0; iDst < this->numDsts; ++iDst) p.dst[iDst] = this->dstMem[iDst] + initOffset;
}
else
{
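// Divide the remaining elements among the remaining subExecutors, rounding each chunk to a
// multiple of targetMultiple; the final subExecutor absorbs any leftover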
int const subExecLeft = std::max(0, maxSubExecToUse - i);
size_t const leftover = N - assigned;
size_t const roundedN = (leftover + targetMultiple - 1) / targetMultiple;
p.N = subExecLeft ? std::min(leftover, ((roundedN / subExecLeft) * targetMultiple)) : 0;
p.teamSize = 1;
p.teamIdx = 0;
for (int iSrc = 0; iSrc < this->numSrcs; ++iSrc) p.src[iSrc] = this->srcMem[iSrc] + initOffset + assigned;
for (int iDst = 0; iDst < this->numDsts; ++iDst) p.dst[iDst] = this->dstMem[iDst] + initOffset + assigned;
assigned += p.N;
}
p.preferredXccId = -1;
if (ev.useXccFilter && this->exeType == EXE_GPU_GFX)
{
std::uniform_int_distribution<int> distribution(0, ev.xccIdsPerDevice[this->exeIndex].size() - 1);
// Use this transfer's executor subIndex if set
if (this->exeSubIndex != -1)
{
p.preferredXccId = this->exeSubIndex;
}
else if (this->numDsts >= 1 && IsGpuType(this->dstType[0]))
{
p.preferredXccId = ev.prefXccTable[this->exeIndex][this->dstIndex[0]];
}
if (p.preferredXccId == -1)
{
p.preferredXccId = distribution(*ev.generator);
}
}
if (ev.enableDebug)
{
printf("Transfer %02d SE:%02d: %10lu floats: %10lu to %10lu\n",
this->transferIndex, i, p.N, assigned, assigned + p.N);
}
p.startCycle = 0;
p.stopCycle = 0;
}
this->transferTime = 0.0;
this->perIterationTime.clear();
}
void Transfer::PrepareReference(EnvVars const& ev, std::vector<float>& buffer, int bufferIdx)
{
size_t N = buffer.size();
if (bufferIdx >= 0)
{
size_t patternLen = ev.fillPattern.size();
if (patternLen > 0)
{
for (size_t i = 0; i < N; ++i)
buffer[i] = ev.fillPattern[i % patternLen];
}
else
{
for (size_t i = 0; i < N; ++i)
buffer[i] = PrepSrcValue(bufferIdx, i);
}
}
else // Destination buffer
{
if (this->numSrcs == 0)
{
// Note: MEMSET_CHAR is 75 (0x4B), so memset fills each float with bit pattern 0x4B4B4B4B = 13323083.0f
memset(buffer.data(), MEMSET_CHAR, N * sizeof(float));
}
else
{
PrepareReference(ev, buffer, 0);
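// The expected destination contents are the elementwise sum of all source reference patterns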
if (this->numSrcs > 1)
{
std::vector<float> temp(N);
for (int srcIdx = 1; srcIdx < this->numSrcs; ++srcIdx)
{
PrepareReference(ev, temp, srcIdx);
for (int i = 0; i < N; ++i)
{
buffer[i] += temp[i];
}
}
}
}
}
}
bool Transfer::PrepareSrc(EnvVars const& ev)
{
if (this->numSrcs == 0) return true;
size_t const N = this->numBytesActual / sizeof(float);
int const initOffset = ev.byteOffset / sizeof(float);
std::vector<float> reference(N);
for (int srcIdx = 0; srcIdx < this->numSrcs; ++srcIdx)
{
float* srcPtr = this->srcMem[srcIdx] + initOffset;
PrepareReference(ev, reference, srcIdx);
// Initialize source memory array with reference pattern
if (IsGpuType(this->srcType[srcIdx]))
{
int const deviceIdx = RemappedIndex(this->srcIndex[srcIdx], false);
HIP_CALL(hipSetDevice(deviceIdx));
if (ev.usePrepSrcKernel)
PrepSrcDataKernel<<<32, ev.gfxBlockSize>>>(srcPtr, N, srcIdx);
else
HIP_CALL(hipMemcpy(srcPtr, reference.data(), this->numBytesActual, hipMemcpyDefault));
HIP_CALL(hipDeviceSynchronize());
}
else if (IsCpuType(this->srcType[srcIdx]))
{
memcpy(srcPtr, reference.data(), this->numBytesActual);
}
// Perform check just to make sure that data has been copied properly
float* srcCheckPtr = srcPtr;
std::vector<float> srcCopy(N);
if (IsGpuType(this->srcType[srcIdx]))
{
if (!ev.validateDirect)
{
HIP_CALL(hipMemcpy(srcCopy.data(), srcPtr, this->numBytesActual, hipMemcpyDefault));
HIP_CALL(hipDeviceSynchronize());
srcCheckPtr = srcCopy.data();
}
}
for (size_t i = 0; i < N; ++i)
{
if (reference[i] != srcCheckPtr[i])
{
printf("\n[ERROR] Unexpected mismatch at index %lu of source array %d:\n", i, srcIdx);
#if !defined(__NVCC__)
float const val = this->srcMem[srcIdx][initOffset + i];
printf("[ERROR] SRC %02d value: %10.5f [%08X] Direct: %10.5f [%08X]\n",
srcIdx, srcCheckPtr[i], *(unsigned int*)&srcCheckPtr[i], val, *(unsigned int*)&val);
#else
printf("[ERROR] SRC %02d value: %10.5f [%08X]\n", srcIdx, srcCheckPtr[i], *(unsigned int*)&srcCheckPtr[i]);
#endif
printf("[ERROR] EXPECTED value: %10.5f [%08X]\n", reference[i], *(unsigned int*)&reference[i]);
printf("[ERROR] Failed Transfer details: #%d: %s -> [%c%d:%d] -> %s\n",
this->transferIndex,
this->SrcToStr().c_str(),
ExeTypeStr[this->exeType], this->exeIndex,
this->numSubExecs,
this->DstToStr().c_str());
printf("[ERROR] Possible cause is misconfigured IOMMU (AMD Instinct cards require amd_iommu=on and iommu=pt)\n");
printf("[ERROR] Please see https://community.amd.com/t5/knowledge-base/iommu-advisory-for-amd-instinct/ta-p/484601 for more details\n");
if (!ev.continueOnError)
exit(1);
return false;
}
}
}
return true;
}
void Transfer::ValidateDst(EnvVars const& ev)
{
if (this->numDsts == 0) return;
size_t const N = this->numBytesActual / sizeof(float);
int const initOffset = ev.byteOffset / sizeof(float);
std::vector<float> reference(N);
PrepareReference(ev, reference, -1);
std::vector<float> hostBuffer(N);
for (int dstIdx = 0; dstIdx < this->numDsts; ++dstIdx)
{
float* output;
if (IsCpuType(this->dstType[dstIdx]) || ev.validateDirect)
{
output = this->dstMem[dstIdx] + initOffset;
}
else
{
int const deviceIdx = RemappedIndex(this->dstIndex[dstIdx], false);
HIP_CALL(hipSetDevice(deviceIdx));
HIP_CALL(hipMemcpy(hostBuffer.data(), this->dstMem[dstIdx] + initOffset, this->numBytesActual, hipMemcpyDefault));
HIP_CALL(hipDeviceSynchronize());
output = hostBuffer.data();
}
for (size_t i = 0; i < N; ++i)
{
if (reference[i] != output[i])
{
printf("\n[ERROR] Unexpected mismatch at index %lu of destination array %d:\n", i, dstIdx);
for (int srcIdx = 0; srcIdx < this->numSrcs; ++srcIdx)
{
float srcVal;
HIP_CALL(hipMemcpy(&srcVal, this->srcMem[srcIdx] + initOffset + i, sizeof(float), hipMemcpyDefault));
#if !defined(__NVCC__)
float val = this->srcMem[srcIdx][initOffset + i];
printf("[ERROR] SRC %02dD value: %10.5f [%08X] Direct: %10.5f [%08X]\n",
srcIdx, srcVal, *(unsigned int*)&srcVal, val, *(unsigned int*)&val);
#else
printf("[ERROR] SRC %02d value: %10.5f [%08X]\n", srcIdx, srcVal, *(unsigned int*)&srcVal);
#endif
}
printf("[ERROR] EXPECTED value: %10.5f [%08X]\n", reference[i], *(unsigned int*)&reference[i]);
#if !defined(__NVCC__)
float dstVal = this->dstMem[dstIdx][initOffset + i];
printf("[ERROR] DST %02d value: %10.5f [%08X] Direct: %10.5f [%08X]\n",
dstIdx, output[i], *(unsigned int*)&output[i], dstVal, *(unsigned int*)&dstVal);
#else
printf("[ERROR] DST %02d value: %10.5f [%08X]\n", dstIdx, output[i], *(unsigned int*)&output[i]);
#endif
printf("[ERROR] Failed Transfer details: #%d: %s -> [%c%d:%d] -> %s\n",
this->transferIndex,
this->SrcToStr().c_str(),
ExeTypeStr[this->exeType], this->exeIndex,
this->numSubExecs,
this->DstToStr().c_str());
if (!ev.continueOnError)
exit(1);
else
break;
}
}
}
}
std::string Transfer::SrcToStr() const
{
if (numSrcs == 0) return "N";
std::stringstream ss;
for (int i = 0; i < numSrcs; ++i)
ss << MemTypeStr[srcType[i]] << srcIndex[i];
return ss.str();
}
std::string Transfer::DstToStr() const
{
if (numDsts == 0) return "N";
std::stringstream ss;
for (int i = 0; i < numDsts; ++i)
ss << MemTypeStr[dstType[i]] << dstIndex[i];
return ss.str();
}
void RunSchmooBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs)
{
char memType = ev.useFineGrain ? 'F' : 'G';
printf("Bytes to transfer: %lu Local GPU: %d Remote GPU: %d\n", numBytesPerTransfer, localIdx, remoteIdx);
printf(" | Local Read | Local Write | Local Copy | Remote Read | Remote Write| Remote Copy |\n");
printf(" #CUs |%c%02d->G%02d->N00|N00->G%02d->%c%02d|%c%02d->G%02d->%c%02d|%c%02d->G%02d->N00|N00->G%02d->%c%02d|%c%02d->G%02d->%c%02d|\n",
memType, localIdx, localIdx,
localIdx, memType, localIdx,
memType, localIdx, localIdx, memType, localIdx,
memType, remoteIdx, localIdx,
localIdx, memType, remoteIdx,
memType, localIdx, localIdx, memType, remoteIdx);
printf("|------|-------------|-------------|-------------|-------------|-------------|-------------|\n");
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeType = EXE_GPU_GFX;
t.exeIndex = localIdx;
t.exeSubIndex = -1;
t.numBytes = numBytesPerTransfer;
for (int numCUs = 1; numCUs <= maxSubExecs; numCUs++)
{
t.numSubExecs = numCUs;
// Local Read
t.numSrcs = 1;
t.numDsts = 0;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex[0] = localIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const localRead = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
// Local Write
t.numSrcs = 0;
t.numDsts = 1;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex[0] = localIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const localWrite = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
// Local Copy
t.numSrcs = 1;
t.numDsts = 1;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex[0] = localIdx;
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex[0] = localIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const localCopy = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
// Remote Read
t.numSrcs = 1;
t.numDsts = 0;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex[0] = remoteIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const remoteRead = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
// Remote Write
t.numSrcs = 0;
t.numDsts = 1;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex[0] = remoteIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const remoteWrite = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
// Remote Copy
t.numSrcs = 1;
t.numDsts = 1;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex[0] = localIdx;
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex[0] = remoteIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const remoteCopy = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
printf(" %3d %11.3f %11.3f %11.3f %11.3f %11.3f %11.3f \n",
numCUs, localRead, localWrite, localCopy, remoteRead, remoteWrite, remoteCopy);
}
}
void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
{
printf("Bytes to %s: %lu from GPU %d using %d CUs [Sweeping %d to %d parallel writes]\n",
ev.useRemoteRead ? "read" : "write", numBytesPerTransfer, srcIdx, numSubExecs, minGpus, maxGpus);
char sep = (ev.outputToCsv ? ',' : ' ');
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (i == srcIdx) continue;
printf(" GPU %-3d %c", i, sep);
}
printf("\n");
if (!ev.outputToCsv)
{
for (int i = 0; i < ev.numGpuDevices-1; i++)
{
printf("-------------");
}
printf("\n");
}
for (int p = minGpus; p <= maxGpus; p++)
{
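// Enumerate every subset of GPUs (excluding the source GPU) with exactly p members;
// each set bit in the bitmask selects one participating peer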
for (int bitmask = 0; bitmask < (1<<ev.numGpuDevices); bitmask++)
{
if (bitmask & (1<<srcIdx)) continue;
if (__builtin_popcount(bitmask) == p)
{
std::vector<Transfer> transfers;
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (bitmask & (1<<i))
{
Transfer t;
t.exeType = EXE_GPU_GFX;
t.exeSubIndex = -1;
t.numSubExecs = numSubExecs;
t.numBytes = numBytesPerTransfer;
if (ev.useRemoteRead)
{
t.numSrcs = 1;
t.numDsts = 0;
t.exeIndex = i;
t.srcType.resize(1);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex.resize(1);
t.srcIndex[0] = srcIdx;
}
else
{
t.numSrcs = 0;
t.numDsts = 1;
t.exeIndex = srcIdx;
t.dstType.resize(1);
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex.resize(1);
t.dstIndex[0] = i;
}
transfers.push_back(t);
}
}
ExecuteTransfers(ev, 0, 0, transfers, false);
int counter = 0;
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (bitmask & (1<<i))
printf(" %8.3f %c", transfers[counter++].transferBandwidth, sep);
else if (i != srcIdx)
printf(" %c", sep);
}
printf(" %d %d", p, numSubExecs);
for (auto i = 0; i < transfers.size(); i++)
{
printf(" (%s %c%d %s)",
transfers[i].SrcToStr().c_str(),
ExeTypeStr[transfers[i].exeType], transfers[i].exeIndex,
transfers[i].DstToStr().c_str());
}
printf("\n");
}
}
}
}
void RunParallelCopyBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
{
if (ev.useDmaCopy)
printf("Bytes to copy: %lu from GPU %d using DMA [Sweeping %d to %d parallel writes]\n",
numBytesPerTransfer, srcIdx, minGpus, maxGpus);
else
printf("Bytes to copy: %lu from GPU %d using GFX (%d CUs) [Sweeping %d to %d parallel writes]\n",
numBytesPerTransfer, srcIdx, numSubExecs, minGpus, maxGpus);
char sep = (ev.outputToCsv ? ',' : ' ');
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (i == srcIdx) continue;
printf(" GPU %-3d %c", i, sep);
}
printf("\n");
if (!ev.outputToCsv)
{
for (int i = 0; i < ev.numGpuDevices-1; i++)
{
printf("-------------");
}
printf("\n");
}
for (int p = minGpus; p <= maxGpus; p++)
{
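// Sweep every subset of destination GPUs (excluding the source GPU) that contains exactly p members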
for (int bitmask = 0; bitmask < (1<<ev.numGpuDevices); bitmask++)
{
if (bitmask & (1<<srcIdx)) continue;
if (__builtin_popcount(bitmask) == p)
{
std::vector<Transfer> transfers;
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (bitmask & (1<<i))
{
Transfer t;
t.exeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
t.exeSubIndex = -1;
t.numSubExecs = ev.useDmaCopy ? 1 : numSubExecs;
t.numBytes = numBytesPerTransfer;
t.numSrcs = 1;
t.numDsts = 1;
t.exeIndex = srcIdx;
t.srcType.resize(1);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex.resize(1);
t.srcIndex[0] = srcIdx;
t.dstType.resize(1);
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex.resize(1);
t.dstIndex[0] = i;
transfers.push_back(t);
}
}
ExecuteTransfers(ev, 0, 0, transfers, false);
int counter = 0;
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (bitmask & (1<<i))
printf(" %8.3f %c", transfers[counter++].transferBandwidth, sep);
else if (i != srcIdx)
printf(" %c", sep);
}
printf(" %d %d", p, numSubExecs);
for (auto i = 0; i < transfers.size(); i++)
{
printf(" (%s %c%d %s)",
transfers[i].SrcToStr().c_str(),
ExeTypeStr[transfers[i].exeType], transfers[i].exeIndex,
transfers[i].DstToStr().c_str());
}
printf("\n");
}
}
}
}
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExecs, int const numCpuSubExecs, bool const isRandom)
{
ev.DisplaySweepEnvVars();
// Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets)
std::vector<std::pair<ExeType, int>> exeList;
for (auto exe : ev.sweepExe)
{
ExeType const exeType = CharToExeType(exe);
if (IsGpuType(exeType))
{
for (int exeIndex = 0; exeIndex < ev.numGpuDevices; ++exeIndex)
exeList.push_back(std::make_pair(exeType, exeIndex));
}
else if (IsCpuType(exeType))
{
for (int exeIndex = 0; exeIndex < ev.numCpuDevices; ++exeIndex)
{
// Skip NUMA nodes that have no CPUs (e.g. CXL)
if (ev.numCpusPerNuma[exeIndex] == 0) continue;
exeList.push_back(std::make_pair(exeType, exeIndex));
}
}
}
int numExes = exeList.size();
std::vector<std::pair<MemType, int>> srcList;
for (auto src : ev.sweepSrc)
{
MemType const srcType = CharToMemType(src);
int const numDevices = IsGpuType(srcType) ? ev.numGpuDevices : ev.numCpuDevices;
for (int srcIndex = 0; srcIndex < numDevices; ++srcIndex)
srcList.push_back(std::make_pair(srcType, srcIndex));
}
int numSrcs = srcList.size();
std::vector<std::pair<MemType, int>> dstList;
for (auto dst : ev.sweepDst)
{
MemType const dstType = CharToMemType(dst);
int const numDevices = IsGpuType(dstType) ? ev.numGpuDevices : ev.numCpuDevices;
for (int dstIndex = 0; dstIndex < numDevices; ++dstIndex)
dstList.push_back(std::make_pair(dstType, dstIndex));
}
int numDsts = dstList.size();
// Build array of possibilities, respecting any additional restrictions (e.g. XGMI hop count)
struct TransferInfo
{
MemType srcType; int srcIndex;
ExeType exeType; int exeIndex;
MemType dstType; int dstIndex;
};
// If the XGMI minimum is non-zero, or an XGMI maximum is specified and non-zero, then both links must be XGMI
bool const useXgmiOnly = (ev.sweepXgmiMin > 0 || ev.sweepXgmiMax > 0);
std::vector<TransferInfo> possibleTransfers;
TransferInfo tinfo;
for (int i = 0; i < numExes; ++i)
{
// Skip CPU executors if XGMI link must be used
if (useXgmiOnly && !IsGpuType(exeList[i].first)) continue;
tinfo.exeType = exeList[i].first;
tinfo.exeIndex = exeList[i].second;
bool isXgmiSrc = false;
int numHopsSrc = 0;
for (int j = 0; j < numSrcs; ++j)
{
if (IsGpuType(exeList[i].first) && IsGpuType(srcList[j].first))
{
if (exeList[i].second != srcList[j].second)
{
#if defined(__NVCC__)
isXgmiSrc = false;
#else
uint32_t exeToSrcLinkType, exeToSrcHopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false),
RemappedIndex(srcList[j].second, false),
&exeToSrcLinkType,
&exeToSrcHopCount));
isXgmiSrc = (exeToSrcLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
if (isXgmiSrc) numHopsSrc = exeToSrcHopCount;
#endif
}
else
{
isXgmiSrc = true;
numHopsSrc = 0;
}
// Skip this SRC if it is not XGMI but only XGMI links may be used
if (useXgmiOnly && !isXgmiSrc) continue;
// Skip this SRC if XGMI distance is already past limit
if (ev.sweepXgmiMax >= 0 && isXgmiSrc && numHopsSrc > ev.sweepXgmiMax) continue;
}
else if (useXgmiOnly) continue;
tinfo.srcType = srcList[j].first;
tinfo.srcIndex = srcList[j].second;
bool isXgmiDst = false;
int numHopsDst = 0;
for (int k = 0; k < numDsts; ++k)
{
if (IsGpuType(exeList[i].first) && IsGpuType(dstList[k].first))
{
if (exeList[i].second != dstList[k].second)
{
#if defined(__NVCC__)
isXgmiDst = false;
#else
uint32_t exeToDstLinkType, exeToDstHopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false),
RemappedIndex(dstList[k].second, false),
&exeToDstLinkType,
&exeToDstHopCount));
isXgmiDst = (exeToDstLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
if (isXgmiDst) numHopsDst = exeToDstHopCount;
#endif
}
else
{
isXgmiDst = true;
numHopsDst = 0;
}
}
// Skip this DST if it is not XGMI but only XGMI links may be used
if (useXgmiOnly && !isXgmiDst) continue;
// Skip this DST if total XGMI distance (SRC + DST) is less than min limit
if (ev.sweepXgmiMin > 0 && (numHopsSrc + numHopsDst < ev.sweepXgmiMin)) continue;
// Skip this DST if total XGMI distance (SRC + DST) is greater than max limit
if (ev.sweepXgmiMax >= 0 && (numHopsSrc + numHopsDst) > ev.sweepXgmiMax) continue;
#if defined(__NVCC__)
// Skip CPU executors accessing GPU memory on NVIDIA platforms
if (IsCpuType(exeList[i].first) && (IsGpuType(srcList[j].first) || IsGpuType(dstList[k].first)))
continue;
#endif
tinfo.dstType = dstList[k].first;
tinfo.dstIndex = dstList[k].second;
possibleTransfers.push_back(tinfo);
}
}
}
int const numPossible = (int)possibleTransfers.size();
int maxParallelTransfers = (ev.sweepMax == 0 ? numPossible : ev.sweepMax);
if (ev.sweepMin > numPossible)
{
printf("No valid test configurations exist\n");
return;
}
if (ev.outputToCsv)
{
printf("\nTest#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),"
"ExeToSrcLinkType,ExeToDstLinkType,SrcAddr,DstAddr\n");
}
int numTestsRun = 0;
int M = ev.sweepMin;
std::uniform_int_distribution<int> randSize(1, numBytesPerTransfer / sizeof(float));
std::uniform_int_distribution<int> distribution(ev.sweepMin, maxParallelTransfers);
// Log sweep to configuration file
FILE *fp = fopen("lastSweep.cfg", "w");
if (!fp)
{
printf("[ERROR] Unable to open lastSweep.cfg. Check permissions\n");
exit(1);
}
// Create bitmask of numPossible triplets, of which M will be chosen
std::string bitmask(M, 1); bitmask.resize(numPossible, 0);
auto cpuStart = std::chrono::high_resolution_clock::now();
while (1)
{
if (isRandom)
{
// Pick random number of simultaneous transfers to execute
// NOTE: This currently skews distribution due to some #s having more possibilities than others
M = distribution(*ev.generator);
// Generate a random bitmask
for (int i = 0; i < numPossible; i++)
bitmask[i] = (i < M) ? 1 : 0;
std::shuffle(bitmask.begin(), bitmask.end(), *ev.generator);
}
// Convert bitmask to list of Transfers
std::vector<Transfer> transfers;
for (int value = 0; value < numPossible; ++value)
{
if (bitmask[value])
{
// Convert integer value to (SRC->EXE->DST) triplet
Transfer transfer;
transfer.numSrcs = 1;
transfer.numDsts = 1;
transfer.srcType = {possibleTransfers[value].srcType};
transfer.srcIndex = {possibleTransfers[value].srcIndex};
transfer.exeType = possibleTransfers[value].exeType;
transfer.exeIndex = possibleTransfers[value].exeIndex;
transfer.exeSubIndex = -1;
transfer.dstType = {possibleTransfers[value].dstType};
transfer.dstIndex = {possibleTransfers[value].dstIndex};
transfer.numSubExecs = IsGpuType(transfer.exeType) ? numGpuSubExecs : numCpuSubExecs;
transfer.numBytes = ev.sweepRandBytes ? randSize(*ev.generator) * sizeof(float) : 0;
transfers.push_back(transfer);
}
}
LogTransfers(fp, ++numTestsRun, transfers);
ExecuteTransfers(ev, numTestsRun, numBytesPerTransfer / sizeof(float), transfers);
// Check for test limit
if (numTestsRun == ev.sweepTestLimit)
{
printf("Test limit reached\n");
break;
}
// Check for time limit
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double totalCpuTime = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
if (ev.sweepTimeLimit && totalCpuTime > ev.sweepTimeLimit)
{
printf("Time limit exceeded\n");
break;
}
// Increment bitmask if not random sweep
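// std::prev_permutation over the descending (1...1,0...0) bitmask visits every placement of M ones,
// i.e. every combination of M parallel Transfers; once exhausted, grow M by one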
if (!isRandom && !std::prev_permutation(bitmask.begin(), bitmask.end()))
{
M++;
// Check for completion
if (M > maxParallelTransfers)
{
printf("Sweep complete\n");
break;
}
for (int i = 0; i < numPossible; i++)
bitmask[i] = (i < M) ? 1 : 0;
}
}
fclose(fp);
}
void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers)
{
fprintf(fp, "# Test %d\n", testNum);
fprintf(fp, "%d", -1 * (int)transfers.size());
for (auto const& transfer : transfers)
{
fprintf(fp, " (%c%d->%c%d->%c%d %d %lu)",
MemTypeStr[transfer.srcType[0]], transfer.srcIndex[0],
ExeTypeStr[transfer.exeType], transfer.exeIndex,
MemTypeStr[transfer.dstType[0]], transfer.dstIndex[0],
transfer.numSubExecs,
transfer.numBytes);
}
fprintf(fp, "\n");
fflush(fp);
}
std::string PtrVectorToStr(std::vector<float*> const& strVector, int const initOffset)
{
std::stringstream ss;
for (int i = 0; i < strVector.size(); ++i)
{
if (i) ss << " ";
ss << (strVector[i] + initOffset);
}
return ss.str();
}
void ReportResults(EnvVars const& ev, std::vector<Transfer> const& transfers, TestResults const results)
{
char sep = ev.outputToCsv ? ',' : '|';
size_t numTimedIterations = results.numTimedIterations;
// Loop over each executor
for (auto exeInfoPair : results.exeResults) {
ExeResult const& exeResult = exeInfoPair.second;
ExeType exeType = exeInfoPair.first.first;
int exeIndex = exeInfoPair.first.second;
printf(" Executor: %3s %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %-7.3f GB/s (sum)\n",
ExeTypeName[exeType], exeIndex, sep, exeResult.bandwidthGbs, sep,
exeResult.durationMsec, sep, exeResult.totalBytes, sep, exeResult.sumBandwidthGbs);
for (int idx : exeResult.transferIdx) {
Transfer const& t = transfers[idx];
char exeSubIndexStr[32] = "";
if (ev.useXccFilter || t.exeType == EXE_GPU_DMA) {
if (t.exeSubIndex == -1)
sprintf(exeSubIndexStr, ".*");
else
sprintf(exeSubIndexStr, ".%d", t.exeSubIndex);
}
printf(" Transfer %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %s -> %s%02d%s:%03d -> %s\n",
t.transferIndex, sep,
t.transferBandwidth, sep,
t.transferTime, sep,
t.numBytesActual, sep,
t.SrcToStr().c_str(),
ExeTypeName[t.exeType], t.exeIndex,
exeSubIndexStr,
t.numSubExecs,
t.DstToStr().c_str());
// Show per-iteration timing information
if (ev.showIterations) {
std::set<std::pair<double, int>> times;
double stdDevTime = 0;
double stdDevBw = 0;
for (int i = 0; i < numTimedIterations; i++) {
times.insert(std::make_pair(t.perIterationTime[i], i+1));
double const varTime = fabs(t.transferTime - t.perIterationTime[i]);
stdDevTime += varTime * varTime;
double iterBandwidthGbs = (t.numBytesActual / 1.0E9) / t.perIterationTime[i] * 1000.0f;
double const varBw = fabs(iterBandwidthGbs - t.transferBandwidth);
stdDevBw += varBw * varBw;
}
stdDevTime = sqrt(stdDevTime / numTimedIterations);
stdDevBw = sqrt(stdDevBw / numTimedIterations);
for (auto time : times) {
double iterDurationMsec = time.first;
double iterBandwidthGbs = (t.numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d %c %7.3f GB/s %c %8.3f ms %c", time.second, sep, iterBandwidthGbs, sep, iterDurationMsec, sep);
std::set<int> usedXccs;
if (time.second - 1 < t.perIterationCUs.size()) {
printf(" CUs:");
for (auto x : t.perIterationCUs[time.second - 1]) {
printf(" %02d:%02d", x.first, x.second);
usedXccs.insert(x.first);
}
}
printf(" XCCs:");
for (auto x : usedXccs)
printf(" %02d", x);
printf("\n");
}
printf(" StandardDev %c %7.3f GB/s %c %8.3f ms %c\n", sep, stdDevBw, sep, stdDevTime, sep);
}
}
}
printf(" Aggregate (CPU) %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c Overhead: %.3f ms\n", sep,
results.totalBandwidthCpu, sep, results.totalDurationMsec, sep, results.totalBytesTransferred, sep, results.overheadMsec);
}
void RunHealthCheck(EnvVars ev)
{
// Check for supported platforms
#if defined(__NVCC__)
printf("[WARN] healthcheck preset not supported on NVIDIA hardware\n");
return;
#else
bool hasFail = false;
// Force use of single stream
ev.useSingleStream = 1;
for (int gpuId = 0; gpuId < ev.numGpuDevices; gpuId++) {
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, gpuId));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
if (!(archName == "gfx940" || archName == "gfx941" || archName == "gfx942"))
{
printf("[WARN] healthcheck preset is currently only supported on MI300 series hardware\n");
exit(1);
}
}
// Pass limits
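// Defaults are 95% of the nominal targets (48/96/45 GB/s for unidirectional/bidirectional/all-to-all),
// truncated to whole GB/s; override via the LIMIT_UDIR, LIMIT_BDIR and LIMIT_A2A environment variables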
double udirLimit = getenv("LIMIT_UDIR") ? atof(getenv("LIMIT_UDIR")) : (int)(48 * 0.95);
double bdirLimit = getenv("LIMIT_BDIR") ? atof(getenv("LIMIT_BDIR")) : (int)(96 * 0.95);
double a2aLimit = getenv("LIMIT_A2A") ? atof(getenv("LIMIT_A2A")) : (int)(45 * 0.95);
// Run CPU to GPU
// Run unidirectional read from CPU to GPU
printf("Testing unidirectional reads from CPU ");
{
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < ev.numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeType = EXE_GPU_GFX;
t.exeIndex = gpuId;
t.numBytes = 64*1024*1024;
t.numBytesActual = 64*1024*1024;
t.numSrcs = 1;
t.srcType.push_back(MEM_CPU);
t.srcIndex.push_back(GetClosestNumaNode(gpuId));
t.numDsts = 0;
t.dstType.clear();
t.dstIndex.clear();
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t.numSubExecs = cu;
TestResults testResults = ExecuteTransfersImpl(ev, transfers);
bestResult = std::max(bestResult, t.transferBandwidth);
if (t.transferBandwidth >= udirLimit) {
passed = true;
break;
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
}
}
}
// Run unidirectional write from GPU to CPU
printf("Testing unidirectional writes to CPU ");
{
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < ev.numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeType = EXE_GPU_GFX;
t.exeIndex = gpuId;
t.numBytes = 64*1024*1024;
t.numBytesActual = 64*1024*1024;
t.numDsts = 1;
t.dstType.push_back(MEM_CPU);
t.dstIndex.push_back(GetClosestNumaNode(gpuId));
t.numSrcs = 0;
t.srcType.clear();
t.srcIndex.clear();
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t.numSubExecs = cu;
TestResults testResults = ExecuteTransfersImpl(ev, transfers);
bestResult = std::max(bestResult, t.transferBandwidth);
if (t.transferBandwidth >= udirLimit) {
passed = true;
break;
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
}
}
}
// Run bidirectional tests
printf("Testing bidirectional reads + writes ");
{
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < ev.numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
std::vector<Transfer> transfers(2);
Transfer& t0 = transfers[0];
Transfer& t1 = transfers[1];
t0.exeType = EXE_GPU_GFX;
t0.exeIndex = gpuId;
t0.numBytes = 64*1024*1024;
t0.numBytesActual = 64*1024*1024;
t0.numSrcs = 1;
t0.srcType.push_back(MEM_CPU);
t0.srcIndex.push_back(GetClosestNumaNode(gpuId));
t0.numDsts = 0;
t0.dstType.clear();
t0.dstIndex.clear();
t1.exeType = EXE_GPU_GFX;
t1.exeIndex = gpuId;
t1.numBytes = 64*1024*1024;
t1.numBytesActual = 64*1024*1024;
t1.numDsts = 1;
t1.dstType.push_back(MEM_CPU);
t1.dstIndex.push_back(GetClosestNumaNode(gpuId));
t1.numSrcs = 0;
t1.srcType.clear();
t1.srcIndex.clear();
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t0.numSubExecs = cu;
t1.numSubExecs = cu;
TestResults testResults = ExecuteTransfersImpl(ev, transfers);
double sum = t0.transferBandwidth + t1.transferBandwidth;
bestResult = std::max(bestResult, sum);
if (sum >= bdirLimit) {
passed = true;
break;
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, bdirLimit);
}
}
}
// Run XGMI tests:
printf("Testing all-to-all XGMI copies "); fflush(stdout);
{
ev.gfxUnroll = 2;
std::vector<Transfer> transfers;
for (int i = 0; i < ev.numGpuDevices; i++) {
for (int j = 0; j < ev.numGpuDevices; j++) {
if (i == j) continue;
Transfer t;
t.exeType = EXE_GPU_GFX;
t.exeIndex = i;
t.numBytes = t.numBytesActual = 64*1024*1024;
t.numSrcs = 1;
t.numDsts = 1;
t.numSubExecs = 8;
t.srcType.push_back(MEM_GPU_FINE);
t.dstType.push_back(MEM_GPU_FINE);
t.srcIndex.push_back(i);
t.dstIndex.push_back(j);
transfers.push_back(t);
}
}
TestResults testResults = ExecuteTransfersImpl(ev, transfers);
std::vector<std::pair<std::pair<int,int>, double>> fails;
int transferIdx = 0;
for (int i = 0; i < ev.numGpuDevices; i++) {
printf("."); fflush(stdout);
for (int j = 0; j < ev.numGpuDevices; j++) {
if (i == j) continue;
Transfer const& t = transfers[transferIdx];
if (t.transferBandwidth < a2aLimit) {
fails.push_back(std::make_pair(std::make_pair(i,j), t.transferBandwidth));
}
transferIdx++;
}
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d to GPU %02d: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first.first, p.first.second, p.second, a2aLimit);
}
}
}
exit(hasFail ? 1 : 0);
#endif
}
/*
Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "Client.hpp"
#include "Presets.hpp"
#include "Topology.hpp"
#include <fstream>
int main(int argc, char **argv) {
// Collect environment variables
EnvVars ev;
// Display usage instructions and detected topology
if (argc <= 1) {
if (!ev.outputToCsv) {
DisplayUsage(argv[0]);
DisplayPresets();
}
DisplayTopology(ev.outputToCsv);
exit(0);
}
// Determine number of bytes to run per Transfer
size_t numBytesPerTransfer = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_TRANSFER;
if (argc > 2) {
// Adjust bytes if unit specified
char units = argv[2][strlen(argv[2])-1];
switch (units) {
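// Cases intentionally fall through: 'G' scales by 1024^3, 'M' by 1024^2, 'K' by 1024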
case 'G': case 'g': numBytesPerTransfer *= 1024;
case 'M': case 'm': numBytesPerTransfer *= 1024;
case 'K': case 'k': numBytesPerTransfer *= 1024;
}
}
if (numBytesPerTransfer % 4) {
printf("[ERROR] numBytesPerTransfer (%lu) must be a multiple of 4\n", numBytesPerTransfer);
exit(1);
}
// Run preset benchmark if requested
if (RunPreset(ev, numBytesPerTransfer, argc, argv)) exit(0);
// Read input from command line or configuration file
std::vector<std::string> lines;
{
std::string line;
if (!strcmp(argv[1], "cmdline")) {
for (int i = 3; i < argc; i++)
line += std::string(argv[i]) + " ";
lines.push_back(line);
} else {
std::ifstream cfgFile(argv[1]);
if (!cfgFile.is_open()) {
printf("[ERROR] Unable to open transfer configuration file: [%s]\n", argv[1]);
exit(1);
}
while (std::getline(cfgFile, line))
lines.push_back(line);
cfgFile.close();
}
}
// Print environment variables and CSV header
ev.DisplayEnvVars();
if (ev.outputToCsv)
printf("Test#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),SrcAddr,DstAddr\n");
TransferBench::ConfigOptions cfgOptions = ev.ToConfigOptions();
TransferBench::TestResults results;
std::vector<ErrResult> errors;
// Process each line as a Test
int testNum = 0;
for (std::string const &line : lines) {
// Check if line is a comment to be echoed to output (starts with ##)
if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') printf("%s\n", line.c_str());
// Parse set of parallel Transfers to execute
std::vector<Transfer> transfers;
CheckForError(TransferBench::ParseTransfers(line, transfers));
if (transfers.empty()) continue;
// Check for variable sub-executors Transfers
int numVariableTransfers = 0;
int maxVarCount = 0;
{
std::map<ExeDevice, int> varTransferCount;
for (auto const& t : transfers) {
if (t.numSubExecs == 0) {
if (t.exeDevice.exeType != EXE_GPU_GFX) {
printf("[ERROR] Variable number of subexecutors is only supported on GFX executors\n");
exit(1);
}
numVariableTransfers++;
varTransferCount[t.exeDevice]++;
maxVarCount = std::max(maxVarCount, varTransferCount[t.exeDevice]);
}
}
if (numVariableTransfers > 0 && numVariableTransfers != transfers.size()) {
printf("[ERROR] All or none of the Transfers in the Test must use variable number of Subexecutors\n");
exit(1);
}
}
// Run the specified numbers of bytes otherwise generate a range of values
for (size_t bytes = (1<<10); bytes <= (1<<29); bytes *= 2) {
size_t deltaBytes = std::max(1UL, bytes / ev.samplingFactor);
size_t currBytes = (numBytesPerTransfer == 0) ? bytes : numBytesPerTransfer;
do {
for (auto& t : transfers)
t.numBytes = currBytes;
if (maxVarCount == 0) {
if (TransferBench::RunTransfers(cfgOptions, transfers, results)) {
PrintResults(ev, ++testNum, transfers, results);
}
PrintErrors(results.errResults);
} else {
// Variable subexecutors - Determine how many subexecutors to sweep up to
int maxNumVarSubExec = ev.maxNumVarSubExec;
if (maxNumVarSubExec == 0) {
maxNumVarSubExec = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0}) / maxVarCount;
}
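// Sweep the variable Transfers over [minNumVarSubExec, maxNumVarSubExec] subexecutors and
// keep the run with the highest aggregate bandwidth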
TransferBench::TestResults bestResults;
std::vector<Transfer> bestTransfers;
for (int numSubExecs = ev.minNumVarSubExec; numSubExecs <= maxNumVarSubExec; numSubExecs++) {
std::vector<Transfer> tempTransfers = transfers;
for (auto& t : tempTransfers) {
if (t.numSubExecs == 0) t.numSubExecs = numSubExecs;
}
TransferBench::TestResults tempResults;
if (!TransferBench::RunTransfers(cfgOptions, tempTransfers, tempResults)) {
PrintErrors(tempResults.errResults);
} else {
if (tempResults.avgTotalBandwidthGbPerSec > bestResults.avgTotalBandwidthGbPerSec) {
bestResults = tempResults;
bestTransfers = tempTransfers;
}
}
}
PrintResults(ev, ++testNum, bestTransfers, bestResults);
PrintErrors(bestResults.errResults);
}
if (numBytesPerTransfer != 0) break;
currBytes += deltaBytes;
} while (currBytes < bytes * 2);
if (numBytesPerTransfer != 0) break;
}
}
}
void DisplayUsage(char const* cmdName)
{
printf("TransferBench v%s.%s\n", TransferBench::VERSION, CLIENT_VERSION);
printf("========================================\n");
if (numa_available() == -1) {
printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
exit(1);
}
printf("Usage: %s config <N>\n", cmdName);
printf(" config: Either:\n");
printf(" - Filename of configFile containing Transfers to execute (see example.cfg for format)\n");
printf(" - Name of preset config:\n");
printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
DEFAULT_BYTES_PER_TRANSFER);
printf(" If 0 is specified, a range of Ns will be benchmarked\n");
printf(" May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n");
printf("\n");
EnvVars::DisplayUsage();
}
std::string MemDevicesToStr(std::vector<MemDevice> const& memDevices) {
if (memDevices.empty()) return "N";
std::stringstream ss;
for (auto const& m : memDevices)
ss << TransferBench::MemTypeStr[m.memType] << m.memIndex;
return ss.str();
}
void PrintResults(EnvVars const& ev, int const testNum,
std::vector<Transfer> const& transfers,
TransferBench::TestResults const& results)
{
char sep = ev.outputToCsv ? ',' : '|';
size_t numTimedIterations = results.numTimedIterations;
if (!ev.outputToCsv) printf("Test %d:\n", testNum);
// Loop over each executor
for (auto exeInfoPair : results.exeResults) {
ExeDevice const& exeDevice = exeInfoPair.first;
ExeResult const& exeResult = exeInfoPair.second;
ExeType const exeType = exeDevice.exeType;
int32_t const exeIndex = exeDevice.exeIndex;
printf(" Executor: %3s %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %-7.3f GB/s (sum)\n",
ExeTypeName[exeType], exeIndex, sep, exeResult.avgBandwidthGbPerSec, sep,
exeResult.avgDurationMsec, sep, exeResult.numBytes, sep, exeResult.sumBandwidthGbPerSec);
// Loop over each executor
for (int idx : exeResult.transferIdx) {
Transfer const& t = transfers[idx];
TransferResult const& r = results.tfrResults[idx];
char exeSubIndexStr[32] = "";
if (t.exeSubIndex != -1)
sprintf(exeSubIndexStr, ".%d", t.exeSubIndex);
printf(" Transfer %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %s -> %s%02d%s:%03d -> %s\n",
idx, sep,
r.avgBandwidthGbPerSec, sep,
r.avgDurationMsec, sep,
r.numBytes, sep,
MemDevicesToStr(t.srcs).c_str(), ExeTypeName[exeType], exeIndex,
exeSubIndexStr, t.numSubExecs, MemDevicesToStr(t.dsts).c_str());
// Show per-iteration timing information
if (ev.showIterations) {
// Check that per-iteration information exists
if (r.perIterMsec.size() != numTimedIterations) {
printf("[ERROR] Per iteration timing data unavailable: Expected %lu data points, but have %lu\n",
numTimedIterations, r.perIterMsec.size());
exit(1);
}
// Compute standard deviation and track iterations by speed
std::set<std::pair<double, int>> times;
double stdDevTime = 0;
double stdDevBw = 0;
for (int i = 0; i < numTimedIterations; i++) {
times.insert(std::make_pair(r.perIterMsec[i], i+1));
double const varTime = fabs(r.avgDurationMsec - r.perIterMsec[i]);
stdDevTime += varTime * varTime;
double iterBandwidthGbs = (t.numBytes / 1.0E9) / r.perIterMsec[i] * 1000.0f;
double const varBw = fabs(iterBandwidthGbs - r.avgBandwidthGbPerSec);
stdDevBw += varBw * varBw;
}
stdDevTime = sqrt(stdDevTime / numTimedIterations);
stdDevBw = sqrt(stdDevBw / numTimedIterations);
// Loop over iterations (fastest to slowest)
for (auto& time : times) {
double iterDurationMsec = time.first;
double iterBandwidthGbs = (t.numBytes / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d %c %7.3f GB/s %c %8.3f ms %c", time.second, sep, iterBandwidthGbs, sep, iterDurationMsec, sep);
std::set<int> usedXccs;
if (time.second - 1 < r.perIterCUs.size()) {
printf(" CUs:");
for (auto x : r.perIterCUs[time.second - 1]) {
printf(" %02d:%02d", x.first, x.second);
usedXccs.insert(x.first);
}
}
printf(" XCCs:");
for (auto x : usedXccs)
printf(" %02d", x);
printf("\n");
}
printf(" StandardDev %c %7.3f GB/s %c %8.3f ms %c\n", sep, stdDevBw, sep, stdDevTime, sep);
}
}
}
printf(" Aggregate (CPU) %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c Overhead: %.3f ms\n",
sep, results.avgTotalBandwidthGbPerSec,
sep, results.avgTotalDurationMsec,
sep, results.totalBytesTransferred,
sep, results.overheadMsec);
}
void CheckForError(ErrResult const& error)
{
switch (error.errType) {
case ERR_NONE: return;
case ERR_WARN:
printf("[WARN] %s\n", error.errMsg.c_str());
return;
case ERR_FATAL:
printf("[ERROR] %s\n", error.errMsg.c_str());
exit(1);
default:
break;
}
}
void PrintErrors(std::vector<ErrResult> const& errors)
{
bool isFatal = false;
for (auto const& err : errors) {
printf("[%s] %s\n", err.errType == ERR_FATAL ? "ERROR" : "WARN", err.errMsg.c_str());
isFatal |= (err.errType == ERR_FATAL);
}
if (isFatal) exit(1);
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
// TransferBench client version
#define CLIENT_VERSION "00"
#include "TransferBench.hpp"
#include "EnvVars.hpp"
size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<26);
char const ExeTypeName[4][4] = {"CPU", "GPU", "DMA", "IBV"};
// Display detected hardware
void DisplayTopology(bool outputToCsv);
// Display usage instructions
void DisplayUsage(char const* cmdName);
// Print TransferBench test results
void PrintResults(EnvVars const& ev, int const testNum,
std::vector<Transfer> const& transfers,
TransferBench::TestResults const& results);
// Helper function that converts MemDevices to a string
std::string MemDevicesToStr(std::vector<MemDevice> const& memDevices);
// Helper function to print warning / exit on fatal error
void CheckForError(ErrResult const& error);
// Helper function to print list of errors
void PrintErrors(std::vector<ErrResult> const& errors);
/*
Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef ENVVARS_HPP
#define ENVVARS_HPP
// Helper macro for catching HIP errors
#define HIP_CALL(cmd) \
do { \
hipError_t error = (cmd); \
if (error != hipSuccess) { \
std::cerr << "Encountered HIP error (" << hipGetErrorString(error) \
<< ") at line " << __LINE__ << " in file " << __FILE__ << "\n"; \
exit(-1); \
} \
} while (0)
#include <algorithm>
#include <iostream>
#include <numa.h>
#include <random>
#include <time.h>
#include "Client.hpp"
#include "TransferBench.hpp"
using namespace TransferBench;
// Redefinitions for CUDA compatibility
//==========================================================================================
#if defined(__NVCC__)
#define hipError_t cudaError_t
#define hipGetErrorString cudaGetErrorString
#define hipDeviceProp_t cudaDeviceProp
#define hipDeviceGetPCIBusId cudaDeviceGetPCIBusId
#define hipGetDeviceProperties cudaGetDeviceProperties
#define hipSuccess cudaSuccess
#define gcnArchName name
#define hipGetDeviceCount cudaGetDeviceCount
#endif
// This class manages the environment variables that affect TransferBench
class EnvVars
{
public:
// Default configuration values
int const DEFAULT_SAMPLING_FACTOR = 1;
// Environment variables
// General options
int numIterations; // Number of timed iterations to perform. If negative, run for -numIterations seconds instead
int numSubIterations; // Number of subiterations to perform
int numWarmups; // Number of un-timed warmup iterations to perform
int showIterations; // Show per-iteration timing info
int useInteractive; // Pause for user-input before starting transfer loop
// Data options
int alwaysValidate; // Validate after each iteration instead of once after all iterations
int blockBytes; // Each subexecutor, except the last, gets a multiple of this many bytes to copy
int byteOffset; // Byte-offset for memory allocations
vector<float> fillPattern; // Pattern of floats used to fill source data
int validateDirect; // Validate GPU destination memory directly instead of staging GPU memory on host
int validateSource; // Validate source GPU memory immediately after preparation
// DMA options
int useHsaDma; // Use hsa_amd_async_copy instead of hipMemcpy for non-targeted DMA executions
// GFX options
int gfxBlockSize; // Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask; // Bit-vector representing the CU mask
vector<vector<int>> prefXccTable; // Specifies XCC to use for given exe->dst pair
int gfxUnroll; // GFX-kernel unroll factor
int useHipEvents; // Use HIP events for timing GFX/DMA Executor
int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer
int gfxSingleTeam; // Team all subExecutors across the data array
int gfxWaveOrder; // GFX-kernel wavefront ordering
// Client options
int hideEnv; // Skip printing environment variable listing
int minNumVarSubExec; // Minimum # of subexecutors to use for variable subExec Transfers
int maxNumVarSubExec; // Maximum # of subexecutors to use for variable subExec Transfers (0 to use device limit)
int outputToCsv; // Output in CSV format
int samplingFactor; // Affects how many different values of N are generated (when N set to 0)
// Developer features
int gpuMaxHwQueues; // Tracks GPU_MAX_HW_QUEUES environment variable
// Constructor that collects values
EnvVars()
{
int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
int numDeviceCUs = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0});
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, 0));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
// Different hardware picks different default GPU kernel unroll factors
// The performance difference is generally only noticeable when executing with fewer CUs
int defaultGfxUnroll = 4;
if (archName == "gfx906") defaultGfxUnroll = 8;
else if (archName == "gfx90a") defaultGfxUnroll = 8;
else if (archName == "gfx940") defaultGfxUnroll = 6;
else if (archName == "gfx941") defaultGfxUnroll = 6;
else if (archName == "gfx942") defaultGfxUnroll = 4;
alwaysValidate = GetEnvVar("ALWAYS_VALIDATE" , 0);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256);
gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 0);
gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll);
gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER" , 0);
hideEnv = GetEnvVar("HIDE_ENV" , 0);
minNumVarSubExec = GetEnvVar("MIN_VAR_SUBEXEC" , 1);
maxNumVarSubExec = GetEnvVar("MAX_VAR_SUBEXEC" , 0);
numIterations = GetEnvVar("NUM_ITERATIONS" , 10);
numSubIterations = GetEnvVar("NUM_SUBITERATIONS" , 1);
numWarmups = GetEnvVar("NUM_WARMUPS" , 3);
outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0);
samplingFactor = GetEnvVar("SAMPLING_FACTOR" , 1);
showIterations = GetEnvVar("SHOW_ITERATIONS" , 0);
useHipEvents = GetEnvVar("USE_HIP_EVENTS" , 1);
useHsaDma = GetEnvVar("USE_HSA_DMA" , 0);
useInteractive = GetEnvVar("USE_INTERACTIVE" , 0);
useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 1);
validateDirect = GetEnvVar("VALIDATE_DIRECT" , 0);
validateSource = GetEnvVar("VALIDATE_SOURCE" , 0);
gpuMaxHwQueues = GetEnvVar("GPU_MAX_HW_QUEUES" , 4);
// Check for fill pattern
char* pattern = getenv("FILL_PATTERN");
if (pattern != NULL) {
int patternLen = strlen(pattern);
if (patternLen % 2) {
printf("[ERROR] FILL_PATTERN must contain an even-number of hex digits\n");
exit(1);
}
// Read in bytes
std::vector<unsigned char> bytes;
unsigned char val = 0;
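// Each pair of hex characters forms one byte: the first character supplies the high nibble, the second the low nibble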
for (int i = 0; i < patternLen; i++) {
if ('0' <= pattern[i] && pattern[i] <= '9')
val += (pattern[i] - '0');
else if ('A' <= pattern[i] && pattern[i] <= 'F')
val += (pattern[i] - 'A' + 10);
else if ('a' <= pattern[i] && pattern[i] <= 'f')
val += (pattern[i] - 'a' + 10);
else {
printf("[ERROR] FILL_PATTERN must contain an even-number of hex digits (0-9'/a-f/A-F). (not %c)\n", pattern[i]);
exit(1);
}
if (i % 2 == 0)
val <<= 4;
else {
bytes.push_back(val);
val = 0;
}
}
// Reverse bytes (input is assumed to be given in big-endian)
std::reverse(bytes.begin(), bytes.end());
// Figure out how many copies of the pattern are necessary to fill a 4-byte float properly
int copies;
switch (patternLen % 8) {
case 0: copies = 1; break;
case 4: copies = 2; break;
default: copies = 4; break;
}
// Fill floats
int numFloats = copies * patternLen / 8;
fillPattern.resize(numFloats);
unsigned char* rawData = (unsigned char*) fillPattern.data();
for (int i = 0; i < numFloats * 4; i++)
rawData[i] = bytes[i % bytes.size()];
}
else fillPattern.clear();
// Check for CU mask
int numXccs = TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, 0});
cuMask.clear();
char* cuMaskStr = getenv("CU_MASK");
if (cuMaskStr != NULL) {
#if defined(__NVCC__)
printf("[WARN] CU_MASK is not supported in CUDA\n");
#else
std::vector<std::pair<int, int>> ranges;
int maxCU = 0;
char* token = strtok(cuMaskStr, ",");
while (token) {
int start, end;
if (sscanf(token, "%d-%d", &start, &end) == 2) {
ranges.push_back(std::make_pair(std::min(start, end), std::max(start, end)));
maxCU = std::max(maxCU, std::max(start, end));
} else if (sscanf(token, "%d", &start) == 1) {
ranges.push_back(std::make_pair(start, start));
maxCU = std::max(maxCU, start);
} else {
printf("[ERROR] Unrecognized token [%s]\n", token);
exit(1);
}
token = strtok(NULL, ",");
}
cuMask.resize(2 * numXccs, 0);
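// Allocate two 32-bit words per XCC (room for 64 CUs), then set the bit for every XCC instance of each requested CU (bit index = cu * numXccs + xcc)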
for (auto range : ranges) {
for (int i = range.first; i <= range.second; i++) {
for (int x = 0; x < numXccs; x++) {
int targetBit = i * numXccs + x;
cuMask[targetBit/32] |= (1<<(targetBit%32));
}
}
}
#endif
}
// Parse preferred XCC table (if provided)
char* prefXccStr = getenv("XCC_PREF_TABLE");
if (prefXccStr) {
prefXccTable.resize(numDetectedGpus);
for (int i = 0; i < numDetectedGpus; i++){
prefXccTable[i].resize(numDetectedGpus, -1);
}
char* token = strtok(prefXccStr, ",");
int tokenCount = 0;
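// Tokens fill a (numDetectedGpus x numDetectedGpus) table in row-major order; entry [src][dst] is the preferred XCC for that executor->destination pair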
while (token) {
int xccId;
if (sscanf(token, "%d", &xccId) == 1) {
int src = tokenCount / numDetectedGpus;
int dst = tokenCount % numDetectedGpus;
if (xccId < 0 || xccId >= numXccs) {
printf("[ERROR] XCC index (%d) out of bounds. Expect value less than %d\n", xccId, numXccs);
exit(1);
}
prefXccTable[src][dst] = xccId;
tokenCount++;
if (tokenCount == (numDetectedGpus * numDetectedGpus)) break;
} else {
printf("[ERROR] Unrecognized token [%s]\n", token);
exit(1);
}
token = strtok(NULL, ",");
}
}
}
// Display info on the env vars that can be used
static void DisplayUsage()
{
printf("Environment variables:\n");
printf("======================\n");
printf(" ALWAYS_VALIDATE - Validate after each iteration instead of once after all iterations\n");
printf(" BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n");
printf(" BLOCK_BYTES - Controls granularity of how work is divided across subExecutors\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4\n");
printf(" CU_MASK - CU mask for streams. Can specify ranges e.g '5,10-12,14'\n");
printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n");
printf(" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d\n", TransferBench::GetIntAttribute(ATR_GFX_MAX_UNROLL));
printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays\n");
printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n");
printf(" HIDE_ENV - Hide environment variable value listing\n");
printf(" MIN_VAR_SUBEXEC - Minumum # of subexecutors to use for variable subExec Transfers\n");
printf(" MAX_VAR_SUBEXEC - Maximum # of subexecutors to use for variable subExec Transfers (0 for device limits)\n");
printf(" NUM_ITERATIONS - # of timed iterations per test. If negative, run for this many seconds instead\n");
printf(" NUM_SUBITERATIONS - # of sub-iterations to run per iteration. Must be non-negative\n");
printf(" NUM_WARMUPS - # of untimed warmup iterations per test\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
printf(" SAMPLING_FACTOR - Add this many samples (when possible) between powers of 2 when auto-generating data sizes\n");
printf(" SHOW_ITERATIONS - Show per-iteration timing info\n");
printf(" USE_HIP_EVENTS - Use HIP events for GFX executor timing\n");
printf(" USE_HSA_DMA - Use hsa_amd_async_copy instead of hipMemcpy for non-targeted DMA execution\n");
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer\n");
printf(" VALIDATE_DIRECT - Validate GPU destination memory directly instead of staging GPU memory on host\n");
printf(" VALIDATE_SOURCE - Validate GPU src memory immediately after preparation\n");
}
void Print(std::string const& name, int32_t const value, const char* format, ...) const
{
printf("%-20s%s%12d%s", name.c_str(), outputToCsv ? "," : " = ", value, outputToCsv ? "," : " : ");
va_list args;
va_start(args, format);
vprintf(format, args);
va_end(args);
printf("\n");
}
void Print(std::string const& name, std::string const& value, const char* format, ...) const
{
printf("%-20s%s%12s%s", name.c_str(), outputToCsv ? "," : " = ", value.c_str(), outputToCsv ? "," : " : ");
va_list args;
va_start(args, format);
vprintf(format, args);
va_end(args);
printf("\n");
}
// Display env var settings
void DisplayEnvVars() const
{
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
if (!outputToCsv) {
printf("TransferBench v%s.%s\n", TransferBench::VERSION, CLIENT_VERSION);
printf("===============================================================\n");
if (!hideEnv) printf("[Common] (Suppress by setting HIDE_ENV=1)\n");
}
else if (!hideEnv)
printf("EnvVar,Value,Description,(TransferBench Client v%s Backend v%s)\n", CLIENT_VERSION, TransferBench::VERSION);
if (hideEnv) return;
Print("ALWAYS_VALIDATE", alwaysValidate,
"Validating after %s", (alwaysValidate ? "each iteration" : "all iterations"));
Print("BLOCK_BYTES", blockBytes,
"Each CU gets a mulitple of %d bytes to copy", blockBytes);
Print("BYTE_OFFSET", byteOffset,
"Using byte offset of %d", byteOffset);
Print("CU_MASK", getenv("CU_MASK") ? 1 : 0,
"%s", (cuMask.size() ? GetCuMaskDesc().c_str() : "All"));
Print("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
"%s", (fillPattern.size() ? getenv("FILL_PATTERN") : TransferBench::GetStrAttribute(ATR_SRC_PREP_DESCRIPTION).c_str()));
Print("GFX_BLOCK_SIZE", gfxBlockSize,
"Threadblock size of %d", gfxBlockSize);
Print("GFX_SINGLE_TEAM", gfxSingleTeam,
"%s", (gfxSingleTeam ? "Combining CUs to work across entire data array" :
"Each CUs operates on its own disjoint subarray"));
Print("GFX_UNROLL", gfxUnroll,
"Using GFX unroll factor of %d", gfxUnroll);
Print("GFX_WAVE_ORDER", gfxWaveOrder,
"Using GFX wave ordering of %s", (gfxWaveOrder == 0 ? "Unroll,Wavefront,CU" :
gfxWaveOrder == 1 ? "Unroll,CU,Wavefront" :
gfxWaveOrder == 2 ? "Wavefront,Unroll,CU" :
gfxWaveOrder == 3 ? "Wavefront,CU,Unroll" :
gfxWaveOrder == 4 ? "CU,Unroll,Wavefront" :
"CU,Wavefront,Unroll"));
Print("MIN_VAR_SUBEXEC", minNumVarSubExec,
"Using at least %d subexecutor(s) for variable subExec tranfers", minNumVarSubExec);
Print("MAX_VAR_SUBEXEC", maxNumVarSubExec,
"Using up to %s subexecutors for variable subExec transfers",
maxNumVarSubExec ? std::to_string(maxNumVarSubExec).c_str() : "all available");
Print("NUM_ITERATIONS", numIterations,
(numIterations == 0) ? "Running infinitely" :
"Running %d %s", abs(numIterations), (numIterations > 0 ? " timed iteration(s)" : "seconds(s) per Test"));
Print("NUM_SUBITERATIONS", numSubIterations,
"Running %s subiterations", (numSubIterations == 0 ? "infinite" : std::to_string(numSubIterations)).c_str());
Print("NUM_WARMUPS", numWarmups,
"Running %d warmup iteration(s) per Test", numWarmups);
Print("SHOW_ITERATIONS", showIterations,
"%s per-iteration timing", showIterations ? "Showing" : "Hiding");
Print("USE_HIP_EVENTS", useHipEvents,
"Using %s for GFX/DMA Executor timing", useHipEvents ? "HIP events" : "CPU wall time");
Print("USE_HSA_DMA", useHsaDma,
"Using %s for DMA execution", useHsaDma ? "hsa_amd_async_copy" : "hipMemcpyAsync");
Print("USE_INTERACTIVE", useInteractive,
"Running in %s mode", useInteractive ? "interactive" : "non-interactive");
Print("USE_SINGLE_STREAM", useSingleStream,
"Using single stream per GFX %s", useSingleStream ? "device" : "Transfer");
if (getenv("XCC_PREF_TABLE")) {
printf("%36s: Preferred XCC Table (XCC_PREF_TABLE)\n", "");
printf("%36s: ", "");
for (int i = 0; i < numGpuDevices; i++) printf(" %3d", i); printf(" (#XCCs)\n");
for (int i = 0; i < numGpuDevices; i++) {
printf("%36s: GPU %3d ", "", i);
for (int j = 0; j < numGpuDevices; j++)
printf(" %3d", prefXccTable[i][j]);
printf(" %3d\n", TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, i}));
}
}
Print("VALIDATE_DIRECT", validateDirect,
"Validate GPU destination memory %s", validateDirect ? "directly" : "via CPU staging buffer");
Print("VALIDATE_SOURCE", validateSource,
validateSource ? "Validate source after preparation" : "Do not perform source validation after prep");
printf("\n");
};
// Helper function that parses an environment variable, falling back to a default value
static int GetEnvVar(std::string const& varname, int defaultValue)
{
if (getenv(varname.c_str()))
return atoi(getenv(varname.c_str()));
return defaultValue;
}
static std::string GetEnvVar(std::string const& varname, std::string const& defaultValue)
{
if (getenv(varname.c_str()))
return getenv(varname.c_str());
return defaultValue;
}
std::string GetCuMaskDesc() const
{
std::vector<std::pair<int, int>> runs;
int numXccs = TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, 0});
bool inRun = false;
std::pair<int, int> curr;
int used = 0;
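// Each CU occupies numXccs consecutive bits, so stride by numXccs and collapse the set CUs into contiguous ranges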
for (int targetBit = 0; targetBit < cuMask.size() * 32; targetBit += numXccs) {
if (cuMask[targetBit/32] & (1 << (targetBit%32))) {
used++;
if (!inRun) {
inRun = true;
curr.first = targetBit / numXccs;
}
} else {
if (inRun) {
inRun = false;
curr.second = targetBit / numXccs - 1;
runs.push_back(curr);
}
}
}
if (inRun) {
curr.second = (cuMask.size() * 32) / numXccs - 1;
runs.push_back(curr);
}
std::string result = "CUs used: (" + std::to_string(used) + ") ";
for (int i = 0; i < runs.size(); i++)
{
if (i) result += ",";
if (runs[i].first == runs[i].second) result += std::to_string(runs[i].first);
else result += std::to_string(runs[i].first) + "-" + std::to_string(runs[i].second);
}
return result;
}
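// Convert the collected environment variable settings into backend ConfigOptions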
TransferBench::ConfigOptions ToConfigOptions()
{
TransferBench::ConfigOptions cfg;
cfg.general.numIterations = numIterations;
cfg.general.numSubIterations = numSubIterations;
cfg.general.numWarmups = numWarmups;
cfg.general.recordPerIteration = showIterations;
cfg.general.useInteractive = useInteractive;
cfg.data.alwaysValidate = alwaysValidate;
cfg.data.blockBytes = blockBytes;
cfg.data.byteOffset = byteOffset;
cfg.data.validateDirect = validateDirect;
cfg.data.validateSource = validateSource;
cfg.data.fillPattern = fillPattern;
cfg.dma.useHipEvents = useHipEvents;
cfg.dma.useHsaCopy = useHsaDma;
cfg.gfx.blockSize = gfxBlockSize;
cfg.gfx.cuMask = cuMask;
cfg.gfx.prefXccTable = prefXccTable;
cfg.gfx.unrollFactor = gfxUnroll;
cfg.gfx.useHipEvents = useHipEvents;
cfg.gfx.useMultiStream = !useSingleStream;
cfg.gfx.useSingleTeam = gfxSingleTeam;
cfg.gfx.waveOrder = gfxWaveOrder;
return cfg;
}
};
#endif
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "EnvVars.hpp"
void AllToAllPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
enum
{
A2A_COPY = 0,
A2A_READ_ONLY = 1,
A2A_WRITE_ONLY = 2
};
char a2aModeStr[3][20] = {"Copy", "Read-Only", "Write-Only"};
// Force single-stream mode for all-to-all benchmark
ev.useSingleStream = 1;
// Force to gfx unroll 2 unless explicitly set
ev.gfxUnroll = EnvVars::GetEnvVar("GFX_UNROLL", 2);
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
// Collect env vars for this preset
int a2aDirect = EnvVars::GetEnvVar("A2A_DIRECT" , 1);
int a2aLocal = EnvVars::GetEnvVar("A2A_LOCAL" , 0);
int a2aMode = EnvVars::GetEnvVar("A2A_MODE" , 0);
int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int numSubExecs = EnvVars::GetEnvVar("NUM_SUB_EXEC" , 8);
int useDmaExec = EnvVars::GetEnvVar("USE_DMA_EXEC" , 0);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);
int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
// Print off environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
if (!ev.outputToCsv) printf("[AllToAll Related]\n");
ev.Print("A2A_DIRECT" , a2aDirect , a2aDirect ? "Only using direct links" : "Full all-to-all");
ev.Print("A2A_LOCAL" , a2aLocal , "%s local transfers", a2aLocal ? "Include" : "Exclude");
ev.Print("A2A_MODE" , a2aMode , a2aModeStr[a2aMode]);
ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus);
ev.Print("NUM_SUB_EXEC" , numSubExecs , "Using %d subexecutors/CUs per Transfer", numSubExecs);
ev.Print("USE_DMA_EXEC" , useDmaExec , "Using %s executor", useDmaExec ? "DMA" : "GFX");
ev.Print("USE_FINE_GRAIN" , useFineGrain , "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC");
printf("\n");
}
// Validate env vars
if (a2aMode < 0 || a2aMode > 2) {
printf("[ERROR] a2aMode must be between 0 and 2\n");
exit(1);
}
if (numGpus < 0 || numGpus > numDetectedGpus) {
printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
exit(1);
}
// Read-only mode omits destinations; write-only mode omits sources
int const numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1);
int const numDsts = (a2aMode == A2A_READ_ONLY ? 0 : 1);
MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
ExeType exeType = useDmaExec ? EXE_GPU_DMA : EXE_GPU_GFX;
std::map<std::pair<int, int>, int> reIndex;
std::vector<Transfer> transfers;
for (int i = 0; i < numGpus; i++) {
for (int j = 0; j < numGpus; j++) {
// Check whether or not to execute this pair
if (i == j) {
if (!a2aLocal) continue;
} else if (a2aDirect) {
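// Only keep GPU pairs connected by a single hop (direct link)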
#if !defined(__NVCC__)
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
if (hopCount != 1) continue;
#endif
}
// Build Transfer and add it to list
TransferBench::Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
if (numSrcs) transfer.srcs.push_back({memType, i});
if (numDsts) transfer.dsts.push_back({memType, j});
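// With USE_REMOTE_READ the destination GPU (j) executes the Transfer; otherwise the source GPU (i) does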
transfer.exeDevice = {exeType, (useRemoteRead ? j : i)};
transfer.exeSubIndex = -1;
transfer.numSubExecs = numSubExecs;
reIndex[std::make_pair(i,j)] = transfers.size();
transfers.push_back(transfer);
}
}
printf("GPU-GFX All-To-All benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes between %s pairs of GPUs using %d CUs (%lu Transfers)\n",
numBytesPerTransfer, a2aDirect ? "directly connected" : "all", numSubExecs, transfers.size());
if (transfers.size() == 0) return;
// Execute Transfers
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
TransferBench::TestResults results;
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
for (auto const& err : results.errResults)
printf("%s\n", err.errMsg.c_str());
exit(1);
} else {
PrintResults(ev, 1, transfers, results);
}
// Print results
char separator = (ev.outputToCsv ? ',' : ' ');
printf("\nSummary: [%lu bytes per Transfer]\n", numBytesPerTransfer);
printf("==========================================================\n");
printf("SRC\\DST ");
for (int dst = 0; dst < numGpus; dst++)
printf("%cGPU %02d ", separator, dst);
printf(" %cSTotal %cActual\n", separator, separator);
double totalBandwidthGpu = 0.0;
double minExecutorBandwidth = std::numeric_limits<double>::max();
double maxExecutorBandwidth = 0.0;
std::vector<double> colTotalBandwidth(numGpus+1, 0.0);
for (int src = 0; src < numGpus; src++) {
double rowTotalBandwidth = 0;
double executorBandwidth = 0;
printf("GPU %02d", src);
for (int dst = 0; dst < numGpus; dst++) {
if (reIndex.count(std::make_pair(src, dst))) {
int const transferIdx = reIndex[std::make_pair(src,dst)];
TransferBench::TransferResult const& r = results.tfrResults[transferIdx];
colTotalBandwidth[dst] += r.avgBandwidthGbPerSec;
rowTotalBandwidth += r.avgBandwidthGbPerSec;
totalBandwidthGpu += r.avgBandwidthGbPerSec;
executorBandwidth = std::max(executorBandwidth,
results.exeResults[transfers[transferIdx].exeDevice].avgBandwidthGbPerSec);
printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
} else {
printf("%c%8s ", separator, "N/A");
}
}
printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator, executorBandwidth);
minExecutorBandwidth = std::min(minExecutorBandwidth, executorBandwidth);
maxExecutorBandwidth = std::max(maxExecutorBandwidth, executorBandwidth);
colTotalBandwidth[numGpus] += rowTotalBandwidth;
}
printf("\nRTotal");
for (int dst = 0; dst < numGpus; dst++) {
printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
}
printf(" %c%8.3f %c%8.3f %c%8.3f\n", separator, colTotalBandwidth[numGpus],
separator, minExecutorBandwidth, separator, maxExecutorBandwidth);
printf("\n");
printf("Average bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu / transfers.size());
printf("Aggregate bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu);
printf("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
PrintErrors(results.errResults);
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
void HealthCheckPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
// Check for supported platforms
#if defined(__NVCC__)
printf("[WARN] healthcheck preset not supported on NVIDIA hardware\n");
return;
#endif
bool hasFail = false;
// Force use of single stream
ev.useSingleStream = 1;
TransferBench::TestResults results;
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
if (numGpuDevices != 8) {
printf("[WARN] healthcheck preset is currently only supported on 8-GPU MI300X hardware\n");
exit(1);
}
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, gpuId));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
if (!(archName == "gfx940" || archName == "gfx941" || archName == "gfx942"))
{
printf("[WARN] healthcheck preset is currently only supported on 8-GPU MI300X hardware\n");
exit(1);
}
}
// Pass/fail bandwidth limits: default to 95% of expected values, overridable via LIMIT_UDIR / LIMIT_BDIR / LIMIT_A2A
double udirLimit = getenv("LIMIT_UDIR") ? atof(getenv("LIMIT_UDIR")) : (int)(48 * 0.95);
double bdirLimit = getenv("LIMIT_BDIR") ? atof(getenv("LIMIT_BDIR")) : (int)(96 * 0.95);
double a2aLimit = getenv("LIMIT_A2A") ? atof(getenv("LIMIT_A2A")) : (int)(45 * 0.95);
// Run unidirectional read test: each GPU reads from its closest CPU NUMA node
printf("Testing unidirectional reads from CPU ");
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
printf("[ERROR] Unable to detect closest CPU NUMA node to GPU %d\n", gpuId);
exit(1);
}
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeDevice = {EXE_GPU_GFX, gpuId};
t.numBytes = 64*1024*1024;
t.srcs = {{MEM_CPU, memIndex}};
t.dsts = {};
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
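// Try 7 to 10 CUs; pass as soon as a run meets the bandwidth limit, otherwise keep the best result for reporting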
for (int cu = 7; cu <= 10; cu++) {
t.numSubExecs = cu;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
bestResult = std::max(bestResult, results.tfrResults[0].avgBandwidthGbPerSec);
} else {
PrintErrors(results.errResults);
}
if (results.tfrResults[0].avgBandwidthGbPerSec >= udirLimit) {
passed = true;
break;
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
}
}
}
// Run unidirectional write from GPU to CPU
printf("Testing unidirectional writes to CPU ");
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
printf("[ERROR] Unable to detect closest CPU NUMA node to GPU %d\n", gpuId);
exit(1);
}
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeDevice = {EXE_GPU_GFX, gpuId};
t.numBytes = 64*1024*1024;
t.srcs = {};
t.dsts = {{MEM_CPU, memIndex}};
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t.numSubExecs = cu;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
bestResult = std::max(bestResult, results.tfrResults[0].avgBandwidthGbPerSec);
} else {
PrintErrors(results.errResults);
}
if (results.tfrResults[0].avgBandwidthGbPerSec >= udirLimit) {
passed = true;
break;
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
}
}
}
// Run bidirectional tests
printf("Testing bidirectional reads + writes ");
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
printf("[ERROR] Unable to detect closest CPU NUMA node to GPU %d\n", gpuId);
exit(1);
}
std::vector<Transfer> transfers(2);
Transfer& t0 = transfers[0];
Transfer& t1 = transfers[1];
t0.exeDevice = {EXE_GPU_GFX, gpuId};
t0.numBytes = 64*1024*1024;
t0.srcs = {{MEM_CPU, memIndex}};
t0.dsts = {};
t1.exeDevice = {EXE_GPU_GFX, gpuId};
t1.numBytes = 64*1024*1024;
t1.srcs = {};
t1.dsts = {{MEM_CPU, memIndex}};
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t0.numSubExecs = cu;
t1.numSubExecs = cu;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
double sum = (results.tfrResults[0].avgBandwidthGbPerSec +
results.tfrResults[1].avgBandwidthGbPerSec);
bestResult = std::max(bestResult, sum);
if (sum >= bdirLimit) {
passed = true;
break;
}
} else {
PrintErrors(results.errResults);
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, bdirLimit);
}
}
}
// Run XGMI tests:
printf("Testing all-to-all XGMI copies "); fflush(stdout);
{
// Force GFX unroll to 2 for MI300
ev.gfxUnroll = 2;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
std::vector<Transfer> transfers;
for (int i = 0; i < numGpuDevices; i++) {
for (int j = 0; j < numGpuDevices; j++) {
if (i == j) continue;
Transfer t;
t.numBytes = 64*1024*1024;
t.numSubExecs = 8;
t.exeDevice = {EXE_GPU_GFX, i};
t.srcs = {{MEM_GPU_FINE, i}};
t.dsts = {{MEM_GPU_FINE, j}};
transfers.push_back(t);
}
}
std::vector<std::pair<std::pair<int,int>, double>> fails;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
int transferIdx = 0;
for (int i = 0; i < numGpuDevices; i++) {
printf("."); fflush(stdout);
for (int j = 0; j < numGpuDevices; j++) {
if (i == j) continue;
double bw = results.tfrResults[transferIdx].avgBandwidthGbPerSec;
if (bw < a2aLimit) {
fails.push_back(std::make_pair(std::make_pair(i,j), bw));
}
transferIdx++;
}
}
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d to GPU %02d: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first.first, p.first.second, p.second, a2aLimit);
}
}
}
exit(hasFail ? 1 : 0);
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
void OneToAllPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
if (numDetectedGpus < 2) {
printf("[ERROR] One-to-all benchmark requires machine with at least 2 GPUs\n");
exit(1);
}
// Collect env vars for this preset
int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int numSubExecs = EnvVars::GetEnvVar("NUM_GPU_SE", 4);
int exeIndex = EnvVars::GetEnvVar("EXE_INDEX", 0);
int sweepDir = EnvVars::GetEnvVar("SWEEP_DIR", 0);
std::string sweepDst = EnvVars::GetEnvVar("SWEEP_DST", "G");
std::string sweepExe = EnvVars::GetEnvVar("SWEEP_EXE", "G");
std::string sweepSrc = EnvVars::GetEnvVar("SWEEP_SRC", "G");
int sweepMin = EnvVars::GetEnvVar("SWEEP_MIN", 1);
int sweepMax = EnvVars::GetEnvVar("SWEEP_MAX", numGpuDevices);
// Display environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
if (!ev.outputToCsv) printf("[One-To-All Related]\n");
ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices);
ev.Print("NUM_GPU_SE", numSubExecs, "Using %d subExecutors/CUs per Transfer", numSubExecs);
ev.Print("EXE_INDEX", exeIndex, "Executing on GPU %d", exeIndex);
ev.Print("SWEEP_DIR", sweepDir, "Direction of transfer");
ev.Print("SWEEP_DST", sweepDst.c_str(), "DST memory types to sweep");
ev.Print("SWEEP_EXE", sweepExe.c_str(), "Executor type to use");
ev.Print("SWEEP_MAX", sweepMax, "Maximum number of peers");
ev.Print("SWEEP_MIN", sweepMin, "Minimum number of peers");
ev.Print("SWEEP_SRC", sweepSrc.c_str(), "SRC memory types to sweep");
printf("\n");
}
// Perform validation
for (auto ch : sweepExe) {
if (ch != 'G' && ch != 'D') {
printf("[ERROR] Unrecognized executor type '%c' specified\n", ch);
exit(1);
}
}
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
TransferBench::TestResults results;
char const sep = (ev.outputToCsv ? ',' : ' ');
for (char src : sweepSrc) for (char exe : sweepExe) for (char dst : sweepDst) {
// Skip invalid configurations: DMA executors need both a SRC and a DST, and NULL -> NULL is meaningless
if ((exe == 'D' && (src == 'N' || dst == 'N')) || (src == 'N' && dst == 'N')) continue;
printf("Executing (%c%s -> %c%d -> %c%s)\n",
src, src == 'N' ? "" : (sweepDir == 0 ? std::to_string(exeIndex).c_str() : std::string("*").c_str()),
exe, exeIndex,
dst, dst == 'N' ? "" : sweepDir == 0 ? std::string("*").c_str() : std::to_string(exeIndex).c_str());
for (int i = 0; i < numGpuDevices; i++) {
if (i == exeIndex) continue;
printf(" GPU %-3d %c", i, sep);
}
printf("\n");
if (!ev.outputToCsv) {
for (int i = 0; i < numGpuDevices-1; i++)
printf("-------------");
printf("\n");
}
for (int p = sweepMin; p <= sweepMax; p++) {
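// Enumerate every subset of peer GPUs with exactly p members (bitmask over GPU indices, executor GPU excluded)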
for (int bitmask = 0; bitmask < (1<<numGpuDevices); bitmask++) {
if (bitmask & (1<<exeIndex) || __builtin_popcount(bitmask) != p) continue;
std::vector<Transfer> transfers;
for (int i = 0; i < numGpuDevices; i++) {
if (bitmask & (1<<i)) {
Transfer t;
CheckForError(TransferBench::CharToExeType(exe, t.exeDevice.exeType));
t.exeDevice.exeIndex = exeIndex;
t.exeSubIndex = -1;
t.numSubExecs = numSubExecs;
t.numBytes = numBytesPerTransfer;
if (src == 'N') {
t.srcs.clear();
} else {
t.srcs.resize(1);
CheckForError(TransferBench::CharToMemType(src, t.srcs[0].memType));
t.srcs[0].memIndex = sweepDir == 0 ? exeIndex : i;
}
if (dst == 'N') {
t.dsts.clear();
} else {
t.dsts.resize(1);
CheckForError(TransferBench::CharToMemType(dst, t.dsts[0].memType));
t.dsts[0].memIndex = sweepDir == 0 ? i : exeIndex;
}
transfers.push_back(t);
}
}
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
}
int counter = 0;
for (int i = 0; i < numGpuDevices; i++) {
if (bitmask & (1<<i))
printf(" %8.3f %c", results.tfrResults[counter++].avgBandwidthGbPerSec, sep);
else if (i != exeIndex)
printf(" %c", sep);
}
printf(" %d %d", p, numSubExecs);
for (auto i = 0; i < transfers.size(); i++) {
printf(" (%s %c%d %s)",
MemDevicesToStr(transfers[i].srcs).c_str(),
ExeTypeStr[transfers[i].exeDevice.exeType], transfers[i].exeDevice.exeIndex,
MemDevicesToStr(transfers[i].dsts).c_str());
}
printf("\n");
}
}
}
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
void PeerToPeerPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
// Collect env vars for this preset
int useDmaCopy = EnvVars::GetEnvVar("USE_GPU_DMA", 0);
int numCpuDevices = EnvVars::GetEnvVar("NUM_CPU_DEVICES", numDetectedCpus);
int numCpuSubExecs = EnvVars::GetEnvVar("NUM_CPU_SE", 4);
int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int numGpuSubExecs = EnvVars::GetEnvVar("NUM_GPU_SE", useDmaCopy ? 1 : TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0}));
int p2pMode = EnvVars::GetEnvVar("P2P_MODE", 0);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN", 0);
int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
// Display environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
int outputToCsv = ev.outputToCsv;
if (!outputToCsv) printf("[P2P Related]\n");
ev.Print("NUM_CPU_DEVICES", numCpuDevices, "Using %d CPUs", numCpuDevices);
ev.Print("NUM_CPU_SE", numCpuSubExecs, "Using %d CPU threads per Transfer", numCpuSubExecs);
ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices);
ev.Print("NUM_GPU_SE", numGpuSubExecs, "Using %d GPU subexecutors/CUs per Transfer", numGpuSubExecs);
ev.Print("P2P_MODE", p2pMode, "Running %s transfers", p2pMode == 0 ? "Uni + Bi" :
p2pMode == 1 ? "Unidirectional"
: "Bidirectional");
ev.Print("USE_FINE_GRAIN", useFineGrain, "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
ev.Print("USE_GPU_DMA", useDmaCopy, "Using GPU-%s as GPU executor", useDmaCopy ? "DMA" : "GFX");
ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC");
printf("\n");
}
char const separator = ev.outputToCsv ? ',' : ' ';
printf("Bytes Per Direction%c%lu\n", separator, numBytesPerTransfer);
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
TransferBench::TestResults results;
// Collect the number of available CPUs/GPUs on this machine
int const numDevices = numCpuDevices + numGpuDevices;
// Perform unidirectional / bidirectional
for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++) {
if ((p2pMode == 1 && isBidirectional == 1) ||
(p2pMode == 2 && isBidirectional == 0)) continue;
printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n", isBidirectional ? "Bi" : "Uni",
useRemoteRead ? "Remote" : "Local",
useRemoteRead ? "Local" : "Remote",
useDmaCopy ? "DMA" : "GFX");
// Print header
if (isBidirectional) {
printf("%12s", "SRC\\DST");
} else {
if (useRemoteRead)
printf("%12s", "SRC\\EXE+DST");
else
printf("%12s", "SRC+EXE\\DST");
}
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numCpuDevices; i++) {
printf("%7s %02d", "CPU", i);
if (ev.outputToCsv) printf(",");
}
if (numCpuDevices > 0) printf(" ");
for (int i = 0; i < numGpuDevices; i++) {
printf("%7s %02d", "GPU", i);
if (ev.outputToCsv) printf(",");
}
printf("\n");
double avgBwSum[2][2] = {};
int avgCount[2][2] = {};
ExeType const gpuExeType = useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
// Loop over all possible src/dst pairs
for (int src = 0; src < numDevices; src++) {
MemType const srcType = (src < numCpuDevices ? MEM_CPU : MEM_GPU);
int const srcIndex = (srcType == MEM_CPU ? src : src - numCpuDevices);
MemType const srcTypeActual = ((useFineGrain && srcType == MEM_CPU) ? MEM_CPU_FINE :
(useFineGrain && srcType == MEM_GPU) ? MEM_GPU_FINE :
srcType);
std::vector<std::vector<double>> avgBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> minBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> maxBandwidth(isBidirectional + 1);
std::vector<std::vector<double>> stdDev(isBidirectional + 1);
if (src == numCpuDevices && src != 0) printf("\n");
for (int dst = 0; dst < numDevices; dst++) {
MemType const dstType = (dst < numCpuDevices ? MEM_CPU : MEM_GPU);
int const dstIndex = (dstType == MEM_CPU ? dst : dst - numCpuDevices);
MemType const dstTypeActual = ((useFineGrain && dstType == MEM_CPU) ? MEM_CPU_FINE :
(useFineGrain && dstType == MEM_GPU) ? MEM_GPU_FINE :
dstType);
// Prepare Transfers
std::vector<Transfer> transfers(isBidirectional + 1);
// SRC -> DST
transfers[0].numBytes = numBytesPerTransfer;
transfers[0].srcs.push_back({srcTypeActual, srcIndex});
transfers[0].dsts.push_back({dstTypeActual, dstIndex});
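// The executor sits on the DST side when USE_REMOTE_READ is set, otherwise on the SRC side; GPU memory uses a GPU executor (GFX or DMA), CPU memory uses a CPU executor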
transfers[0].exeDevice = {IsGpuMemType(useRemoteRead ? dstType : srcType) ? gpuExeType : EXE_CPU,
(useRemoteRead ? dstIndex : srcIndex)};
transfers[0].exeSubIndex = -1;
transfers[0].numSubExecs = (transfers[0].exeDevice.exeType == gpuExeType) ? numGpuSubExecs : numCpuSubExecs;
// DST -> SRC
if (isBidirectional) {
transfers[1].numBytes = numBytesPerTransfer;
transfers[1].srcs.push_back({dstTypeActual, dstIndex});
transfers[1].dsts.push_back({srcTypeActual, srcIndex});
transfers[1].exeDevice = {IsGpuMemType(useRemoteRead ? srcType : dstType) ? gpuExeType : EXE_CPU,
(useRemoteRead ? srcIndex : dstIndex)};
transfers[1].exeSubIndex = -1;
transfers[1].numSubExecs = (transfers[1].exeDevice.exeType == gpuExeType) ? numGpuSubExecs : numCpuSubExecs;
}
bool skipTest = false;
// Abort if executing on NUMA node with no CPUs
for (int i = 0; i <= isBidirectional; i++) {
if (transfers[i].exeDevice.exeType == EXE_CPU &&
TransferBench::GetNumSubExecutors(transfers[i].exeDevice) == 0) {
skipTest = true;
break;
}
#if defined(__NVCC__)
// NVIDIA platform cannot access GPU memory directly from CPU executors
if (transfers[i].exeDevice.exeType == EXE_CPU && (IsGpuMemType(srcType) || IsGpuMemType(dstType))) {
skipTest = true;
break;
}
#endif
}
if (isBidirectional && srcType == dstType && srcIndex == dstIndex) skipTest = true;
if (!skipTest) {
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
for (auto const& err : results.errResults)
printf("%s\n", err.errMsg.c_str());
exit(1);
}
for (int dir = 0; dir <= isBidirectional; dir++) {
double const avgBw = results.tfrResults[dir].avgBandwidthGbPerSec;
avgBandwidth[dir].push_back(avgBw);
if (!(srcType == dstType && srcIndex == dstIndex)) {
avgBwSum[srcType][dstType] += avgBw;
avgCount[srcType][dstType]++;
}
if (ev.showIterations) {
double minTime = results.tfrResults[dir].perIterMsec[0];
double maxTime = minTime;
double varSum = 0;
for (int i = 0; i < results.tfrResults[dir].perIterMsec.size(); i++) {
minTime = std::min(minTime, results.tfrResults[dir].perIterMsec[i]);
maxTime = std::max(maxTime, results.tfrResults[dir].perIterMsec[i]);
double const bw = (transfers[dir].numBytes / 1.0E9) / results.tfrResults[dir].perIterMsec[i] * 1000.0f;
double const delta = (avgBw - bw);
varSum += delta * delta;
}
double const minBw = (transfers[dir].numBytes / 1.0E9) / maxTime * 1000.0f;
double const maxBw = (transfers[dir].numBytes / 1.0E9) / minTime * 1000.0f;
double const stdev = sqrt(varSum / results.tfrResults[dir].perIterMsec.size());
minBandwidth[dir].push_back(minBw);
maxBandwidth[dir].push_back(maxBw);
stdDev[dir].push_back(stdev);
}
}
} else {
for (int dir = 0; dir <= isBidirectional; dir++) {
avgBandwidth[dir].push_back(0);
minBandwidth[dir].push_back(0);
maxBandwidth[dir].push_back(0);
stdDev[dir].push_back(-1.0);
}
}
}
for (int dir = 0; dir <= isBidirectional; dir++) {
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, dir ? "<- " : " ->");
if (ev.outputToCsv) printf(",");
for (int dst = 0; dst < numDevices; dst++) {
if (dst == numCpuDevices && dst != 0) printf(" ");
double const avgBw = avgBandwidth[dir][dst];
if (avgBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", avgBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
if (ev.showIterations) {
// minBw
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "min");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++) {
double const minBw = minBandwidth[dir][i];
if (i == numCpuDevices && i != 0) printf(" ");
if (minBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", minBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
// maxBw
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "max");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++) {
double const maxBw = maxBandwidth[dir][i];
if (i == numCpuDevices && i != 0) printf(" ");
if (maxBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", maxBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
// stddev
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, " sd");
if (ev.outputToCsv) printf(",");
for (int i = 0; i < numDevices; i++) {
double const sd = stdDev[dir][i];
if (i == numCpuDevices && i != 0) printf(" ");
if (sd == -1.0)
printf("%10s", "N/A");
else
printf("%10.2f", sd);
if (ev.outputToCsv) printf(",");
}
printf("\n");
}
fflush(stdout);
}
if (isBidirectional) {
printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "<->");
if (ev.outputToCsv) printf(",");
for (int dst = 0; dst < numDevices; dst++) {
double const sumBw = avgBandwidth[0][dst] + avgBandwidth[1][dst];
if (dst == numCpuDevices && dst != 0) printf(" ");
if (sumBw == 0.0)
printf("%10s", "N/A");
else
printf("%10.2f", sumBw);
if (ev.outputToCsv) printf(",");
}
printf("\n");
if (src < numDevices - 1) printf("\n");
}
}
if (!ev.outputToCsv) {
printf(" ");
for (int srcType : {MEM_CPU, MEM_GPU})
for (int dstType : {MEM_CPU, MEM_GPU})
printf(" %cPU->%cPU", srcType == MEM_CPU ? 'C' : 'G', dstType == MEM_CPU ? 'C' : 'G');
printf("\n");
printf("Averages (During %s):", isBidirectional ? " BiDir" : "UniDir");
for (int srcType : {MEM_CPU, MEM_GPU})
for (int dstType : {MEM_CPU, MEM_GPU}) {
if (avgCount[srcType][dstType])
printf("%10.2f", avgBwSum[srcType][dstType] / avgCount[srcType][dstType]);
else
printf("%10s", "N/A");
}
printf("\n\n");
}
}
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
// Included after EnvVars and Executors
#include "AllToAll.hpp"
#include "HealthCheck.hpp"
#include "OneToAll.hpp"
#include "PeerToPeer.hpp"
#include "Scaling.hpp"
#include "Schmoo.hpp"
#include "Sweep.hpp"
#include <map>
typedef void (*PresetFunc)(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName);
std::map<std::string, std::pair<PresetFunc, std::string>> presetFuncMap =
{
{"a2a", {AllToAllPreset, "Tests parallel transfers between all pairs of GPU devices"}},
{"healthcheck", {HealthCheckPreset,"Simple bandwidth health check (MI300X series only)"}},
{"one2all", {OneToAllPreset, "Test all subsets of parallel transfers from one GPU to all others"}},
{"p2p" , {PeerToPeerPreset, "Peer-to-peer device memory bandwidth test"}},
{"rsweep", {SweepPreset, "Randomly sweep through sets of Transfers"}},
{"scaling", {ScalingPreset, "Run scaling test from one GPU to other devices"}},
{"schmoo", {SchmooPreset, "Scaling tests for local/remote read/write/copy"}},
{"sweep", {SweepPreset, "Ordered sweep through sets of Transfers"}},
};
void DisplayPresets()
{
printf("\nAvailable Preset Benchmarks:\n");
printf("============================\n");
for (auto const& x : presetFuncMap)
printf(" %15s - %s\n", x.first.c_str(), x.second.second.c_str());
}
int RunPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
int const argc,
char** const argv)
{
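// Returns 1 if the argument names a known preset (which is then executed), 0 otherwise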
std::string preset = (argc > 1 ? argv[1] : "");
if (presetFuncMap.count(preset)) {
(presetFuncMap[preset].first)(ev, numBytesPerTransfer, preset);
return 1;
}
return 0;
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
void ScalingPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
// Collect env vars for this preset
int localIdx = EnvVars::GetEnvVar("LOCAL_IDX", 0);
int numCpuDevices = EnvVars::GetEnvVar("NUM_CPU_DEVICES", numDetectedCpus);
int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int sweepMax = EnvVars::GetEnvVar("SWEEP_MAX", 32);
int sweepMin = EnvVars::GetEnvVar("SWEEP_MIN", 1);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN", 0);
// Display environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
int outputToCsv = ev.outputToCsv;
if (!outputToCsv) printf("[Schmoo Related]\n");
ev.Print("LOCAL_IDX", localIdx, "Local GPU index");
ev.Print("SWEEP_MAX", sweepMax, "Max number of subExecutors to use");
ev.Print("SWEEP_MIN", sweepMin, "Min number of subExecutors to use");
printf("\n");
}
// Validate env vars
if (localIdx >= numDetectedGpus) {
printf("[ERROR] Cannot execute scaling test with local GPU device %d\n", localIdx);
exit(1);
}
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
TransferBench::TestResults results;
char separator = (ev.outputToCsv ? ',' : ' ');
int numDevices = numCpuDevices + numGpuDevices;
printf("GPU-GFX Scaling benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes from GPU %d to other devices\n", numBytesPerTransfer, localIdx);
printf("- All numbers reported as GB/sec\n\n");
printf("NumCUs");
for (int i = 0; i < numDevices; i++)
printf("%c %s%02d ", separator, i < numCpuDevices ? "CPU" : "GPU", i < numCpuDevices ? i : i - numCpuDevices);
printf("\n");
std::vector<std::pair<double, int>> bestResult(numDevices);
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeDevice = {EXE_GPU_GFX, localIdx};
t.exeSubIndex = -1;
t.numBytes = numBytesPerTransfer;
  // Honor USE_FINE_GRAIN for GPU buffers (the env var is read above)
  MemType gpuMemType = (useFineGrain ? MEM_GPU_FINE : MEM_GPU);
  t.srcs      = {{gpuMemType, localIdx}};
for (int numSubExec = sweepMin; numSubExec <= sweepMax; numSubExec++) {
t.numSubExecs = numSubExec;
printf("%4d ", numSubExec);
for (int i = 0; i < numDevices; i++) {
      t.dsts = {{i < numCpuDevices ? MEM_CPU : gpuMemType,
                 i < numCpuDevices ? i : i - numCpuDevices}};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
}
double bw = results.tfrResults[0].avgBandwidthGbPerSec;
printf("%c%7.2f ", separator, bw);
if (bw > bestResult[i].first) {
bestResult[i].first = bw;
bestResult[i].second = numSubExec;
}
}
printf("\n");
}
printf(" Best ");
for (int i = 0; i < numDevices; i++)
printf("%c%7.2f(%3d)", separator, bestResult[i].first, bestResult[i].second);
printf("\n");
}
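// Example invocation of the scaling preset (a sketch; the client binary name and
// byte-count argument are assumptions). All knobs read above come from the
// environment: LOCAL_IDX, NUM_CPU_DEVICES, NUM_GPU_DEVICES, SWEEP_MIN, SWEEP_MAX
// and USE_FINE_GRAIN.
//
//   LOCAL_IDX=1 SWEEP_MIN=1 SWEEP_MAX=32 ./TransferBench scaling 67108864
//
// Each row reports, for the given CU count, the GB/sec achieved when copying from
// GPU LOCAL_IDX to every detected CPU and GPU device; the final "Best" row shows
// the peak bandwidth per device and the CU count that achieved it.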
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
void SchmooPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
if (numDetectedGpus < 2) {
printf("[ERROR] Schmoo benchmark requires at least 2 GPUs\n");
exit(1);
}
// Collect env vars for this preset
int localIdx = EnvVars::GetEnvVar("LOCAL_IDX", 0);
int remoteIdx = EnvVars::GetEnvVar("REMOTE_IDX", 1);
int sweepMax = EnvVars::GetEnvVar("SWEEP_MAX", 32);
int sweepMin = EnvVars::GetEnvVar("SWEEP_MIN", 1);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN", 0);
// Display environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
int outputToCsv = ev.outputToCsv;
if (!outputToCsv) printf("[Schmoo Related]\n");
ev.Print("LOCAL_IDX", localIdx, "Local GPU index");
ev.Print("REMOTE_IDX", remoteIdx, "Remote GPU index");
ev.Print("SWEEP_MAX", sweepMax, "Max number of subExecutors to use");
ev.Print("SWEEP_MIN", sweepMin, "Min number of subExecutors to use");
ev.Print("USE_FINE_GRAIN", useFineGrain, "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
printf("\n");
}
// Validate env vars
if (localIdx >= numDetectedGpus || remoteIdx >= numDetectedGpus) {
printf("[ERROR] Cannot execute schmoo test with local GPU device %d, remote GPU device %d\n", localIdx, remoteIdx);
exit(1);
}
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
TransferBench::TestResults results;
char memChar = useFineGrain ? 'F' : 'G';
printf("Bytes to transfer: %lu Local GPU: %d Remote GPU: %d\n", numBytesPerTransfer, localIdx, remoteIdx);
printf(" | Local Read | Local Write | Local Copy | Remote Read | Remote Write| Remote Copy |\n");
printf(" #CUs |%c%02d->G%02d->N00|N00->G%02d->%c%02d|%c%02d->G%02d->%c%02d|%c%02d->G%02d->N00|N00->G%02d->%c%02d|%c%02d->G%02d->%c%02d|\n",
memChar, localIdx, localIdx,
localIdx, memChar, localIdx,
memChar, localIdx, localIdx, memChar, localIdx,
memChar, remoteIdx, localIdx,
localIdx, memChar, remoteIdx,
memChar, localIdx, localIdx, memChar, remoteIdx);
printf("|------|-------------|-------------|-------------|-------------|-------------|-------------|\n");
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeDevice = {EXE_GPU_GFX, localIdx};
t.exeSubIndex = -1;
t.numBytes = numBytesPerTransfer;
MemType memType = (useFineGrain ? MEM_GPU_FINE : MEM_GPU);
for (int numCUs = sweepMin; numCUs <= sweepMax; numCUs++) {
t.numSubExecs = numCUs;
// Local Read
t.srcs = {{memType, localIdx}};
t.dsts = {};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
}
double const localRead = results.tfrResults[0].avgBandwidthGbPerSec;
// Local Write
t.srcs = {};
t.dsts = {{memType, localIdx}};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
}
double const localWrite = results.tfrResults[0].avgBandwidthGbPerSec;
    // Local Copy (both src and dst on the local GPU)
    t.srcs = {{memType, localIdx}};
    t.dsts = {{memType, localIdx}};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
}
double const localCopy = results.tfrResults[0].avgBandwidthGbPerSec;
// Remote Read
t.srcs = {{memType, remoteIdx}};
t.dsts = {};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
}
double const remoteRead = results.tfrResults[0].avgBandwidthGbPerSec;
// Remote Write
t.srcs = {};
t.dsts = {{memType, remoteIdx}};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
}
double const remoteWrite = results.tfrResults[0].avgBandwidthGbPerSec;
// Remote Copy
t.srcs = {{memType, localIdx}};
t.dsts = {{memType, remoteIdx}};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
}
double const remoteCopy = results.tfrResults[0].avgBandwidthGbPerSec;
printf(" %3d %11.3f %11.3f %11.3f %11.3f %11.3f %11.3f \n",
numCUs, localRead, localWrite, localCopy, remoteRead, remoteWrite, remoteCopy);
}
}
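// Example invocation of the schmoo preset (a sketch; the client binary name and
// byte-count argument are assumptions). LOCAL_IDX/REMOTE_IDX select the GPU pair,
// USE_FINE_GRAIN=1 switches the GPU buffers to fine-grained memory, and
// SWEEP_MIN/SWEEP_MAX bound the CU counts that are swept:
//
//   LOCAL_IDX=0 REMOTE_IDX=1 USE_FINE_GRAIN=1 ./TransferBench schmoo 67108864
//
// For each CU count the table reports six bandwidths, all executed by GPU LOCAL_IDX:
// local read / write / copy against its own memory, and remote read / write / copy
// against GPU REMOTE_IDX's memory.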