Header-only TransferBench library refactor (#134)

9658305f · gilbertlee-amd · GitHub · b56d4817 · 9658305f · 9658305f
Unverified Commit 9658305f authored Nov 21, 2024 by gilbertlee-amd Committed by GitHub Nov 21, 2024
20 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ _static/
 _templates/
 _toc.yml
 docBin/
+TransferBench
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,30 @@
 Documentation for TransferBench is available at
 [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
+## v1.54
+### Modified
+- Refactored TransferBench into a header-only library combined with a thin client to facilitate the
+  use of TransferBench as the backend for other applications
+- Optimized how data validation is handled - this should speed up Tests with many parallel transfers as data is only
+  generated once
+- Preset benchmarks now no longer take in any extra command line arguments.  Preset settings are only accessed via
+  environment variables.  Details for each preset are printed
+- The a2a preset benchmark now defaults to using fine-grained memory and GFX unroll of 2
+- Refactored how Transfers are launched in parallel which has reduced some CPU-side overheads
+- CPU and DMA executor timing now use CPU wall clock timing instead of slowest Transfer time
+### Added
+- New one2all preset which sweeps over all subsets of parallel transfers from one GPU to others
+- Adding new warnings for DMA execution relating to how HIP will default to using agents from the source memory
+### Removed
+- CU scaling preset has been removed.  Similar functionality already exists in the schmoo preset benchmark
+- Preparation of source data via GFX kernel has been removed (USE_PREP_KERNEL)
+- Removed GFX block-reordering (BLOCK_ORDER)
+- Removed NUM_CPU_DEVICES and NUM_GPU_DEVICES from the common env vars; they are now only available in the presets they apply to.
+- Removed SHARED_MEM_BYTES option for GFX executor
+- Removed USE_PCIE_INDEX
+### Fixed
+- Fixed a potential timing reporting issue when DMA executed Transfers end up getting serialized.
 ## v1.53
 ### Added
 - Added ability to specify NULL for sweep preset as source or destination memory type

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
 # Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
 if (DEFINED ENV{ROCM_PATH})
    set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE STRING "ROCm install directory")
 else()
@@ -6,13 +7,13 @@ else()
 endif()
 cmake_minimum_required(VERSION 3.5)
-project(TransferBench VERSION 1.51.0 LANGUAGES CXX)
+project(TransferBench VERSION 1.54.0 LANGUAGES CXX)
-set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib")
+set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 --std=c++20 -L${ROCM_PATH}/lib")
 include_directories(${ROCM_PATH}/include)
 link_libraries(numa hsa-runtime64 pthread)
-set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ..)
+set (CMAKE_RUNTIME_OUTPUT_DIRECTORY .)
-add_executable(TransferBench src/TransferBench.cpp)
+add_executable(TransferBench src/client/Client.cpp)
-target_include_directories(TransferBench PRIVATE src/include)
+target_include_directories(TransferBench PRIVATE src/header src/client src/client/Presets)
 find_package(ROCM 0.8 REQUIRED PATHS ${ROCM_PATH})
 include(ROCMInstallTargets)

--- a/Makefile
+++ b/Makefile
 #
-# Copyright (c) 2023      Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
 #
-all:
+# Configuration options
-	cd src ; make
+ROCM_PATH ?= /opt/rocm
+CUDA_PATH ?= /usr/local/cuda
+HIPCC=$(ROCM_PATH)/bin/hipcc
+NVCC=$(CUDA_PATH)/bin/nvcc
+# Compile TransferBenchCuda if nvcc detected
+ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
+	EXE=TransferBenchCuda
+else
+	EXE=TransferBench
+endif
+CXXFLAGS = -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
+NVFLAGS  = -x cu -lnuma -arch=native
+COMMON_FLAGS = -O3 --std=c++20 -I./src/header -I./src/client -I./src/client/Presets
+LDFLAGS += -lpthread
+all: $(EXE)
+TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
+	$(HIPCC) $(CXXFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)
+TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
+	$(NVCC) $(NVFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)
 clean:
-	cd src ; make clean
+	rm -f *.o ./TransferBench ./TransferBenchCuda
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -8,8 +8,8 @@ import re
 from rocm_docs import ROCmDocs
-with open('../src/include/EnvVars.hpp', encoding='utf-8') as f:
+with open('../src/header/TransferBench.hpp', encoding='utf-8') as f:
-    match = re.search(r'#define TB_VERSION "([0-9.]+)[^0-9.]+', f.read())
+    match = re.search(r'constexpr char VERSION\[\] = "([0-9.]+)[^0-9.]+', f.read())
    if not match:
        raise ValueError("VERSION not found!")
    version_number = match[1]
@@ -18,7 +18,7 @@ left_nav_title = f"TransferBench {version_number} Documentation"
 # for PDF output on Read the Docs
 project = "TransferBench Documentation"
 author = "Advanced Micro Devices, Inc."
-copyright = "Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved."
+copyright = "Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved."
 version = version_number
 release = version_number

--- a/docs/install/install.rst
+++ b/docs/install/install.rst
@@ -47,7 +47,7 @@ To build documentation locally, use:
 .. code-block:: bash
  cd docs
-  pip3 install -r .sphinx/requirements.txt
+  pip3 install -r ./sphinx/requirements.txt
  python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
 NVIDIA platform support

--- a/src/Makefile
+++ b/src/Makefile
-# Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
-ROCM_PATH ?= /opt/rocm
-CUDA_PATH ?= /usr/local/cuda
-HIPCC=$(ROCM_PATH)/bin/hipcc
-NVCC=$(CUDA_PATH)/bin/nvcc
-# Compile TransferBenchCuda if nvcc detected
-ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
-	EXE=../TransferBenchCuda
-else
-	EXE=../TransferBench
-endif
-CXXFLAGS = -O3 -Iinclude -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
-NVFLAGS = -O3 -Iinclude -x cu -lnuma -arch=native
-LDFLAGS    += -lpthread
-all: $(EXE)
-../TransferBench: TransferBench.cpp $(shell find -regex ".*\.\hpp")
-	$(HIPCC) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
-../TransferBenchCuda: TransferBench.cpp $(shell find -regex ".*\.\hpp")
-	$(NVCC) $(NVFLAGS) $< -o $@ $(LDFLAGS)
-clean:
-	rm -f *.o ../TransferBench ../TransferBenchCuda
--- a/src/TransferBench.cpp
+++ b/src/TransferBench.cpp
--- a/src/client/Client.cpp
+++ b/src/client/Client.cpp
+/*
+Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include "Client.hpp"
+#include "Presets.hpp"
+#include "Topology.hpp"
+#include <fstream>
+// Entry point of the TransferBench client.
+//   argv[1]  : preset name, the literal "cmdline", or a Transfer configuration filename
+//   argv[2]  : (optional) bytes per Transfer, may end in K/M/G; 0 sweeps a range of sizes
+//   argv[3+] : when argv[1] is "cmdline", joined with spaces to form a single Test line
+// With no arguments, usage / presets / topology are displayed and the program exits.
+int main(int argc, char **argv) {
+  // Collect environment variables
+  EnvVars ev;
+  // Display usage instructions and detected topology
+  if (argc <= 1) {
+    if (!ev.outputToCsv) {
+      DisplayUsage(argv[0]);
+      DisplayPresets();
+    }
+    DisplayTopology(ev.outputToCsv);
+    exit(0);
+  }
+  // Determine number of bytes to run per Transfer
+  size_t numBytesPerTransfer = argc > 2 ? atoll(argv[2]) : DEFAULT_BYTES_PER_TRANSFER;
+  if (argc > 2) {
+    // Adjust bytes if unit specified.  An empty argument has no last character to inspect.
+    size_t const argLen = strlen(argv[2]);
+    if (argLen > 0) {
+      // Cases deliberately fall through: 'G' scales by 1024 three times, 'M' twice, 'K' once
+      switch (argv[2][argLen - 1]) {
+      case 'G': case 'g': numBytesPerTransfer *= 1024; // fall through
+      case 'M': case 'm': numBytesPerTransfer *= 1024; // fall through
+      case 'K': case 'k': numBytesPerTransfer *= 1024;
+      }
+    }
+  }
+  if (numBytesPerTransfer % 4) {
+    printf("[ERROR] numBytesPerTransfer (%zu) must be a multiple of 4\n", numBytesPerTransfer);
+    exit(1);
+  }
+  // Run preset benchmark if requested
+  if (RunPreset(ev, numBytesPerTransfer, argc, argv)) exit(0);
+  // Read input from command line or configuration file
+  std::vector<std::string> lines;
+  {
+    std::string line;
+    if (!strcmp(argv[1], "cmdline")) {
+      for (int i = 3; i < argc; i++)
+        line += std::string(argv[i]) + " ";
+      lines.push_back(line);
+    } else {
+      std::ifstream cfgFile(argv[1]);
+      if (!cfgFile.is_open()) {
+        printf("[ERROR] Unable to open transfer configuration file: [%s]\n", argv[1]);
+        exit(1);
+      }
+      while (std::getline(cfgFile, line))
+        lines.push_back(line);
+      cfgFile.close();
+    }
+  }
+  // Print environment variables and CSV header
+  ev.DisplayEnvVars();
+  if (ev.outputToCsv)
+    printf("Test#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),SrcAddr,DstAddr\n");
+  TransferBench::ConfigOptions cfgOptions = ev.ToConfigOptions();
+  TransferBench::TestResults results;
+  // Process each line as a Test
+  int testNum = 0;
+  for (std::string const &line : lines) {
+    // Check if line is a comment to be echoed to output (starts with ##)
+    if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') printf("%s\n", line.c_str());
+    // Parse set of parallel Transfers to execute
+    std::vector<Transfer> transfers;
+    CheckForError(TransferBench::ParseTransfers(line, transfers));
+    if (transfers.empty()) continue;
+    // Check for variable sub-executors Transfers (numSubExecs == 0)
+    int numVariableTransfers = 0;
+    int maxVarCount = 0;
+    {
+      std::map<ExeDevice, int> varTransferCount;
+      for (auto const& t : transfers) {
+        if (t.numSubExecs == 0) {
+          if (t.exeDevice.exeType != EXE_GPU_GFX) {
+            printf("[ERROR] Variable number of subexecutors is only supported on GFX executors\n");
+            exit(1);
+          }
+          numVariableTransfers++;
+          varTransferCount[t.exeDevice]++;
+          maxVarCount = std::max(maxVarCount, varTransferCount[t.exeDevice]);
+        }
+      }
+      if (numVariableTransfers > 0 && static_cast<size_t>(numVariableTransfers) != transfers.size()) {
+        printf("[ERROR] All or none of the Transfers in the Test must use variable number of Subexecutors\n");
+        exit(1);
+      }
+    }
+    // Run the specified numbers of bytes otherwise generate a range of values
+    for (size_t bytes = (1<<10); bytes <= (1<<29); bytes *= 2) {
+      size_t deltaBytes = std::max(1UL, bytes / ev.samplingFactor);
+      size_t currBytes = (numBytesPerTransfer == 0) ? bytes : numBytesPerTransfer;
+      do {
+        for (auto& t : transfers)
+          t.numBytes = currBytes;
+        if (maxVarCount == 0) {
+          if (TransferBench::RunTransfers(cfgOptions, transfers, results)) {
+            PrintResults(ev, ++testNum, transfers, results);
+          }
+          PrintErrors(results.errResults);
+        } else {
+          // Variable subexecutors - Determine how many subexecutors to sweep up to
+          int maxNumVarSubExec = ev.maxNumVarSubExec;
+          if (maxNumVarSubExec == 0) {
+            maxNumVarSubExec = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0}) / maxVarCount;
+          }
+          // Track the best run explicitly instead of comparing against a default-constructed result
+          TransferBench::TestResults bestResults;
+          std::vector<Transfer> bestTransfers;
+          bool haveBest = false;
+          for (int numSubExecs = ev.minNumVarSubExec; numSubExecs <= maxNumVarSubExec; numSubExecs++) {
+            std::vector<Transfer> tempTransfers = transfers;
+            for (auto& t : tempTransfers) {
+              if (t.numSubExecs == 0) t.numSubExecs = numSubExecs;
+            }
+            TransferBench::TestResults tempResults;
+            if (!TransferBench::RunTransfers(cfgOptions, tempTransfers, tempResults)) {
+              PrintErrors(tempResults.errResults);
+            } else if (!haveBest ||
+                       tempResults.avgTotalBandwidthGbPerSec > bestResults.avgTotalBandwidthGbPerSec) {
+              bestResults   = tempResults;
+              bestTransfers = tempTransfers;
+              haveBest      = true;
+            }
+          }
+          // Only report when at least one subexecutor count ran successfully
+          if (haveBest) {
+            PrintResults(ev, ++testNum, bestTransfers, bestResults);
+            PrintErrors(bestResults.errResults);
+          }
+        }
+        if (numBytesPerTransfer != 0) break;
+        currBytes += deltaBytes;
+      } while (currBytes < bytes * 2);
+      if (numBytesPerTransfer != 0) break;
+    }
+  }
+}
+// Print the client/backend versions and the command-line usage text, followed by
+// the environment-variable help from EnvVars::DisplayUsage().
+//   cmdName : program name shown in the "Usage:" line
+// Exits with status 1 if numa_available() reports that NUMA is unavailable.
+void DisplayUsage(char const* cmdName)
+{
+  printf("TransferBench Client v%s (Backend v%s)\n", CLIENT_VERSION, TransferBench::VERSION);
+  printf("========================================\n");
+  if (numa_available() == -1)
+  {
+    printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
+    exit(1);
+  }
+  printf("Usage: %s config <N>\n", cmdName);
+  printf("  config: Either:\n");
+  printf("          - Filename of configFile containing Transfers to execute (see example.cfg for format)\n");
+  printf("          - Name of preset config:\n");
+  printf("  N     : (Optional) Number of bytes to copy per Transfer.\n");
+  // DEFAULT_BYTES_PER_TRANSFER is a size_t, so it is printed with %zu
+  printf("          If not specified, defaults to %zu bytes. Must be a multiple of 4 bytes\n",
+         DEFAULT_BYTES_PER_TRANSFER);
+  printf("          If 0 is specified, a range of Ns will be benchmarked\n");
+  printf("          May append a suffix ('K', 'M', 'G') for kilobytes / megabytes / gigabytes\n");
+  printf("\n");
+  EnvVars::DisplayUsage();
+}
+// Build a compact label for a list of memory devices by concatenating, for each
+// entry, its MemTypeStr symbol and its memory index.  An empty list yields "N".
+std::string MemDevicesToStr(std::vector<MemDevice> const& memDevices) {
+  std::stringstream label;
+  if (!memDevices.empty()) {
+    for (size_t pos = 0; pos < memDevices.size(); ++pos) {
+      MemDevice const& dev = memDevices[pos];
+      label << TransferBench::MemTypeStr[dev.memType];
+      label << dev.memIndex;
+    }
+    return label.str();
+  }
+  return "N";
+}
+// Print the results of one Test: a line per executor, a line per Transfer run by
+// that executor, optional per-iteration details, and a final aggregate line.
+//   ev        : client settings (outputToCsv selects the separator, showIterations
+//               enables per-iteration output)
+//   testNum   : Test number shown in the header (header is omitted for CSV output)
+//   transfers : the Transfers that were executed
+//   results   : results produced for those Transfers
+// Exits with status 1 if per-iteration output is requested but the number of
+// recorded iterations does not match results.numTimedIterations.
+void PrintResults(EnvVars const& ev, int const testNum,
+                  std::vector<Transfer> const& transfers,
+                  TransferBench::TestResults const& results)
+{
+  char sep = ev.outputToCsv ? ',' : '|';
+  size_t numTimedIterations = results.numTimedIterations;
+  if (!ev.outputToCsv) printf("Test %d:\n", testNum);
+  // Loop over each executor (by reference to avoid copying every result entry)
+  for (auto const& exeInfoPair : results.exeResults) {
+    ExeDevice const& exeDevice = exeInfoPair.first;
+    ExeResult const& exeResult = exeInfoPair.second;
+    ExeType const    exeType   = exeDevice.exeType;
+    int32_t const    exeIndex  = exeDevice.exeIndex;
+    printf(" Executor: %3s %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %-7.3f GB/s (sum)\n",
+           ExeTypeName[exeType], exeIndex, sep, exeResult.avgBandwidthGbPerSec, sep,
+           exeResult.avgDurationMsec, sep, exeResult.numBytes, sep, exeResult.sumBandwidthGbPerSec);
+    // Loop over each Transfer executed by this executor
+    for (int idx : exeResult.transferIdx) {
+      Transfer const& t = transfers[idx];
+      TransferResult const& r = results.tfrResults[idx];
+      // Optional ".<subIndex>" suffix, left empty when no sub-index was requested (-1)
+      char exeSubIndexStr[32] = "";
+      if (t.exeSubIndex != -1)
+        snprintf(exeSubIndexStr, sizeof(exeSubIndexStr), ".%d", t.exeSubIndex);
+      printf("     Transfer %02d  %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %s -> %s%02d%s:%03d -> %s\n",
+             idx,                    sep,
+             r.avgBandwidthGbPerSec, sep,
+             r.avgDurationMsec,      sep,
+             r.numBytes,             sep,
+             MemDevicesToStr(t.srcs).c_str(), ExeTypeName[exeType], exeIndex,
+             exeSubIndexStr, t.numSubExecs, MemDevicesToStr(t.dsts).c_str());
+      // Show per-iteration timing information
+      if (ev.showIterations) {
+        // Check that per-iteration information exists
+        if (r.perIterMsec.size() != numTimedIterations) {
+          printf("[ERROR] Per iteration timing data unavailable: Expected %zu data points, but have %zu\n",
+                 numTimedIterations, r.perIterMsec.size());
+          exit(1);
+        }
+        // Compute standard deviation and track iterations by speed
+        // (the set orders entries by duration, then by 1-based iteration number)
+        std::set<std::pair<double, int>> times;
+        double stdDevTime = 0;
+        double stdDevBw = 0;
+        for (size_t i = 0; i < numTimedIterations; i++) {
+          times.insert(std::make_pair(r.perIterMsec[i], static_cast<int>(i) + 1));
+          double const varTime = fabs(r.avgDurationMsec - r.perIterMsec[i]);
+          stdDevTime += varTime * varTime;
+          double iterBandwidthGbs = (t.numBytes / 1.0E9) / r.perIterMsec[i] * 1000.0f;
+          double const varBw = fabs(iterBandwidthGbs - r.avgBandwidthGbPerSec);
+          stdDevBw += varBw * varBw;
+        }
+        stdDevTime = sqrt(stdDevTime / numTimedIterations);
+        stdDevBw = sqrt(stdDevBw / numTimedIterations);
+        // Loop over iterations (fastest to slowest)
+        for (auto const& time : times) {
+          double iterDurationMsec = time.first;
+          double iterBandwidthGbs = (t.numBytes / 1.0E9) / iterDurationMsec * 1000.0f;
+          printf("      Iter %03d    %c %7.3f GB/s %c %8.3f ms %c", time.second, sep, iterBandwidthGbs, sep, iterDurationMsec, sep);
+          std::set<int> usedXccs;
+          // Per-iteration CU data may be missing; time.second is 1-based
+          if (static_cast<size_t>(time.second - 1) < r.perIterCUs.size()) {
+            printf(" CUs:");
+            for (auto const& x : r.perIterCUs[time.second - 1]) {
+              printf(" %02d:%02d", x.first, x.second);
+              usedXccs.insert(x.first);
+            }
+          }
+          printf(" XCCs:");
+          for (auto x : usedXccs)
+            printf(" %02d", x);
+          printf("\n");
+        }
+        printf("      StandardDev %c %7.3f GB/s %c %8.3f ms %c\n", sep, stdDevBw, sep, stdDevTime, sep);
+      }
+    }
+  }
+  printf(" Aggregate (CPU)  %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c Overhead: %.3f ms\n",
+         sep, results.avgTotalBandwidthGbPerSec,
+         sep, results.avgTotalDurationMsec,
+         sep, results.totalBytesTransferred,
+         sep, results.overheadMsec);
+}
+// Report a single ErrResult: warnings are printed and execution continues, fatal
+// errors are printed and terminate the program with status 1.  ERR_NONE and any
+// other error type produce no output.
+void CheckForError(ErrResult const& error)
+{
+  auto const kind = error.errType;
+  if (kind == ERR_NONE) return;
+  if (kind == ERR_WARN) {
+    printf("[WARN] %s\n", error.errMsg.c_str());
+    return;
+  }
+  if (kind == ERR_FATAL) {
+    printf("[ERROR] %s\n", error.errMsg.c_str());
+    exit(1);
+  }
+}
+// Print every entry of an error list, tagging fatal entries as ERROR and all
+// others as WARN.  After the whole list has been printed, exit with status 1 if
+// any entry was fatal.
+void PrintErrors(std::vector<ErrResult> const& errors)
+{
+  bool sawFatal = false;
+  for (size_t pos = 0; pos < errors.size(); ++pos) {
+    ErrResult const& entry = errors[pos];
+    bool const fatal = (entry.errType == ERR_FATAL);
+    char const* tag = fatal ? "ERROR" : "WARN";
+    printf("[%s] %s\n", tag, entry.errMsg.c_str());
+    if (fatal) sawFatal = true;
+  }
+  if (sawFatal) exit(1);
+}
--- a/src/client/Client.hpp
+++ b/src/client/Client.hpp
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#pragma once
+// TransferBench client version
+#define CLIENT_VERSION "1.54.00"
+#include "TransferBench.hpp"
+#include "EnvVars.hpp"
+// Number of bytes per Transfer used when no size is given on the command line (1<<26 = 64 MiB)
+size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<26);
+// Three-character executor labels used when printing results; indexed by ExeType
+char const ExeTypeName[4][4] = {"CPU", "GPU", "DMA", "IBV"};
+// Display detected hardware
+void DisplayTopology(bool outputToCsv);
+// Display usage instructions
+// cmdName is the program name printed in the "Usage:" line
+void DisplayUsage(char const* cmdName);
+// Print TransferBench test results
+// Prints one line per executor and per Transfer, then an aggregate line
+void PrintResults(EnvVars const& ev, int const testNum,
+                  std::vector<Transfer> const& transfers,
+                  TransferBench::TestResults const& results);
+// Helper function that converts MemDevices to a string
+// Returns "N" when the list is empty
+std::string MemDevicesToStr(std::vector<MemDevice> const& memDevices);
+// Helper function to print warning / exit on fatal error
+void CheckForError(ErrResult const& error);
+// Helper function to print list of errors
+// Exits with status 1 after printing if any error in the list is fatal
+void PrintErrors(std::vector<ErrResult> const& errors);
--- a/src/client/EnvVars.hpp
+++ b/src/client/EnvVars.hpp
--- a/src/client/Presets/AllToAll.hpp
+++ b/src/client/Presets/AllToAll.hpp
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#include "EnvVars.hpp"
+// All-to-all preset: builds one Transfer for each selected (source GPU, destination GPU)
+// pair, runs them all in parallel, then prints the per-Transfer results and a
+// SRC x DST bandwidth summary table.
+//   ev                  : client settings; useSingleStream is forced to 1 and gfxUnroll
+//                         is taken from GFX_UNROLL (default 2)
+//   numBytesPerTransfer : number of bytes assigned to every Transfer
+//   presetName          : name of the preset (not used by this function)
+// Preset-specific env vars: A2A_DIRECT, A2A_LOCAL, A2A_MODE, NUM_GPU_DEVICES,
+// NUM_SUB_EXEC, USE_DMA_EXEC, USE_FINE_GRAIN, USE_REMOTE_READ.
+// Exits with status 1 on invalid settings or if the Transfers fail to run.
+void AllToAllPreset(EnvVars&           ev,
+                    size_t      const  numBytesPerTransfer,
+                    std::string const  presetName)
+{
+  enum
+  {
+    A2A_COPY       = 0,
+    A2A_READ_ONLY  = 1,
+    A2A_WRITE_ONLY = 2
+  };
+  char a2aModeStr[3][20] = {"Copy", "Read-Only", "Write-Only"};
+  // Force single-stream mode for all-to-all benchmark
+  ev.useSingleStream = 1;
+  // Force to gfx unroll 2 unless explicitly set
+  ev.gfxUnroll      = EnvVars::GetEnvVar("GFX_UNROLL", 2);
+  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
+  // Collect env vars for this preset
+  int a2aDirect     = EnvVars::GetEnvVar("A2A_DIRECT"     , 1);
+  int a2aLocal      = EnvVars::GetEnvVar("A2A_LOCAL"      , 0);
+  int a2aMode       = EnvVars::GetEnvVar("A2A_MODE"       , 0);
+  int numGpus       = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
+  int numSubExecs   = EnvVars::GetEnvVar("NUM_SUB_EXEC"   , 8);
+  int useDmaExec    = EnvVars::GetEnvVar("USE_DMA_EXEC"   , 0);
+  int useFineGrain  = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);
+  int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
+  // Validate a2aMode before it is used to index a2aModeStr below
+  if (a2aMode < 0 || a2aMode > 2) {
+    printf("[ERROR] a2aMode must be between 0 and 2\n");
+    exit(1);
+  }
+  // Print off environment variables
+  ev.DisplayEnvVars();
+  if (!ev.hideEnv) {
+    if (!ev.outputToCsv) printf("[AllToAll Related]\n");
+    ev.Print("A2A_DIRECT"     , a2aDirect    , a2aDirect ? "Only using direct links" : "Full all-to-all");
+    ev.Print("A2A_LOCAL"      , a2aLocal     , "%s local transfers", a2aLocal ? "Include" : "Exclude");
+    ev.Print("A2A_MODE"       , a2aMode      , a2aModeStr[a2aMode]);
+    ev.Print("NUM_GPU_DEVICES", numGpus      , "Using %d GPUs", numGpus);
+    ev.Print("NUM_SUB_EXEC"   , numSubExecs  , "Using %d subexecutors/CUs per Transfer", numSubExecs);
+    ev.Print("USE_DMA_EXEC"   , useDmaExec   , "Using %s executor", useDmaExec ? "DMA" : "GFX");
+    ev.Print("USE_FINE_GRAIN" , useFineGrain , "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
+    ev.Print("USE_REMOTE_READ", useRemoteRead, "Using %s as executor", useRemoteRead ? "DST" : "SRC");
+    printf("\n");
+  }
+  // Validate remaining env vars
+  if (numGpus < 0 || numGpus > numDetectedGpus) {
+    printf("[ERROR] Cannot use %d GPUs.  Detected %d GPUs\n", numGpus, numDetectedGpus);
+    exit(1);
+  }
+  // Read-only mode has no destination and write-only mode has no source
+  int const numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1);
+  int const numDsts = (a2aMode == A2A_READ_ONLY  ? 0 : 1);
+  MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
+  ExeType exeType = useDmaExec ? EXE_GPU_DMA : EXE_GPU_GFX;
+  // Maps a (src GPU, dst GPU) pair to the index of its Transfer within 'transfers'
+  std::map<std::pair<int, int>, int> reIndex;
+  std::vector<Transfer> transfers;
+  for (int i = 0; i < numGpus; i++) {
+    for (int j = 0; j < numGpus; j++) {
+      // Check whether or not to execute this pair
+      if (i == j) {
+        if (!a2aLocal) continue;
+      } else if (a2aDirect) {
+#if !defined(__NVCC__)
+        // Skip pairs that are not exactly one hop apart
+        uint32_t linkType, hopCount;
+        HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
+        if (hopCount != 1) continue;
+#endif
+      }
+      // Build Transfer and add it to list
+      TransferBench::Transfer transfer;
+      transfer.numBytes = numBytesPerTransfer;
+      if (numSrcs) transfer.srcs.push_back({memType, i});
+      if (numDsts) transfer.dsts.push_back({memType, j});
+      transfer.exeDevice = {exeType, (useRemoteRead ? j : i)};
+      transfer.exeSubIndex = -1;
+      transfer.numSubExecs = numSubExecs;
+      reIndex[std::make_pair(i,j)] = transfers.size();
+      transfers.push_back(transfer);
+    }
+  }
+  printf("GPU-GFX All-To-All benchmark:\n");
+  printf("==========================\n");
+  printf("- Copying %zu bytes between %s pairs of GPUs using %d CUs (%zu Transfers)\n",
+         numBytesPerTransfer, a2aDirect ? "directly connected" : "all", numSubExecs, transfers.size());
+  if (transfers.size() == 0) return;
+  // Execute Transfers
+  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
+  TransferBench::TestResults results;
+  if (!TransferBench::RunTransfers(cfg, transfers, results)) {
+    for (auto const& err : results.errResults)
+      printf("%s\n", err.errMsg.c_str());
+    // A failed run is an error, so report it through a non-zero exit status
+    exit(1);
+  } else {
+    PrintResults(ev, 1, transfers, results);
+  }
+  // Print results
+  char separator = (ev.outputToCsv ? ',' : ' ');
+  printf("\nSummary: [%zu bytes per Transfer]\n", numBytesPerTransfer);
+  printf("==========================================================\n");
+  printf("SRC\\DST ");
+  for (int dst = 0; dst < numGpus; dst++)
+    printf("%cGPU %02d    ", separator, dst);
+  printf("   %cSTotal     %cActual\n", separator, separator);
+  double totalBandwidthGpu = 0.0;
+  double minExecutorBandwidth = std::numeric_limits<double>::max();
+  double maxExecutorBandwidth = 0.0;
+  // One running total per destination column, plus a final slot for the grand total
+  std::vector<double> colTotalBandwidth(numGpus+1, 0.0);
+  for (int src = 0; src < numGpus; src++) {
+    double rowTotalBandwidth = 0;
+    double executorBandwidth = 0;
+    printf("GPU %02d", src);
+    for (int dst = 0; dst < numGpus; dst++) {
+      if (reIndex.count(std::make_pair(src, dst))) {
+        int const transferIdx = reIndex[std::make_pair(src,dst)];
+        TransferBench::TransferResult const& r = results.tfrResults[transferIdx];
+        colTotalBandwidth[dst]  += r.avgBandwidthGbPerSec;
+        rowTotalBandwidth       += r.avgBandwidthGbPerSec;
+        totalBandwidthGpu       += r.avgBandwidthGbPerSec;
+        executorBandwidth        = std::max(executorBandwidth,
+                                            results.exeResults[transfers[transferIdx].exeDevice].avgBandwidthGbPerSec);
+        printf("%c%8.3f  ", separator, r.avgBandwidthGbPerSec);
+      } else {
+        printf("%c%8s  ", separator, "N/A");
+      }
+    }
+    printf("   %c%8.3f   %c%8.3f\n", separator, rowTotalBandwidth, separator, executorBandwidth);
+    minExecutorBandwidth = std::min(minExecutorBandwidth, executorBandwidth);
+    maxExecutorBandwidth = std::max(maxExecutorBandwidth, executorBandwidth);
+    colTotalBandwidth[numGpus] += rowTotalBandwidth;
+  }
+  printf("\nRTotal");
+  for (int dst = 0; dst < numGpus; dst++) {
+    printf("%c%8.3f  ", separator, colTotalBandwidth[dst]);
+  }
+  printf("   %c%8.3f   %c%8.3f   %c%8.3f\n", separator, colTotalBandwidth[numGpus],
+         separator, minExecutorBandwidth, separator, maxExecutorBandwidth);
+  printf("\n");
+  printf("Average   bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu / transfers.size());
+  printf("Aggregate bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu);
+  printf("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
+  PrintErrors(results.errResults);
+}
--- a/src/client/Presets/HealthCheck.hpp
+++ b/src/client/Presets/HealthCheck.hpp
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+// Health-check preset: runs a fixed series of bandwidth tests and prints PASS / FAIL for each.
+//
+// Tests, in order (every Transfer moves 64 MiB and is executed by EXE_GPU_GFX):
+//   1. Unidirectional read  : each GPU reads from its closest CPU NUMA node     (limit: LIMIT_UDIR)
+//   2. Unidirectional write : each GPU writes to its closest CPU NUMA node      (limit: LIMIT_UDIR)
+//   3. Bidirectional        : (1) and (2) run together, bandwidths are summed   (limit: LIMIT_BDIR)
+//   4. All-to-all           : every ordered GPU pair at once, MEM_GPU_FINE      (limit: LIMIT_A2A)
+// For tests 1-3 the number of subexecutors is swept from 7 to 10 and a GPU passes as soon as
+// one setting reaches the limit.  Limits may be overridden via the environment variables above.
+//
+// Parameters:
+//   ev                  - Environment settings; modified in place (useSingleStream is forced
+//                         to 1, gfxUnroll is set to 4 for the CPU tests and 2 for all-to-all)
+//   numBytesPerTransfer - Not referenced by this preset
+//   presetName          - Not referenced by this preset
+//
+// When compiled with NVCC this prints a warning and returns.  Otherwise this function does not
+// return: it exits with status 1 on unsupported hardware or if any test fails, and 0 otherwise.
+void HealthCheckPreset(EnvVars&           ev,
+                       size_t      const  numBytesPerTransfer,
+                       std::string const  presetName)
+{
+  // Check for supported platforms
+#if defined(__NVCC__)
+  printf("[WARN] healthcheck preset not supported on NVIDIA hardware\n");
+  return;
+#endif
+  // Size used by every Transfer in this preset
+  size_t const healthCheckBytes = 64*1024*1024;
+  bool hasFail = false;
+  // Force use of single stream
+  ev.useSingleStream = 1;
+  TransferBench::TestResults results;
+  int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
+  if (numGpuDevices != 8) {
+    printf("[WARN] healthcheck preset is currently only supported on 8-GPU MI300X hardware\n");
+    exit(1);
+  }
+  for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
+    hipDeviceProp_t prop;
+    HIP_CALL(hipGetDeviceProperties(&prop, gpuId));
+    // gcnArchName may carry ':'-separated suffixes; only the leading architecture name is compared
+    std::string fullName = prop.gcnArchName;
+    std::string archName = fullName.substr(0, fullName.find(':'));
+    if (!(archName == "gfx940" || archName == "gfx941" || archName == "gfx942"))
+    {
+      printf("[WARN] healthcheck preset is currently only supported on 8-GPU MI300X hardware\n");
+      exit(1);
+    }
+  }
+  // Pass limits: environment override if set, otherwise 95% of the nominal value truncated to an integer
+  auto getLimit = [](char const* envName, int nominalGbPerSec) -> double {
+    char const* envValue = getenv(envName);
+    return envValue ? atof(envValue) : (int)(nominalGbPerSec * 0.95);
+  };
+  double udirLimit = getLimit("LIMIT_UDIR", 48);
+  double bdirLimit = getLimit("LIMIT_BDIR", 96);
+  double a2aLimit  = getLimit("LIMIT_A2A",  45);
+  // Looks up the CPU NUMA node closest to a GPU, terminating if it cannot be determined
+  auto getClosestNuma = [](int gpuId) -> int {
+    int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
+    if (memIndex == -1) {
+      printf("[ERROR] Unable to detect closest CPU NUMA node to GPU %d\n", gpuId);
+      exit(1);
+    }
+    return memIndex;
+  };
+  // Prints PASS, or FAIL followed by one line per failing GPU (and records the failure)
+  auto reportPerGpu = [&hasFail](std::vector<std::pair<int, double>> const& fails, double limit) {
+    if (fails.empty()) {
+      printf("PASS\n");
+      return;
+    }
+    hasFail = true;
+    printf("FAIL (%zu test(s))\n", fails.size());
+    for (auto const& p : fails) {
+      printf(" GPU %02d: Measured: %6.2f GB/s      Criteria: %6.2f GB/s\n", p.first, p.second, limit);
+    }
+  };
+  // Runs one unidirectional CPU<->GPU test per GPU (read from CPU if readFromCpu, else write to CPU)
+  auto runUnidirectional = [&](bool readFromCpu) {
+    ev.gfxUnroll = 4;
+    TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
+    std::vector<std::pair<int, double>> fails;
+    for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
+      printf("."); fflush(stdout);
+      int memIndex = getClosestNuma(gpuId);
+      std::vector<Transfer> transfers(1);
+      Transfer& t = transfers[0];
+      t.exeDevice = {EXE_GPU_GFX, gpuId};
+      t.numBytes  = healthCheckBytes;
+      if (readFromCpu) {
+        t.srcs    = {{MEM_CPU, memIndex}};
+        t.dsts    = {};
+      } else {
+        t.srcs    = {};
+        t.dsts    = {{MEM_CPU, memIndex}};
+      }
+      // Loop over number of CUs to use
+      bool passed = false;
+      double bestResult = 0;
+      for (int cu = 7; cu <= 10; cu++) {
+        t.numSubExecs = cu;
+        if (!TransferBench::RunTransfers(cfg, transfers, results)) {
+          // Results are only meaningful after a successful run, so do not inspect them here
+          PrintErrors(results.errResults);
+          continue;
+        }
+        double bandwidth = results.tfrResults[0].avgBandwidthGbPerSec;
+        bestResult = std::max(bestResult, bandwidth);
+        if (bandwidth >= udirLimit) {
+          passed = true;
+          break;
+        }
+      }
+      if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
+    }
+    reportPerGpu(fails, udirLimit);
+  };
+  // Run CPU to GPU
+  // Run unidirectional read from CPU to GPU
+  printf("Testing unidirectional reads from CPU ");
+  runUnidirectional(true);
+  // Run unidirectional write from GPU to CPU
+  printf("Testing unidirectional writes to  CPU ");
+  runUnidirectional(false);
+  // Run bidirectional tests
+  printf("Testing bidirectional  reads + writes ");
+  {
+    ev.gfxUnroll = 4;
+    TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
+    std::vector<std::pair<int, double>> fails;
+    for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
+      printf("."); fflush(stdout);
+      int memIndex = getClosestNuma(gpuId);
+      std::vector<Transfer> transfers(2);
+      Transfer& t0 = transfers[0];
+      Transfer& t1 = transfers[1];
+      t0.exeDevice = {EXE_GPU_GFX, gpuId};
+      t0.numBytes  = healthCheckBytes;
+      t0.srcs      = {{MEM_CPU, memIndex}};
+      t0.dsts      = {};
+      t1.exeDevice = {EXE_GPU_GFX, gpuId};
+      t1.numBytes  = healthCheckBytes;
+      t1.srcs      = {};
+      t1.dsts      = {{MEM_CPU, memIndex}};
+      // Loop over number of CUs to use
+      bool passed = false;
+      double bestResult = 0;
+      for (int cu = 7; cu <= 10; cu++) {
+        t0.numSubExecs = cu;
+        t1.numSubExecs = cu;
+        if (!TransferBench::RunTransfers(cfg, transfers, results)) {
+          PrintErrors(results.errResults);
+          continue;
+        }
+        // Pass criteria applies to the combined read + write bandwidth
+        double sum = (results.tfrResults[0].avgBandwidthGbPerSec +
+                      results.tfrResults[1].avgBandwidthGbPerSec);
+        bestResult = std::max(bestResult, sum);
+        if (sum >= bdirLimit) {
+          passed = true;
+          break;
+        }
+      }
+      if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
+    }
+    reportPerGpu(fails, bdirLimit);
+  }
+  // Run XGMI tests:
+  printf("Testing all-to-all XGMI copies        "); fflush(stdout);
+  {
+    // Force GFX unroll to 2 for MI300
+    ev.gfxUnroll = 2;
+    TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
+    // One Transfer per ordered (src GPU, dst GPU) pair, executed by the source GPU
+    std::vector<Transfer> transfers;
+    for (int i = 0; i < numGpuDevices; i++) {
+      for (int j = 0; j < numGpuDevices; j++) {
+        if (i == j) continue;
+        Transfer t;
+        t.numBytes    = healthCheckBytes;
+        t.numSubExecs = 8;
+        t.exeDevice   = {EXE_GPU_GFX, i};
+        t.srcs        = {{MEM_GPU_FINE, i}};
+        t.dsts        = {{MEM_GPU_FINE, j}};
+        transfers.push_back(t);
+      }
+    }
+    std::vector<std::pair<std::pair<int,int>, double>> fails;
+    bool const ranOk = TransferBench::RunTransfers(cfg, transfers, results);
+    if (ranOk) {
+      // Results are in the same order the Transfers were created above
+      int transferIdx = 0;
+      for (int i = 0; i < numGpuDevices; i++) {
+        printf("."); fflush(stdout);
+        for (int j = 0; j < numGpuDevices; j++) {
+          if (i == j) continue;
+          double bw = results.tfrResults[transferIdx].avgBandwidthGbPerSec;
+          if (bw < a2aLimit) {
+            fails.push_back(std::make_pair(std::make_pair(i,j), bw));
+          }
+          transferIdx++;
+        }
+      }
+    } else {
+      PrintErrors(results.errResults);
+    }
+    if (ranOk && fails.empty()) {
+      printf("PASS\n");
+    } else {
+      hasFail = true;
+      if (!ranOk) {
+        // A run that could not execute must never be reported as a pass
+        printf("FAIL (unable to execute transfers)\n");
+      } else {
+        printf("FAIL (%zu test(s))\n", fails.size());
+        for (auto const& p : fails) {
+          printf(" GPU %02d to GPU %02d: %6.2f GB/s      Criteria: %6.2f GB/s\n", p.first.first, p.first.second, p.second, a2aLimit);
+        }
+      }
+    }
+  }
+  exit(hasFail ? 1 : 0);
+}
--- a/src/client/Presets/OneToAll.hpp
+++ b/src/client/Presets/OneToAll.hpp
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+// One-to-all preset: sweeps over subsets of peer GPUs, running one Transfer per selected peer
+// in parallel, all executed on a single GPU (EXE_INDEX), and prints per-peer bandwidth.
+//
+// Parameters:
+//   ev                  - Environment settings (displayed, then converted to ConfigOptions)
+//   numBytesPerTransfer - Number of bytes assigned to every Transfer
+//   presetName          - Not referenced by this preset
+//
+// Preset-specific environment variables:
+//   NUM_GPU_DEVICES - Number of GPUs considered (default: number of detected GPUs)
+//   NUM_GPU_SE      - Subexecutors per Transfer (default: 4)
+//   EXE_INDEX       - Index of the executing GPU (default: 0)
+//   SWEEP_DIR       - 0: SRC on the executing GPU, DST on each peer.  Otherwise the reverse
+//   SWEEP_SRC / SWEEP_EXE / SWEEP_DST - Characters to sweep for SRC memory type, executor
+//                     type ('G' or 'D' only) and DST memory type.  'N' means no SRC / DST
+//   SWEEP_MIN / SWEEP_MAX - Range of number of simultaneous peers (default: 1 to NUM_GPU_DEVICES)
+//
+// Exits with status 1 on invalid settings or if any set of Transfers fails to run.
+void OneToAllPreset(EnvVars&           ev,
+                    size_t      const  numBytesPerTransfer,
+                    std::string const  presetName)
+{
+  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
+  if (numDetectedGpus < 2) {
+    printf("[ERROR] One-to-all benchmark requires machine with at least 2 GPUs\n");
+    exit(1);
+  }
+  // Collect env vars for this preset
+  int         numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
+  int         numSubExecs   = EnvVars::GetEnvVar("NUM_GPU_SE", 4);
+  int         exeIndex      = EnvVars::GetEnvVar("EXE_INDEX",   0);
+  int         sweepDir      = EnvVars::GetEnvVar("SWEEP_DIR",  0);
+  std::string sweepDst      = EnvVars::GetEnvVar("SWEEP_DST",  "G");
+  std::string sweepExe      = EnvVars::GetEnvVar("SWEEP_EXE",  "G");
+  std::string sweepSrc      = EnvVars::GetEnvVar("SWEEP_SRC",  "G");
+  int         sweepMin      = EnvVars::GetEnvVar("SWEEP_MIN",  1);
+  int         sweepMax      = EnvVars::GetEnvVar("SWEEP_MAX",  numGpuDevices);
+  // Display environment variables
+  ev.DisplayEnvVars();
+  if (!ev.hideEnv) {
+    if (!ev.outputToCsv) printf("[One-To-All Related]\n");
+    ev.Print("NUM_GPU_DEVICES", numGpuDevices,    "Using %d GPUs", numGpuDevices);
+    ev.Print("NUM_GPU_SE",      numSubExecs,      "Using %d subExecutors/CUs per Transfer", numSubExecs);
+    ev.Print("EXE_INDEX",       exeIndex,         "Executing on GPU %d", exeIndex);
+    ev.Print("SWEEP_DIR",       sweepDir,         "Direction of transfer");
+    ev.Print("SWEEP_DST",       sweepDst.c_str(), "DST memory types to sweep");
+    ev.Print("SWEEP_EXE",       sweepExe.c_str(), "Executor type to use");
+    ev.Print("SWEEP_MAX",       sweepMax,         "Maximum number of peers");
+    ev.Print("SWEEP_MIN",       sweepMin,         "Minimum number of peers");
+    ev.Print("SWEEP_SRC",       sweepSrc.c_str(), "SRC memory types to sweep");
+    printf("\n");
+  }
+  // Perform validation
+  for (auto ch : sweepExe) {
+    if (ch != 'G' && ch != 'D') {
+      printf("[ERROR] Unrecognized executor type '%c' specified\n", ch);
+      exit(1);
+    }
+  }
+  // Peer subsets are enumerated as bits of an int, so both values below are used as shift
+  // amounts (1<<numGpuDevices, 1<<exeIndex) and must stay within the range that is well-defined
+  int const maxMaskBits = (int)(8 * sizeof(int)) - 2;
+  if (numGpuDevices < 1 || numGpuDevices > maxMaskBits) {
+    printf("[ERROR] NUM_GPU_DEVICES must be between 1 and %d (got %d)\n", maxMaskBits, numGpuDevices);
+    exit(1);
+  }
+  if (exeIndex < 0 || exeIndex > maxMaskBits) {
+    printf("[ERROR] EXE_INDEX must be between 0 and %d (got %d)\n", maxMaskBits, exeIndex);
+    exit(1);
+  }
+  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
+  TransferBench::TestResults results;
+  char const sep = (ev.outputToCsv ? ',' : ' ');
+  int const exeBit = (1 << exeIndex);
+  std::string const exeIndexStr = std::to_string(exeIndex);
+  for (char src : sweepSrc) for (char exe : sweepExe) for (char dst : sweepDst) {
+    // Skip invalid configurations
+    if ((exe == 'D' && (src == 'N' || dst == 'N')) || (src == 'N' && dst == 'N')) continue;
+    // Index labels: the executing GPU's side shows its index, the swept side shows '*'
+    std::string const srcLabel = (src == 'N') ? "" : (sweepDir == 0 ? exeIndexStr : "*");
+    std::string const dstLabel = (dst == 'N') ? "" : (sweepDir == 0 ? "*" : exeIndexStr);
+    printf("Executing (%c%s -> %c%d -> %c%s)\n",
+           src, srcLabel.c_str(),
+           exe, exeIndex,
+           dst, dstLabel.c_str());
+    // Column header: one column per peer GPU (the executing GPU has no column)
+    int numColumns = 0;
+    for (int i = 0; i < numGpuDevices; i++) {
+      if (i == exeIndex) continue;
+      printf("   GPU %-3d  %c", i, sep);
+      numColumns++;
+    }
+    printf("\n");
+    if (!ev.outputToCsv) {
+      // Rule matches the number of columns actually printed above
+      for (int i = 0; i < numColumns; i++)
+        printf("-------------");
+      printf("\n");
+    }
+    for (int p = sweepMin; p <= sweepMax; p++) {
+      // Each set bit in bitmask selects one peer GPU; only subsets of exactly p peers are run
+      for (int bitmask = 0; bitmask < (1<<numGpuDevices); bitmask++) {
+        if ((bitmask & exeBit) || __builtin_popcount(bitmask) != p) continue;
+        std::vector<Transfer> transfers;
+        for (int i = 0; i < numGpuDevices; i++) {
+          if (!(bitmask & (1<<i))) continue;
+          Transfer t;
+          CheckForError(TransferBench::CharToExeType(exe, t.exeDevice.exeType));
+          t.exeDevice.exeIndex = exeIndex;
+          t.exeSubIndex = -1;
+          t.numSubExecs = numSubExecs;
+          t.numBytes    = numBytesPerTransfer;
+          if (src == 'N') {
+            t.srcs.clear();
+          } else {
+            t.srcs.resize(1);
+            CheckForError(TransferBench::CharToMemType(src, t.srcs[0].memType));
+            t.srcs[0].memIndex = sweepDir == 0 ? exeIndex : i;
+          }
+          if (dst == 'N') {
+            t.dsts.clear();
+          } else {
+            t.dsts.resize(1);
+            CheckForError(TransferBench::CharToMemType(dst, t.dsts[0].memType));
+            t.dsts[0].memIndex = sweepDir == 0 ? i : exeIndex;
+          }
+          transfers.push_back(t);
+        }
+        if (!TransferBench::RunTransfers(cfg, transfers, results)) {
+          PrintErrors(results.errResults);
+          exit(1);
+        }
+        // Results are in creation order, i.e. ascending peer index
+        int counter = 0;
+        for (int i = 0; i < numGpuDevices; i++) {
+          if (bitmask & (1<<i))
+            printf("  %8.3f  %c", results.tfrResults[counter++].avgBandwidthGbPerSec, sep);
+          else if (i != exeIndex)
+            printf("            %c", sep);
+        }
+        // Trailer: peer count, subexecutor count, then a description of each Transfer
+        printf(" %d %d", p, numSubExecs);
+        for (auto const& t : transfers) {
+          printf(" (%s %c%d %s)",
+                 MemDevicesToStr(t.srcs).c_str(),
+                 ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
+                 MemDevicesToStr(t.dsts).c_str());
+        }
+        printf("\n");
+      }
+    }
+  }
+}
--- a/src/client/Presets/PeerToPeer.hpp
+++ b/src/client/Presets/PeerToPeer.hpp
--- a/src/client/Presets/Presets.hpp
+++ b/src/client/Presets/Presets.hpp
--- a/src/client/Presets/Schmoo.hpp
+++ b/src/client/Presets/Schmoo.hpp
--- a/src/client/Presets/Sweep.hpp
+++ b/src/client/Presets/Sweep.hpp
--- a/src/client/Topology.hpp
+++ b/src/client/Topology.hpp
--- a/src/header/TransferBench.hpp
+++ b/src/header/TransferBench.hpp