Commit 09f4f11b authored by Oliveira, Daniel's avatar Oliveira, Daniel
Browse files

impr: Library/Client build organization



Change code organization and build options

Code changes related to the following:
  * Build files
    * Options to build client, shared, and static libraries
  * Source code directories
  * Modern C++20 changes
  * Based on TB 1.6.4
  * Formatting
Signed-off-by: default avatarOliveira, Daniel <daniel.oliveira@amd.com>
parent 2d0ecaae
---
## Refer to the following link for the explanation of each parameter:
# See options here:
# https://clang.llvm.org/docs/ClangFormatStyleOptions.html
# Last update: Tue Jun 28 06:26:41 PM CDT 2022 (daniel.oliveira@amd.com / dmitrii.galantsev@amd.com)
#
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: 4
IndentAccessModifiers: true
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: true
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortFunctionsOnASingleLine: true
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
# This is deprecated
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: Yes
BinPackArguments: false
BinPackParameters: false
InsertBraces: true
#BreakBeforeBraces: Attach
BreakBeforeBraces: Custom
BraceWrapping:
AfterClass: true
AfterControlStatement: false
AfterEnum: true
AfterFunction: true
AfterNamespace: true
AfterObjCDeclaration: false
AfterStruct: true
AfterUnion: true
AfterExternBlock: true
BeforeCatch: false
BeforeElse: false
IndentBraces: false
# disabling the below splits, else, they'll just add to the vertical length of source files!
SplitEmptyFunction: false
SplitEmptyRecord: false
SplitEmptyNamespace: false
BreakBeforeBinaryOperators: None
BreakBeforeInheritanceComma: false
BreakInheritanceList: BeforeColon
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 100
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: true
# Kept the below 4 to be the same as `IndentWidth` to keep everything uniform
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
PointerAlignment: Left
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
- foreach
- Q_FOREACH
- BOOST_FOREACH
IncludeBlocks: Preserve
IncludeCategories:
- Regex: '^<ext/.*\.h>'
Priority: 2
- Regex: '^<.*\.h>'
Priority: 1
- Regex: '^<.*'
Priority: 2
- Regex: '.*'
Priority: 3
IncludeIsMainRegex: '([-_](test|unittest))?$'
IndentCaseLabels: true
IndentPPDirectives: None
IndentWidth: 4
UseTab: Never
# Be consistent with indent-width, even for people who use tab for indentation!
TabWidth: 4
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 4
NamespaceIndentation: None
ObjCBinPackProtocolList: Never
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PenaltyBreakAssignment: 2100
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
RawStringFormats:
- Language: Cpp
Delimiters:
- cc
- CC
- cpp
- Cpp
- CPP
- 'c++'
- 'C++'
CanonicalDelimiter: ''
- Language: TextProto
Delimiters:
- pb
- PB
- proto
- PROTO
EnclosingFunctions:
- EqualsProto
- EquivToProto
- PARSE_PARTIAL_TEXT_PROTO
- PARSE_TEST_PROTO
- PARSE_TEXT_PROTO
- ParseTextOrDie
- ParseTextProtoOrDie
CanonicalDelimiter: ''
BasedOnStyle: google
# NOTE(review): this warning says comment reflow mangles doxygen comment formatting,
# yet ReflowComments is enabled on the next line — confirm which is intended.
ReflowComments: true
SortIncludes: true
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 4
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
StatementMacros:
- Q_UNUSED
- QT_REQUIRE_VERSION
##
# Good defaults
# Checks:
# -*, bugprone-*,cert-dcl21-cpp,cert-dcl50-cpp,cert-env33-c,cert-err34-c,cert-err52-cpp,cert-err60-cpp,cert-flp30-c,cert-msc50-cpp,cert-msc51-cpp,cppcoreguidelines-*,-cppcoreguidelines-macro-usage,-cppcoreguidelines-pro-type-reinterpret-cast,-cppcoreguidelines-pro-type-union-access,-cppcoreguidelines-pro-bounds-pointer-arithmetic,-cppcoreguidelines-pro-type-vararg,google-build-using-namespace,google-explicit-constructor,google-global-names-in-headers,google-readability-casting,google-runtime-int,google-runtime-operator,hicpp-*,-hicpp-vararg,misc-*,modernize-*,performance-*,readability-*,-readability-named-parameter
Checks:
-*,
bugprone-*,
clang-analyzer*,
google-*,
hicpp-*,
misc-*,
modernize-*,
performance-*,
readability-*
...@@ -3,6 +3,40 @@ ...@@ -3,6 +3,40 @@
Documentation for TransferBench is available at Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench). [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.64.00
### Added
- Added BLOCKSIZES to a2asweep preset to also allow sweeping over threadblock sizes
- Added FILL_COMPRESS to allow more control over input data pattern
- FILL_COMPRESS takes in a comma-separated list of integer percentages (that must add up to 100)
that sets the percentages of 64B lines to be filled by random/1B0/2B0/4B0/32B0 data patterns
- Bins:
- 0 - random
- 1 - 1B0 upper 1 byte of each aligned 2 bytes is 0
- 2 - 2B0 upper 2 bytes of each aligned 4 bytes is 0
- 3 - 4B0 upper 4 bytes of each aligned 8 bytes is 0
- 4 - 32B0 upper 32 bytes of each aligned 64-byte line are 0
- FILL_PATTERN will be ignored if FILL_COMPRESS is specified
- Additional details about data patterns generated will be printed if the debug env var DUMP_LINES is
set to a non-zero value, which also corresponds to how many 64 byte lines will be printed
### Modified
- Increased GFX_BLOCKSIZE limit from 512 to 1024 (still requires multiple of 64)
### Fixed
- Fixed bug when using BYTE_OFFSET
## v1.63.00
### Added
- Added `gfx950`, `gfx1150`, and `gfx1151` to default GPU targets list in CMake builds
### Modified
- Removing self-GPU check for DMA engine copies
- Switched to amdclang++ as primary compiler
- healthcheck preset adds HBM testing and support for more MI3XX variants
### Fixed
- Fixed issue when using "P" memory type and specific DMA subengines
- Fixed issue with subiteration timing reports
## v1.62.00 ## v1.62.00
### Added ### Added
- Adding GFX_TEMPORAL to allow for use for use of non-temporal loads/stores - Adding GFX_TEMPORAL to allow for use for use of non-temporal loads/stores
......
# Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. # MIT License
#
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
#
if (DEFINED ENV{ROCM_PATH}) #
set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE STRING "ROCm install directory") cmake_minimum_required(VERSION 3.25)
else()
set(ROCM_PATH "/opt/rocm" CACHE STRING "ROCm install directory") #
endif() # Find git and set the GIT_EXECUTABLE variable (good practice, used by submodules)
cmake_minimum_required(VERSION 3.5) find_package(Git QUIET)
if(NOT GIT_FOUND)
project(TransferBench VERSION 1.62.00 LANGUAGES CXX) message(FATAL_ERROR ">> Git is required to build this project. 'git' not found! ")
# Default GPU architectures to build
#==================================================================================================
set(DEFAULT_GPUS
gfx906
gfx908
gfx90a
gfx942
gfx1030
gfx1100
gfx1101
gfx1102
gfx1200
gfx1201)
# Build only for local GPU architecture
if (BUILD_LOCAL_GPU_TARGET_ONLY)
message(STATUS "Building only for local GPU target")
if (COMMAND rocm_local_targets)
rocm_local_targets(DEFAULT_GPUS)
else()
message(WARNING "Unable to determine local GPU targets. Falling back to default GPUs.")
endif()
endif() endif()
set(CMAKE_GIT_EXECUTABLE ${GIT_EXECUTABLE} CACHE FILEPATH "Path to the Git executable.")
#
# Flag to enable / disable verbose output.
set(CMAKE_VERBOSE_MAKEFILE ON)
#
# Export compile commands for linters and auto-completers
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
#
# C++ standard settings
set(CMAKE_CXX_STANDARD 20 CACHE STRING "C++ Standard in use")
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_INCLUDE_DIRECTORIES_BEFORE ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
#
# Project specific directories
set(AMD_PROJECT_BASE_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} CACHE STRING "Base directory for this project" FORCE)
set(AMD_PROJECT_CMAKE_DIRECTORY ${AMD_PROJECT_BASE_DIRECTORY}/cmake)
set(AMD_PROJECT_CMAKE_MODULES_DIRECTORY ${AMD_PROJECT_BASE_DIRECTORY}/cmake/modules)
list(APPEND CMAKE_MODULE_PATH "${AMD_PROJECT_CMAKE_MODULES_DIRECTORY}")
#
# Build options
option(TRANSFERBENCH_ENGINE_HEADER_ONLY "Make TB engine header-only available (interface target)" OFF)
option(TRANSFERBENCH_ENGINE_STATIC "Build TB static library" ON)
option(TRANSFERBENCH_ENGINE_SHARED "Build TB shared library" OFF)
option(TRANSFERBENCH_CLIENT "Build TransferBench client" ON)
option(TRANSFERBENCH_TREAT_WARNINGS_AS_ERRORS "Treat default warnings as errors" ON)
option(TRANSFERBENCH_COMPRESS_DEBUG_INFO "Compressed debug information" ON)
option(TRANSFERBENCH_LOCAL_GPU_TARGET_ONLY "Build only for GPUs detected on this machine" OFF)
option(TRANSFERBENCH_ENABLE_NIC_EXEC "Enable RDMA NIC Executor in TransferBench" OFF)
option(TRANSFERBENCH_HARDENING_ENABLED "Build the project with hardening flags" ON)
option(TRANSFERBENCH_LINKER_TRY_MOLD "Give preference to 'Mold' linker (faster) if available" ON)
option(TRANSFERBENCH_ENABLE_CPPCHECK_WARNINGS "CppCheck static analysis warnings (for Developers)" ON)
# Determine which GPU architectures to build for
set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.")
# Check if clang compiler can offload to GPU_TARGETS #
if (COMMAND rocm_check_target_ids) # Setup build utils
message(STATUS "Checking for ROCm support for GPU targets: " "${GPU_TARGETS}") include(${AMD_PROJECT_CMAKE_DIRECTORY}/build_utils.cmake)
rocm_check_target_ids(SUPPORTED_GPUS TARGETS ${GPU_TARGETS})
#
# CMake project info
set(AMD_PROJECT_NAME "TransferBench")
set(AMD_PROJECT_PACKAGE_NAME "transferbench")
set(AMD_PROJECT_LIBRARY_NAME "amd_tb")
set(AMD_PROJECT_DESCRIPTION "TransferBench utility")
set(AMD_PROJECT_AUTHOR_ORGANIZATION "AMD, Inc.")
set(AMD_PROJECT_GITHUB_REPO "https://github.com/ROCm/TransferBench")
set(AMD_PROJECT_AUTHOR_DOMAIN "https://www.amd.com")
set(AMD_PROJECT_VERSION_MAJOR "")
set(AMD_PROJECT_VERSION_MINOR "")
set(AMD_PROJECT_VERSION_PATCH "")
#
# Specify name of project to build, and validate requirements
setup_build_version(PROJECT_TARGET_VERSION PROJECT_TARGET_VERSION_TEXT)
set_variable_in_parent(PROJECT_TARGET_BINARY_VERSION ${PROJECT_TARGET_VERSION})
setup_cmake(AMD_PROJECT_NAME PROJECT_TARGET_BINARY_VERSION)
if(PROJECT_TARGET_VERSION AND ((AMD_PROJECT_VERSION_MAJOR STREQUAL "") OR (AMD_PROJECT_VERSION_MINOR STREQUAL "") OR (AMD_PROJECT_VERSION_PATCH STREQUAL "")))
string(REGEX REPLACE "([0-9]+)\\.([0-9]+)\\.([0-9]+)" "\\1" AMD_PROJECT_VERSION_MAJOR ${PROJECT_TARGET_VERSION})
string(REGEX REPLACE "([0-9]+)\\.([0-9]+)\\.([0-9]+)" "\\2" AMD_PROJECT_VERSION_MINOR ${PROJECT_TARGET_VERSION})
string(REGEX REPLACE "([0-9]+)\\.([0-9]+)\\.([0-9]+)" "\\3" AMD_PROJECT_VERSION_PATCH ${PROJECT_TARGET_VERSION})
else() else()
message(WARNING "Unable to check for supported GPU targets. Falling back to default GPUs.") message(FATAL_ERROR ">> 'PROJECT_TARGET_VERSION' was not properly set!")
set(SUPPORTED_GPUS ${DEFAULT_GPUS})
endif() endif()
set(AMD_PROJECT_VERSION_FULL "${AMD_PROJECT_VERSION_MAJOR}.${AMD_PROJECT_VERSION_MINOR}.${AMD_PROJECT_VERSION_PATCH}")
#
# Project information
project(${AMD_PROJECT_NAME}
VERSION ${PROJECT_TARGET_VERSION_TEXT}
LANGUAGES CXX
DESCRIPTION ${AMD_PROJECT_DESCRIPTION}
HOMEPAGE_URL ${AMD_PROJECT_GITHUB_REPO}
)
set(COMPILING_TARGETS "${SUPPORTED_GPUS}" CACHE STRING "GPU targets to compile for.") if(CMAKE_SOURCE_DIR STREQUAL PROJECT_SOURCE_DIR)
message(STATUS "Compiling for ${COMPILING_TARGETS}") set(TRANSFERBENCH_IS_TOP_LEVEL TRUE)
foreach(target ${COMPILING_TARGETS})
list(APPEND static_link_flags --offload-arch=${target})
endforeach()
list(JOIN static_link_flags " " flags_str)
set( CMAKE_CXX_FLAGS "${flags_str} ${CMAKE_CXX_FLAGS}")
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib")
include_directories(${ROCM_PATH}/include)
find_library(IBVERBS_LIBRARY ibverbs)
find_path(IBVERBS_INCLUDE_DIR infiniband/verbs.h)
if (DEFINED ENV{DISABLE_NIC_EXEC})
message(STATUS "Disabling NIC Executor support")
elseif(IBVERBS_LIBRARY AND IBVERBS_INCLUDE_DIR)
message(STATUS "Found ibverbs: ${IBVERBS_LIBRARY}. Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable")
add_definitions(-DNIC_EXEC_ENABLED)
link_libraries(ibverbs)
else() else()
if (NOT IBVERBS_LIBRARY) set(TRANSFERBENCH_IS_TOP_LEVEL FALSE)
message(WARNING "IBVerbs library not found")
elseif (NOT IBVERBS_INCLUDE_DIR)
message(WARNING "infiniband/verbs.h not found")
endif()
message(WARNING "Building without NIC executor support. To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed")
endif() endif()
link_libraries(numa hsa-runtime64 pthread)
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY .)
add_executable(TransferBench src/client/Client.cpp)
target_include_directories(TransferBench PRIVATE src/header src/client src/client/Presets)
find_package(ROCM 0.8 REQUIRED PATHS ${ROCM_PATH}) #
include(ROCMInstallTargets) # Set the ROCm base path
include(ROCMCreatePackage) setup_rocm_requirements()
set(ROCMCHECKS_WARN_TOOLCHAIN_VAR OFF) add_build_definitions()
#
# Build info
message(STATUS "[[ Building Project: " ${AMD_PROJECT_NAME} " v." ${PROJECT_TARGET_VERSION_TEXT} " { Build: " ${CMAKE_BUILD_TYPE} "} ]] ...")
set(PACKAGE_NAME TB)
set(LIBRARY_NAME TransferBench)
rocm_install(TARGETS TransferBench COMPONENT devel) #
# --- Auto Dependent Build options ---
include(CMakeDependentOption)
cmake_dependent_option(
AUTO_BUILD_STATIC_FOR_TB_CLIENT
"TRANSFERBENCH_ENGINE_STATIC build auto-enabled for TB client"
ON
"TRANSFERBENCH_CLIENT AND NOT (TRANSFERBENCH_ENGINE_HEADER_ONLY OR TRANSFERBENCH_ENGINE_SHARED OR TRANSFERBENCH_ENGINE_STATIC)"
OFF
)
rocm_package_add_dependencies(DEPENDS numactl hsa-rocr) if(AUTO_BUILD_STATIC_FOR_TB_CLIENT)
set(TRANSFERBENCH_ENGINE_STATIC ON CACHE BOOL "Build TB static library" FORCE)
endif()
rocm_create_package(
NAME ${LIBRARY_NAME} cmake_dependent_option(
DESCRIPTION "TransferBench package" AUTO_BUILD_TB_LIBRARY
MAINTAINER "RCCL Team <gilbert.lee@amd.com>" "Auto-enabled if any TB library option is set"
ON
"TRANSFERBENCH_ENGINE_HEADER_ONLY OR TRANSFERBENCH_ENGINE_SHARED OR TRANSFERBENCH_ENGINE_STATIC"
OFF
) )
if(AUTO_BUILD_TB_LIBRARY)
set(TRANSFERBENCH_ENGINE_REQUIRED ON CACHE BOOL "TB engine is required" FORCE)
endif()
# Validate conflicting build options
if(TRANSFERBENCH_CLIENT AND TRANSFERBENCH_ENGINE_HEADER_ONLY AND (TRANSFERBENCH_ENGINE_SHARED OR TRANSFERBENCH_ENGINE_STATIC))
message(FATAL_ERROR ">> Conflicting build options: CLIENT cannot be built with: HEADER_ONLY and STATIC or SHARED! ")
endif()
# ---
if(TRANSFERBENCH_ENGINE_REQUIRED)
add_subdirectory(deps/tbengine)
endif()
if(TRANSFERBENCH_CLIENT)
add_subdirectory(client)
endif()
#
# Packaging
if(TRANSFERBENCH_ENGINE_REQUIRED OR TRANSFERBENCH_CLIENT)
##rocm_install(TARGETS ${AMD_PROJECT_NAME} COMPONENT devel)
rocm_package_add_dependencies(DEPENDS "numactl" "hsa-rocr")
rocm_setup_version(VERSION ${VERSION_STRING})
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md")
set(CPACK_RPM_PACKAGE_LICENSE "MIT")
rocm_create_package(
NAME ${AMD_PROJECT_PACKAGE_NAME}
DESCRIPTION "TransferBench package"
MAINTAINER "RCCL Team <gilbert.lee@amd.com>"
)
endif()
## End of CMakeLists.txt
...@@ -6,57 +6,81 @@ ...@@ -6,57 +6,81 @@
ROCM_PATH ?= /opt/rocm ROCM_PATH ?= /opt/rocm
CUDA_PATH ?= /usr/local/cuda CUDA_PATH ?= /usr/local/cuda
HIPCC=$(ROCM_PATH)/bin/hipcc HIPCC ?= $(ROCM_PATH)/bin/amdclang++
NVCC=$(CUDA_PATH)/bin/nvcc NVCC ?= $(CUDA_PATH)/bin/nvcc
# Compile TransferBenchCuda if nvcc detected # This can be a space separated string of multiple GPU targets
ifeq ("$(shell test -e $(NVCC) && echo found)", "found") # Default is the native GPU target
EXE=TransferBenchCuda GPU_TARGETS ?= native
CXX=$(NVCC)
else
EXE=TransferBench
CXX=$(HIPCC)
endif
CXXFLAGS = -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64 DEBUG ?= 0
NVFLAGS = -x cu -lnuma -arch=native
COMMON_FLAGS = -O3 -I./src/header -I./src/client -I./src/client/Presets ifeq ($(filter clean,$(MAKECMDGOALS)),)
LDFLAGS += -lpthread # Compile TransferBenchCuda if nvidia-smi returns successfully and nvcc detected
ifeq ("$(shell nvidia-smi > /dev/null 2>&1 && test -e $(NVCC) && echo found)", "found")
# Compile RDMA executor if EXE=TransferBenchCuda
# 1) DISABLE_NIC_EXEC is not set to 1 CXX=$(NVCC)
# 2) IBVerbs is found in the Dynamic Linker cache
# 3) infiniband/verbs.h is found in the default include path
NIC_ENABLED = 0
ifneq ($(DISABLE_NIC_EXEC),1)
ifeq ("$(shell ldconfig -p | grep -c ibverbs)", "0")
$(info lib IBVerbs not found)
else ifeq ("$(shell echo '#include <infiniband/verbs.h>' | $(CXX) -E - 2>/dev/null | grep -c 'infiniband/verbs.h')", "0")
$(info infiniband/verbs.h not found)
else else
LDFLAGS += -libverbs -DNIC_EXEC_ENABLED EXE=TransferBench
NVFLAGS += -libverbs -DNIC_EXEC_ENABLED ifeq ("$(shell test -e $(HIPCC) && echo found)", "found")
NIC_ENABLED = 1 CXX=$(HIPCC)
else ifeq ("$(shell test -e $(ROCM_PATH)/bin/hipcc && echo found)", "found")
CXX=$(ROCM_PATH)/bin/hipcc
$(warning "Could not find $(HIPCC). Using fallback to $(CXX)")
else
$(error "Could not find $(HIPCC) or $(ROCM_PATH)/bin/hipcc. Check if the path is correct if you want to build $(EXE)")
endif
GPU_TARGETS_FLAGS = $(foreach target,$(GPU_TARGETS),"--offload-arch=$(target)")
endif endif
ifeq ($(NIC_ENABLED), 0)
$(info To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed) CXXFLAGS = -I$(ROCM_PATH)/include -I$(ROCM_PATH)/include/hip -I$(ROCM_PATH)/include/hsa
HIPLDFLAGS= -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64 -lamdhip64
HIPFLAGS = -x hip -D__HIP_PLATFORM_AMD__ -D__HIPCC__ $(GPU_TARGETS_FLAGS)
NVFLAGS = -x cu -lnuma -arch=native
ifeq ($(DEBUG), 0)
COMMON_FLAGS += -O3
else
COMMON_FLAGS += -O0 -g -ggdb3
endif
COMMON_FLAGS += -I./src/header -I./src/client -I./src/client/Presets
LDFLAGS += -lpthread
NIC_ENABLED = 0
# Compile RDMA executor if
# 1) DISABLE_NIC_EXEC is not set to 1
# 2) IBVerbs is found in the Dynamic Linker cache
# 3) infiniband/verbs.h is found in the default include path
DISABLE_NIC_EXEC ?= 0
ifneq ($(DISABLE_NIC_EXEC),1)
ifeq ("$(shell ldconfig -p | grep -c ibverbs)", "0")
$(info lib IBVerbs not found)
else ifeq ("$(shell echo '#include <infiniband/verbs.h>' | $(CXX) -E - 2>/dev/null | grep -c 'infiniband/verbs.h')", "0")
$(info infiniband/verbs.h not found)
else
CXXFLAGS += -DNIC_EXEC_ENABLED
LDFLAGS += -libverbs
NIC_ENABLED = 1
endif
ifeq ($(NIC_ENABLED), 0)
$(info Building without NIC executor support)
$(info To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed)
else
$(info Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable)
endif
endif endif
endif endif
.PHONY : all clean
all: $(EXE) all: $(EXE)
TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp") NicStatus TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
$(HIPCC) $(CXXFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS) $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) $(COMMON_FLAGS) $< -o $@ $(HIPLDFLAGS) $(LDFLAGS)
TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp") NicStatus TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
$(NVCC) $(NVFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS) $(NVCC) $(NVFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)
clean: clean:
rm -f *.o ./TransferBench ./TransferBenchCuda rm -f ./TransferBench ./TransferBenchCuda
NicStatus:
ifeq ($(NIC_ENABLED), 1)
$(info Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable)
else
$(info Building without NIC executor support)
endif
1.64.0
# MIT License
#
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
#
#
cmake_minimum_required(VERSION 3.25)
# ---------------------------------------------------------------------------
# TransferBench 'client' sub-project: builds the TransferBench executable and
# links it against one flavor of the TB engine library (static / shared /
# header-only).
# NOTE(review): this script reads variables (AMD_PROJECT_BASE_DIRECTORY,
# AMD_PROJECT_NAME, AMD_PROJECT_LIBRARY_NAME, AMD_PROJECT_PACKAGE_NAME,
# TRANSFERBENCH_ENGINE_*) that are presumably defined by the parent
# CMakeLists — confirm it is only entered via add_subdirectory() from there.
# ---------------------------------------------------------------------------
# Load CMake modules
#==================================================================================================
# Locations of the shared cmake helpers; expose them to include()/find_package().
set(AMD_PROJECT_CMAKE_DIRECTORY ${AMD_PROJECT_BASE_DIRECTORY}/cmake)
set(AMD_PROJECT_CMAKE_MODULES_DIRECTORY ${AMD_PROJECT_BASE_DIRECTORY}/cmake/modules)
list(APPEND CMAKE_MODULE_PATH "${AMD_PROJECT_CMAKE_MODULES_DIRECTORY}")
# CMake Toolchain file to define compilers and path to ROCm
#==================================================================================================
# NOTE(review): CMAKE_TOOLCHAIN_FILE is normally only honored before the
# top-level project() call — setting it inside a sub-project is likely a
# no-op; confirm this is intended.
if (NOT CMAKE_TOOLCHAIN_FILE)
set(CMAKE_TOOLCHAIN_FILE "${AMD_PROJECT_CMAKE_DIRECTORY}/rocm_clang_toolchain.cmake")
message(STATUS ">> CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}")
endif()
#
# Build options
# NOTE(review): the three TRANSFERBENCH_CLIENT_USE_* options below are declared
# but never referenced in this file — the library-selection logic at the bottom
# tests the parent-scope TRANSFERBENCH_ENGINE_* options instead. Confirm which
# set of options is authoritative.
option(TRANSFERBENCH_CLIENT_USE_ENGINE_STATIC "TransferBench client links with 'static' library (default)" ON)
option(TRANSFERBENCH_CLIENT_USE_ENGINE_SHARED "TransferBench client links with 'shared' library" OFF)
option(TRANSFERBENCH_CLIENT_USE_HEADER_ONLY "TransferBench uses 'header-only' interface" OFF)
#
include(${AMD_PROJECT_CMAKE_DIRECTORY}/build_utils.cmake) # setup_default_compiler_flags
include(${AMD_PROJECT_CMAKE_MODULES_DIRECTORY}/Dependencies.cmake) # rocm-cmake, rocm_local_targets
#
# Directory layout of the engine dependency and of this client sub-project.
set (TRANSFERBENCH_TBENGINE_DIRECTORY ${AMD_PROJECT_BASE_DIRECTORY}/deps/tbengine)
set (TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY ${TRANSFERBENCH_TBENGINE_DIRECTORY}/include)
set (TRANSFERBENCH_CLIENT_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
set (TRANSFERBENCH_CLIENT_INCLUDE_DIRECTORY ${TRANSFERBENCH_CLIENT_DIRECTORY}/include)
set (TRANSFERBENCH_CLIENT_PRESETS_INCLUDE_DIRECTORY ${TRANSFERBENCH_CLIENT_INCLUDE_DIRECTORY}/Presets)
set (TRANSFERBENCH_CLIENT_SRC_DIRECTORY ${TRANSFERBENCH_CLIENT_DIRECTORY}/src)
#
# Library names
# Target names for each flavor of the TB engine library, derived from the
# parent project's library/package names.
set(AMD_PROJECT_HEADER_ONLY_LIBRARY_NAME "${AMD_PROJECT_LIBRARY_NAME}_header")
set(TRANSFERBENCH_INTERFACE_TARGET_NAME "${AMD_PROJECT_PACKAGE_NAME}_engine")
set(TRANSFERBENCH_INTERFACE_TARGET_NAME_ALIAS "${AMD_PROJECT_PACKAGE_NAME}::engine")
set(AMD_PROJECT_STATIC_LIBRARY_NAME "${AMD_PROJECT_LIBRARY_NAME}_static")
set(AMD_PROJECT_SHARED_LIBRARY_NAME "${AMD_PROJECT_LIBRARY_NAME}_shared")
set(AMD_PROJECT_OBJECT_LIBRARY_NAME "${AMD_PROJECT_LIBRARY_NAME}_object_library")
set(AMD_PROJECT_HIP_OBJECT_LIBRARY_NAME "${AMD_PROJECT_LIBRARY_NAME}_hip_object_library")
set(AMD_PROJECT_CLIENT_NAME "${AMD_PROJECT_NAME}")
#
# Subproject build information
# Derive the client version and generate tbclient_version.hpp into the build
# tree so sources can include it.
setup_build_version(TRANSFERBENCH_CLIENT_TARGET_VERSION TRANSFERBENCH_CLIENT_TARGET_VERSION_TEXT)
set_variable_in_parent(TRANSFERBENCH_CLIENT_TARGET_BINARY_VERSION ${TRANSFERBENCH_CLIENT_TARGET_VERSION})
file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/include/)
configure_file(
${AMD_PROJECT_CMAKE_DIRECTORY}/tbclient_version.hpp.in
${CMAKE_BINARY_DIR}/include/tbclient_version.hpp
@ONLY
)
#
# Header/Source files
set(TRANSFERBENCH_CLIENT_SOURCES
${TRANSFERBENCH_CLIENT_SRC_DIRECTORY}/Client.cpp
)
set(TRANSFERBENCH_COMMON_INCLUDES
${TRANSFERBENCH_CLIENT_INCLUDE_DIRECTORY}
${TRANSFERBENCH_CLIENT_PRESETS_INCLUDE_DIRECTORY}
)
#
message(STATUS ">> Building TransferBench 'client' ...")
developer_status_message("DEVEL" " >> TRANSFERBENCH_CLIENT_TARGET_VERSION: '${TRANSFERBENCH_CLIENT_TARGET_VERSION}' ")
# The client executable itself.
add_executable(${AMD_PROJECT_CLIENT_NAME} ${TRANSFERBENCH_CLIENT_SOURCES})
set_target_properties(${AMD_PROJECT_CLIENT_NAME}
PROPERTIES
LINKER_LANGUAGE CXX
)
# Always link the engine interface target (carries the engine's usage requirements).
target_link_libraries(${AMD_PROJECT_CLIENT_NAME}
PRIVATE
${TRANSFERBENCH_INTERFACE_TARGET_NAME_ALIAS}
)
target_include_directories(${AMD_PROJECT_CLIENT_NAME}
PRIVATE
${TRANSFERBENCH_COMMON_INCLUDES}
${CMAKE_BINARY_DIR}/include/
${TRANSFERBENCH_TBENGINE_INCLUDE_DIRECTORY}
)
setup_default_compiler_flags(${AMD_PROJECT_CLIENT_NAME})
#
# Linking the appropriate TransferBench library
# Priority: Static > Shared > Header-Only
# NOTE(review): WAS_TB_LINKED is set in the static/shared branches but not in
# the header-only branch, and is never read afterwards — dead or incomplete
# bookkeeping; confirm whether a post-link check was intended.
set(WAS_TB_LINKED OFF)
if(TRANSFERBENCH_ENGINE_STATIC)
message(STATUS " >> Client build with 'static' library: ${AMD_PROJECT_STATIC_LIBRARY_NAME} ...")
target_link_libraries(${AMD_PROJECT_CLIENT_NAME}
PRIVATE
${AMD_PROJECT_STATIC_LIBRARY_NAME}
)
set(WAS_TB_LINKED ON)
elseif(TRANSFERBENCH_ENGINE_SHARED)
message(STATUS " >> Client build with 'shared' library: ${AMD_PROJECT_SHARED_LIBRARY_NAME} ...")
target_link_libraries(${AMD_PROJECT_CLIENT_NAME}
PRIVATE
${AMD_PROJECT_SHARED_LIBRARY_NAME}
)
set(WAS_TB_LINKED ON)
elseif(TRANSFERBENCH_ENGINE_HEADER_ONLY)
message(STATUS " >> Client build with 'header-only': ${AMD_PROJECT_HEADER_ONLY_LIBRARY_NAME} ...")
target_link_libraries(${AMD_PROJECT_CLIENT_NAME}
PRIVATE
${AMD_PROJECT_HEADER_ONLY_LIBRARY_NAME}
)
# Tell Client sources to pull in the header-only implementation details.
target_compile_definitions(${AMD_PROJECT_CLIENT_NAME}
PRIVATE
TRANSFERBENCH_HEADER_IMPLEMENTATION_DETAILS
)
endif()
# Install client executable
install(
TARGETS ${AMD_PROJECT_CLIENT_NAME}
RUNTIME DESTINATION bin
)
...@@ -22,13 +22,21 @@ THE SOFTWARE. ...@@ -22,13 +22,21 @@ THE SOFTWARE.
#pragma once #pragma once
// TransferBench client version
#define CLIENT_VERSION "00"
#include "TransferBench.hpp" #include "TransferBench.hpp"
#include "tbclient_version.hpp"
// Helper function to print client version
auto GetClientVersion() -> const std::string;
/*
* TODO: We need to look into this circular dependency (envVars->Client->envVars)
*/
#include "EnvVars.hpp" #include "EnvVars.hpp"
size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<28);
size_t const DEFAULT_BYTES_PER_TRANSFER = (1 << 28);
char const ExeTypeName[5][4] = {"CPU", "GPU", "DMA", "NIC", "NIC"}; char const ExeTypeName[5][4] = {"CPU", "GPU", "DMA", "NIC", "NIC"};
...@@ -39,7 +47,8 @@ void DisplayTopology(bool outputToCsv); ...@@ -39,7 +47,8 @@ void DisplayTopology(bool outputToCsv);
void DisplayUsage(char const* cmdName); void DisplayUsage(char const* cmdName);
// Print TransferBench test results // Print TransferBench test results
void PrintResults(EnvVars const& ev, int const testNum, void PrintResults(EnvVars const& ev,
int const testNum,
std::vector<Transfer> const& transfers, std::vector<Transfer> const& transfers,
TransferBench::TestResults const& results); TransferBench::TestResults const& results);
......
This diff is collapsed.
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef ALL_TO_ALL_PRESET_HPP
#define ALL_TO_ALL_PRESET_HPP
#include "EnvVars.hpp"
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <map>
#include <string>
#include <utility>
#include <vector>
/// Preset: GPU all-to-all bandwidth benchmark.
/// Builds one Transfer per participating (src,dst) GPU pair (optionally only
/// directly-linked pairs), runs them all simultaneously on GFX or DMA
/// executors (optionally adding a NIC ring), then prints a SRC x DST
/// bandwidth matrix with row/column totals and per-row "actual" bandwidth
/// (slowest Transfer in the row times the number of Transfers in the row).
/// @param ev                  Environment-variable configuration. Modified:
///                            forces single-stream mode and defaults GFX unroll to 2
/// @param numBytesPerTransfer Number of bytes each Transfer copies
/// @param presetName          Unused; present to match the common preset signature
void AllToAllPreset(EnvVars& ev,
                    size_t const numBytesPerTransfer,
                    [[maybe_unused]] std::string const presetName)
{
    // Supported all-to-all access patterns
    enum
    {
        A2A_COPY       = 0,
        A2A_READ_ONLY  = 1,
        A2A_WRITE_ONLY = 2,
        A2A_CUSTOM     = 3,
    };
    char const* a2aModeStr[4] = {"Copy", "Read-Only", "Write-Only", "Custom"};

    // Force single-stream mode for all-to-all benchmark
    ev.useSingleStream = 1;

    // Force to gfx unroll 2 unless explicitly set
    ev.gfxUnroll = EnvVars::GetEnvVar("GFX_UNROLL", 2);

    int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);

    // Collect env vars for this preset
    int a2aDirect     = EnvVars::GetEnvVar("A2A_DIRECT", 1);
    int a2aLocal      = EnvVars::GetEnvVar("A2A_LOCAL", 0);
    int numGpus       = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
    int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 0);
    int numSubExecs   = EnvVars::GetEnvVar("NUM_SUB_EXEC", 8);
    int useDmaExec    = EnvVars::GetEnvVar("USE_DMA_EXEC", 0);
    int useFineGrain  = EnvVars::GetEnvVar("USE_FINE_GRAIN", 1);
    int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);

    // A2A_MODE may be 0,1,2 or else custom numSrcs:numDsts
    int numSrcs, numDsts;
    int a2aMode = 0;
    if (getenv("A2A_MODE") && sscanf(getenv("A2A_MODE"), "%d:%d", &numSrcs, &numDsts) == 2) {
        a2aMode = A2A_CUSTOM;
    } else {
        a2aMode = EnvVars::GetEnvVar("A2A_MODE", 0);
        if (a2aMode < 0 || a2aMode > 2) {
            printf("[ERROR] a2aMode must be between 0 and 2, or else numSrcs:numDsts\n");
            exit(1);
        }
        // Copy reads one src / writes one dst; read-only has no dst; write-only no src
        numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1);
        numDsts = (a2aMode == A2A_READ_ONLY ? 0 : 1);
    }

    // Print off environment variables
    ev.DisplayEnvVars();
    if (!ev.hideEnv) {
        if (!ev.outputToCsv) { printf("[AllToAll Related]\n"); }
        ev.Print(
            "A2A_DIRECT", a2aDirect, a2aDirect ? "Only using direct links" : "Full all-to-all");
        ev.Print("A2A_LOCAL", a2aLocal, "%s local transfers", a2aLocal ? "Include" : "Exclude");
        // NOTE: the .c_str() below points into a temporary that lives until the
        // end of the full expression (the Print call), so this is safe
        ev.Print("A2A_MODE",
                 (a2aMode == A2A_CUSTOM) ? std::to_string(numSrcs) + ":" + std::to_string(numDsts)
                                         : std::to_string(a2aMode),
                 (a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " +
                                            std::to_string(numDsts) + " write(s)")
                                               .c_str()
                                         : a2aModeStr[a2aMode]);
        ev.Print("NUM_GPU_DEVICES", numGpus, "Using %d GPUs", numGpus);
        ev.Print("NUM_QUEUE_PAIRS",
                 numQueuePairs,
                 "Using %d queue pairs for NIC transfers",
                 numQueuePairs);
        ev.Print(
            "NUM_SUB_EXEC", numSubExecs, "Using %d subexecutors/CUs per Transfer", numSubExecs);
        ev.Print("USE_DMA_EXEC", useDmaExec, "Using %s executor", useDmaExec ? "DMA" : "GFX");
        ev.Print("USE_FINE_GRAIN",
                 useFineGrain,
                 "Using %s-grained memory",
                 useFineGrain ? "fine" : "coarse");
        ev.Print("USE_REMOTE_READ",
                 useRemoteRead,
                 "Using %s as executor",
                 useRemoteRead ? "DST" : "SRC");
        printf("\n");
    }

    // Validate env vars
    if (numGpus < 0 || numGpus > numDetectedGpus) {
        printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
        exit(1);
    }
    if (useDmaExec && (numSrcs != 1 || numDsts != 1)) {
        printf("[ERROR] DMA execution can only be used for copies (A2A_MODE=0)\n");
        exit(1);
    }

    // Collect the number of GPU devices to use
    MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
    ExeType exeType = useDmaExec ? EXE_GPU_DMA : EXE_GPU_GFX;

    // reIndex maps a (src,dst) GPU pair to its index in 'transfers' so the
    // result matrix can be printed even when some pairs are skipped
    std::map<std::pair<int, int>, int> reIndex;
    std::vector<Transfer> transfers;
    for (int i = 0; i < numGpus; i++) {
        for (int j = 0; j < numGpus; j++) {
            // Check whether or not to execute this pair
            if (i == j) {
                if (!a2aLocal) { continue; }
            } else if (a2aDirect) {
#if !defined(__NVCC__)
                // Skip pairs that are not a single hop apart
                uint32_t linkType, hopCount;
                HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
                if (hopCount != 1) { continue; }
#endif
            }
            // Build Transfer and add it to list
            TransferBench::Transfer transfer;
            transfer.numBytes = numBytesPerTransfer;
            for (int x = 0; x < numSrcs; x++) { transfer.srcs.push_back({memType, i}); }
            // When using multiple destinations, the additional destinations are "local"
            if (numDsts) { transfer.dsts.push_back({memType, j}); }
            for (int x = 1; x < numDsts; x++) { transfer.dsts.push_back({memType, i}); }
            transfer.exeDevice = {exeType, (useRemoteRead ? j : i)};
            transfer.exeSubIndex = -1;
            transfer.numSubExecs = numSubExecs;
            reIndex[std::make_pair(i, j)] = transfers.size();
            transfers.push_back(transfer);
        }
    }

    // Create a ring using NICs (one Transfer per GPU to its ring neighbor)
    std::vector<int> nicTransferIdx(numGpus);
    if (numQueuePairs > 0) {
        for (int i = 0; i < numGpus; i++) {
            TransferBench::Transfer transfer;
            transfer.numBytes = numBytesPerTransfer;
            transfer.srcs.push_back({memType, i});
            transfer.dsts.push_back({memType, (i + 1) % numGpus});
            transfer.exeDevice = {TransferBench::EXE_NIC_NEAREST, i};
            transfer.exeSubIndex = (i + 1) % numGpus;
            transfer.numSubExecs = numQueuePairs;
            nicTransferIdx[i] = transfers.size();
            transfers.push_back(transfer);
        }
    }

    printf("GPU-GFX All-To-All benchmark:\n");
    printf("==========================\n");
    printf("- Copying %zu bytes between %s pairs of GPUs using %d CUs (%zu Transfers)\n",
           numBytesPerTransfer,
           a2aDirect ? "directly connected" : "all",
           numSubExecs,
           transfers.size());
    if (transfers.empty()) { return; }

    // Execute Transfers
    TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
    TransferBench::TestResults results;
    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
        // NOTE(review): failure exits with status 0 — confirm scripts don't rely on this
        for (auto const& err : results.errResults) { printf("%s\n", err.errMsg.c_str()); }
        exit(0);
    } else {
        PrintResults(ev, 1, transfers, results);
    }

    // Print results matrix: one row per SRC GPU, one column per DST GPU,
    // plus an optional NIC column and STotal/Actual summary columns
    char separator = (ev.outputToCsv ? ',' : ' ');
    printf("\nSummary: [%zu bytes per Transfer] [%s:%d] [%d Read(s) %d Write(s)]\n",
           numBytesPerTransfer,
           useDmaExec ? "DMA" : "GFX",
           numSubExecs,
           numSrcs,
           numDsts);
    printf("===========================================================================\n");
    printf("SRC\\DST ");
    for (int dst = 0; dst < numGpus; dst++) { printf("%cGPU %02d ", separator, dst); }
    if (numQueuePairs > 0) { printf("%cNIC(%02d QP)", separator, numQueuePairs); }
    printf(" %cSTotal %cActual\n", separator, separator);

    double totalBandwidthGpu = 0.0;
    double minActualBandwidth = std::numeric_limits<double>::max();
    double maxActualBandwidth = 0.0;
    // Columns: [0..numGpus-1] per-DST totals, [numGpus] NIC, [numGpus+1] grand total
    std::vector<double> colTotalBandwidth(numGpus + 2, 0.0);
    for (int src = 0; src < numGpus; src++) {
        double rowTotalBandwidth = 0;
        int transferCount = 0;
        double minBandwidth = std::numeric_limits<double>::max();
        printf("GPU %02d", src);
        for (int dst = 0; dst < numGpus; dst++) {
            if (reIndex.count(std::make_pair(src, dst))) {
                int const transferIdx = reIndex[std::make_pair(src, dst)];
                TransferBench::TransferResult const& r = results.tfrResults[transferIdx];
                colTotalBandwidth[dst] += r.avgBandwidthGbPerSec;
                rowTotalBandwidth += r.avgBandwidthGbPerSec;
                totalBandwidthGpu += r.avgBandwidthGbPerSec;
                minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
                transferCount++;
                printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
            } else {
                printf("%c%8s ", separator, "N/A");
            }
        }
        if (numQueuePairs > 0) {
            TransferBench::TransferResult const& r = results.tfrResults[nicTransferIdx[src]];
            colTotalBandwidth[numGpus] += r.avgBandwidthGbPerSec;
            rowTotalBandwidth += r.avgBandwidthGbPerSec;
            totalBandwidthGpu += r.avgBandwidthGbPerSec;
            minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
            transferCount++;
            printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
        }
        // "Actual" models this row as if every Transfer ran at the slowest rate
        double actualBandwidth = minBandwidth * transferCount;
        printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator, actualBandwidth);
        minActualBandwidth = std::min(minActualBandwidth, actualBandwidth);
        maxActualBandwidth = std::max(maxActualBandwidth, actualBandwidth);
        colTotalBandwidth[numGpus + 1] += rowTotalBandwidth;
    }
    printf("\nRTotal");
    for (int dst = 0; dst < numGpus; dst++) {
        printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
    }
    if (numQueuePairs > 0) { printf("%c%8.3f ", separator, colTotalBandwidth[numGpus]); }
    printf(" %c%8.3f %c%8.3f %c%8.3f\n",
           separator,
           colTotalBandwidth[numGpus + 1],
           separator,
           minActualBandwidth,
           separator,
           maxActualBandwidth);
    printf("\n");
    printf("Average bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu / transfers.size());
    printf("Aggregate bandwidth (GPU Timed): %8.3f GB/s\n", totalBandwidthGpu);
    printf("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
    PrintErrors(results.errResults);
}
#endif // ALL_TO_ALL_PRESET_HPP
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef ALL_TO_ALL_N_PRESET_HPP
#define ALL_TO_ALL_N_PRESET_HPP
#include "EnvVars.hpp"
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <limits>
#include <map>
#include <string>
#include <utility>
#include <vector>
/// Preset: GPU all-to-all RDMA (NIC) bandwidth benchmark.
/// Builds one NIC-executed Transfer per ordered GPU pair (including self
/// pairs), runs them all simultaneously, then prints a SRC x DST bandwidth
/// matrix with row/column totals and per-row "actual" bandwidth (slowest
/// Transfer in the row times the number of Transfers in the row).
/// @param ev                  Environment-variable configuration
/// @param numBytesPerTransfer Number of bytes each Transfer copies
/// @param presetName          Unused; present to match the common preset signature
void AllToAllRdmaPreset(EnvVars& ev,
                        size_t const numBytesPerTransfer,
                        [[maybe_unused]] std::string const presetName)
{
    int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);

    // Collect env vars for this preset
    int numGpus       = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
    int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 1);
    int useFineGrain  = EnvVars::GetEnvVar("USE_FINE_GRAIN", 1);

    // Print off environment variables
    ev.DisplayEnvVars();
    if (!ev.hideEnv) {
        if (!ev.outputToCsv) { printf("[AllToAll Network Related]\n"); }
        ev.Print("NUM_GPU_DEVICES", numGpus, "Using %d GPUs", numGpus);
        ev.Print("NUM_QUEUE_PAIRS",
                 numQueuePairs,
                 "Using %d queue pairs for NIC transfers",
                 numQueuePairs);
        ev.Print("USE_FINE_GRAIN",
                 useFineGrain,
                 "Using %s-grained memory",
                 useFineGrain ? "fine" : "coarse");
        printf("\n");
    }

    // Validate env vars
    if (numGpus < 0 || numGpus > numDetectedGpus) {
        printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
        exit(1);
    }

    MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;

    // reIndex maps a (src,dst) GPU pair to its index in 'transfers'
    std::map<std::pair<int, int>, int> reIndex;
    std::vector<Transfer> transfers;
    for (int i = 0; i < numGpus; i++) {
        for (int j = 0; j < numGpus; j++) {
            // Build Transfer and add it to list; the NIC nearest to GPU i
            // executes the transfer towards GPU j (exeSubIndex)
            TransferBench::Transfer transfer;
            transfer.numBytes = numBytesPerTransfer;
            transfer.srcs.push_back({memType, i});
            transfer.dsts.push_back({memType, j});
            transfer.exeDevice = {EXE_NIC_NEAREST, i};
            transfer.exeSubIndex = j;
            transfer.numSubExecs = numQueuePairs;
            reIndex[std::make_pair(i, j)] = transfers.size();
            transfers.push_back(transfer);
        }
    }

    printf("GPU-RDMA All-To-All benchmark:\n");
    printf("==========================\n");
    printf(
        "- Copying %zu bytes between all pairs of GPUs using %d QPs per Transfer (%zu Transfers)\n",
        numBytesPerTransfer,
        numQueuePairs,
        transfers.size());
    if (transfers.empty()) { return; }

    // Execute Transfers
    TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
    TransferBench::TestResults results;
    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
        // NOTE(review): failure exits with status 0 — confirm scripts don't rely on this
        for (auto const& err : results.errResults) { printf("%s\n", err.errMsg.c_str()); }
        exit(0);
    } else {
        PrintResults(ev, 1, transfers, results);
    }

    // Print results matrix: one row per SRC GPU, one column per DST GPU,
    // plus STotal/Actual summary columns
    char separator = (ev.outputToCsv ? ',' : ' ');
    printf("\nSummary: [%zu bytes per Transfer]\n", numBytesPerTransfer);
    printf("==========================================================\n");
    printf("SRC\\DST ");
    for (int dst = 0; dst < numGpus; dst++) { printf("%cGPU %02d ", separator, dst); }
    printf(" %cSTotal %cActual\n", separator, separator);

    double totalBandwidthGpu = 0.0;
    double minActualBandwidth = std::numeric_limits<double>::max();
    double maxActualBandwidth = 0.0;
    // Columns: [0..numGpus-1] per-DST totals, [numGpus+1] grand total
    std::vector<double> colTotalBandwidth(numGpus + 2, 0.0);
    for (int src = 0; src < numGpus; src++) {
        double rowTotalBandwidth = 0;
        int transferCount = 0;
        double minBandwidth = std::numeric_limits<double>::max();
        printf("GPU %02d", src);
        for (int dst = 0; dst < numGpus; dst++) {
            if (reIndex.count(std::make_pair(src, dst))) {
                int const transferIdx = reIndex[std::make_pair(src, dst)];
                TransferBench::TransferResult const& r = results.tfrResults[transferIdx];
                colTotalBandwidth[dst] += r.avgBandwidthGbPerSec;
                rowTotalBandwidth += r.avgBandwidthGbPerSec;
                totalBandwidthGpu += r.avgBandwidthGbPerSec;
                minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
                transferCount++;
                printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
            } else {
                printf("%c%8s ", separator, "N/A");
            }
        }
        // "Actual" models this row as if every Transfer ran at the slowest rate
        double actualBandwidth = minBandwidth * transferCount;
        printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator, actualBandwidth);
        minActualBandwidth = std::min(minActualBandwidth, actualBandwidth);
        maxActualBandwidth = std::max(maxActualBandwidth, actualBandwidth);
        colTotalBandwidth[numGpus + 1] += rowTotalBandwidth;
    }
    printf("\nRTotal");
    for (int dst = 0; dst < numGpus; dst++) {
        printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
    }
    printf(" %c%8.3f %c%8.3f %c%8.3f\n",
           separator,
           colTotalBandwidth[numGpus + 1],
           separator,
           minActualBandwidth,
           separator,
           maxActualBandwidth);
    printf("\n");
    printf("Average bandwidth (Tx Thread Timed): %8.3f GB/s\n",
           totalBandwidthGpu / transfers.size());
    printf("Aggregate bandwidth (Tx Thread Timed): %8.3f GB/s\n", totalBandwidthGpu);
    printf("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n",
           results.avgTotalBandwidthGbPerSec);
    PrintErrors(results.errResults);
}
#endif // ALL_TO_ALL_N_PRESET_HPP
/*
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef ALL_TO_ALL_SWEEP_PRESET_HPP
#define ALL_TO_ALL_SWEEP_PRESET_HPP
#include "EnvVars.hpp"
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <map>
#include <string>
#include <utility>
#include <vector>
/// Preset: GPU all-to-all kernel-parameter sweep.
/// Builds a fixed all-to-all Transfer set once, then sweeps GFX kernel
/// parameters (block size x unroll factor x CU count) over it, printing the
/// slowest (and optionally fastest) per-executor bandwidth for each combo.
/// @param ev                  Environment-variable configuration. Modified:
///                            forces single-stream mode; gfxBlockSize/gfxUnroll
///                            are overwritten while sweeping
/// @param numBytesPerTransfer Number of bytes each Transfer copies
/// @param presetName          Unused; present to match the common preset signature
void AllToAllSweepPreset(EnvVars& ev,
                         size_t const numBytesPerTransfer,
                         [[maybe_unused]] std::string const presetName)
{
    // Supported all-to-all access patterns
    enum
    {
        A2A_COPY       = 0,
        A2A_READ_ONLY  = 1,
        A2A_WRITE_ONLY = 2,
        A2A_CUSTOM     = 3,
    };
    char const* a2aModeStr[4] = {"Copy", "Read-Only", "Write-Only", "Custom"};

    // Force single-stream mode for all-to-all benchmark
    ev.useSingleStream = 1;

    int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);

    // Collect env vars for this preset
    int a2aDirect     = EnvVars::GetEnvVar("A2A_DIRECT", 1);
    int a2aLocal      = EnvVars::GetEnvVar("A2A_LOCAL", 0);
    int numGpus       = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
    int showMinOnly   = EnvVars::GetEnvVar("SHOW_MIN_ONLY", 1);
    int useFineGrain  = EnvVars::GetEnvVar("USE_FINE_GRAIN", 1);
    int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
    int useSpray      = EnvVars::GetEnvVar("USE_SPRAY", 0);
    int verbose       = EnvVars::GetEnvVar("VERBOSE", 0);

    // Parameter values to sweep over
    std::vector<int> blockList  = EnvVars::GetEnvVarArray("BLOCKSIZES", {256});
    std::vector<int> unrollList = EnvVars::GetEnvVarArray("UNROLLS", {1, 2, 3, 4, 6, 8});
    std::vector<int> numCusList = EnvVars::GetEnvVarArray("NUM_CUS", {4, 8, 12, 16, 24, 32});

    // A2A_MODE may be 0,1,2 or else custom numSrcs:numDsts
    int numSrcs, numDsts;
    int a2aMode = 0;
    if (getenv("A2A_MODE") && sscanf(getenv("A2A_MODE"), "%d:%d", &numSrcs, &numDsts) == 2) {
        a2aMode = A2A_CUSTOM;
    } else {
        a2aMode = EnvVars::GetEnvVar("A2A_MODE", 0);
        if (a2aMode < 0 || a2aMode > 2) {
            printf("[ERROR] a2aMode must be between 0 and 2, or else numSrcs:numDsts\n");
            exit(1);
        }
        // Copy reads one src / writes one dst; read-only has no dst; write-only no src
        numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1);
        numDsts = (a2aMode == A2A_READ_ONLY ? 0 : 1);
    }

    // Print off environment variables
    ev.DisplayEnvVars();
    if (!ev.hideEnv) {
        if (!ev.outputToCsv) { printf("[AllToAll Related]\n"); }
        ev.Print(
            "A2A_DIRECT", a2aDirect, a2aDirect ? "Only using direct links" : "Full all-to-all");
        ev.Print("A2A_LOCAL", a2aLocal, "%s local transfers", a2aLocal ? "Include" : "Exclude");
        // NOTE: the .c_str() below points into a temporary that lives until the
        // end of the full expression (the Print call), so this is safe
        ev.Print("A2A_MODE",
                 (a2aMode == A2A_CUSTOM) ? std::to_string(numSrcs) + ":" + std::to_string(numDsts)
                                         : std::to_string(a2aMode),
                 (a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " +
                                            std::to_string(numDsts) + " write(s)")
                                               .c_str()
                                         : a2aModeStr[a2aMode]);
        ev.Print("BLOCKSIZES", blockList.size(), EnvVars::ToStr(blockList).c_str());
        ev.Print("SHOW_MIN_ONLY",
                 showMinOnly,
                 showMinOnly ? "Showing only slowest GPU results"
                             : "Showing slowest and fastest GPU results");
        ev.Print("NUM_CUS", numCusList.size(), EnvVars::ToStr(numCusList).c_str());
        ev.Print("NUM_GPU_DEVICES", numGpus, "Using %d GPUs", numGpus);
        ev.Print("UNROLLS", unrollList.size(), EnvVars::ToStr(unrollList).c_str());
        ev.Print("USE_FINE_GRAIN",
                 useFineGrain,
                 "Using %s-grained memory",
                 useFineGrain ? "fine" : "coarse");
        ev.Print("USE_REMOTE_READ",
                 useRemoteRead,
                 "Using %s as executor",
                 useRemoteRead ? "DST" : "SRC");
        ev.Print("USE_SPRAY", useSpray, "%s per CU", useSpray ? "All targets" : "One target");
        ev.Print("VERBOSE", verbose, verbose ? "Display test results" : "Display summary only");
        printf("\n");
    }

    // Validate env vars
    if (numGpus < 0 || numGpus > numDetectedGpus) {
        printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
        exit(1);
    }
    if (useSpray && numDsts > 1) {
        printf("[ERROR] Cannot use USE_SPRAY with multiple destination buffers\n");
        exit(1);
    }

    // Collect the number of GPU devices to use
    MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
    ExeType exeType = EXE_GPU_GFX;
    std::vector<Transfer> transfers;
    // NOTE(review): after these loops targetCount holds the value from the
    // LAST GPU iterated; scaling below assumes a symmetric topology — confirm
    int targetCount = 0;
    if (!useSpray) {
        // Each CU will work on just one target
        for (int i = 0; i < numGpus; i++) {
            targetCount = 0;
            for (int j = 0; j < numGpus; j++) {
                // Check whether or not to execute this pair
                if (i == j) {
                    if (!a2aLocal) { continue; }
                } else if (a2aDirect) {
#if !defined(__NVCC__)
                    // Skip pairs that are not a single hop apart
                    uint32_t linkType, hopCount;
                    HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
                    if (hopCount != 1) { continue; }
#endif
                }
                // Build Transfer and add it to list
                TransferBench::Transfer transfer;
                targetCount++;
                transfer.numBytes = numBytesPerTransfer;
                for (int x = 0; x < numSrcs; x++) { transfer.srcs.push_back({memType, i}); }
                // When using multiple destinations, the additional destinations are "local"
                if (numDsts) { transfer.dsts.push_back({memType, j}); }
                for (int x = 1; x < numDsts; x++) { transfer.dsts.push_back({memType, i}); }
                transfer.exeDevice = {exeType, (useRemoteRead ? j : i)};
                transfer.exeSubIndex = -1;
                transfers.push_back(transfer);
            }
        }
    } else {
        // Each CU will work on all targets (one Transfer per executing GPU)
        for (int i = 0; i < numGpus; i++) {
            TransferBench::Transfer transfer;
            transfer.numBytes = numBytesPerTransfer;
            transfer.exeDevice = {exeType, i};
            transfer.exeSubIndex = -1;
            targetCount = 0;
            for (int j = 0; j < numGpus; j++) {
                // Check whether or not to transfer to this GPU
                if (i == j) {
                    if (!a2aLocal) { continue; }
                } else if (a2aDirect) {
#if !defined(__NVCC__)
                    // Skip pairs that are not a single hop apart
                    uint32_t linkType, hopCount;
                    HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
                    if (hopCount != 1) { continue; }
#endif
                }
                targetCount++;
                for (int x = 0; x < numSrcs; x++) {
                    transfer.srcs.push_back({memType, useRemoteRead ? j : i});
                }
                if (numDsts) { transfer.dsts.push_back({memType, j}); }
                for (int x = 1; x < numDsts; x++) { transfer.dsts.push_back({memType, i}); }
            }
            transfers.push_back(transfer);
        }
    }

    printf("GPU-GFX All-To-All Sweep benchmark:\n");
    printf("==========================\n");
    printf("- Copying %zu bytes between %s pairs of GPUs\n",
           numBytesPerTransfer,
           a2aDirect ? "directly connected" : "all");
    if (transfers.empty()) {
        // Fixed: message previously read "[WARN}" (mismatched bracket)
        printf("[WARN] No transfers requested. Try adjusting A2A_DIRECT or A2A_LOCAL\n");
        return;
    }

    // Execute Transfers
    TransferBench::ConfigOptions cfg = ev.ToConfigOptions();

    // Run tests, keyed by (numCUs, unroll) for the optional verbose dump
    std::map<std::pair<int, int>, TransferBench::TestResults> results;

    // Display summary: one table per block size
    for (int blockSize : blockList) {
        printf("Blocksize: %d\n", blockSize);
        ev.gfxBlockSize = cfg.gfx.blockSize = blockSize;
        printf("#CUs\\Unroll");
        for (int u : unrollList) {
            printf(" %d(Min) ", u);
            if (!showMinOnly) { printf(" %d(Max) ", u); }
        }
        printf("\n");
        for (int c : numCusList) {
            printf(" %5d ", c);
            fflush(stdout);
            for (int u : unrollList) {
                ev.gfxUnroll = cfg.gfx.unrollFactor = u;
                for (auto& transfer : transfers) {
                    // In spray mode each Transfer covers targetCount targets,
                    // so scale the subexecutor count accordingly
                    transfer.numSubExecs = useSpray ? (c * targetCount) : c;
                }
                double minBandwidth = std::numeric_limits<double>::max();
                // Fixed: was numeric_limits<double>::min() (smallest positive
                // value), which is the wrong seed for a max-accumulator
                double maxBandwidth = std::numeric_limits<double>::lowest();
                TransferBench::TestResults result;
                if (TransferBench::RunTransfers(cfg, transfers, result)) {
                    for (auto const& exeResult : result.exeResults) {
                        minBandwidth = std::min(minBandwidth,
                                                exeResult.second.avgBandwidthGbPerSec);
                        maxBandwidth = std::max(maxBandwidth,
                                                exeResult.second.avgBandwidthGbPerSec);
                    }
                    if (useSpray) {
                        minBandwidth *= targetCount;
                        maxBandwidth *= targetCount;
                    }
                    results[std::make_pair(c, u)] = result;
                } else {
                    minBandwidth = 0.0;
                }
                printf(" %7.2f ", minBandwidth);
                if (!showMinOnly) { printf(" %7.2f ", maxBandwidth); }
                fflush(stdout);
            }
            printf("\n");
            fflush(stdout);
        }
        if (verbose) {
            // Dump the full per-Transfer results for this block size
            int testNum = 0;
            for (int c : numCusList) {
                for (int u : unrollList) {
                    printf("CUs: %d Unroll %d\n", c, u);
                    PrintResults(ev, ++testNum, transfers, results[std::make_pair(c, u)]);
                }
            }
        }
    }
}
#endif // ALL_TO_ALL_SWEEP_PRESET_HPP
/*
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef HEALTH_CHECK_PRESET_HPP
#define HEALTH_CHECK_PRESET_HPP
#include "EnvVars.hpp"
#include <cstddef>
#include <cstdint>
#include <map>
#include <string>
#include <vector>
// Indices of the HBM stress-test patterns exercised by TestHbmPerformance.
// NOTE(review): 'HbmTests' declares a global VARIABLE of this anonymous enum
// type, not a type name; it appears unused and, as a non-inline definition in
// a header, risks ODR violations if included from multiple TUs — confirm and
// consider dropping the identifier.
enum
{
    HBM_READ = 0,
    HBM_WRITE = 1,
    HBM_COPY = 2,
    HBM_ADD = 3,
    NUM_HBM_TESTS = 4
} HbmTests;
// Describes one HBM test pattern: its display name and how many source /
// destination buffers each Transfer uses.
struct HbmTestConfig
{
    std::string name;  // Label printed in the test banner (e.g. "READ")
    int numInputs;     // Number of source buffers per Transfer
    int numOutputs;    // Number of destination buffers per Transfer
};
// Buffer counts for each HBM test, indexed by the HBM_* enumerators above.
// NOTE(review): non-inline global definition in a header — fine while this
// header is included from a single TU; consider 'inline' (C++17) otherwise.
HbmTestConfig HbmTestConfigs[NUM_HBM_TESTS] = {
    {"READ", 1, 0}, {"WRITE", 0, 1}, {"COPY", 1, 1}, {"ADD", 2, 1}};
// Per-model pass/fail criteria and kernel tuning parameters for the
// healthcheck preset. All bandwidth limits are in GB/s and are scaled by
// SFACTOR before comparison. (Modernized from 'typedef struct' — in C++ the
// plain struct declaration already introduces the type name.)
struct TestConfig
{
    double unidirHostToDeviceCopyLimit;  // Min H2D DMA bandwidth per GPU
    double unidirDeviceToHostCopyLimit;  // Min D2H DMA bandwidth per GPU
    double bidirDmaCopyLimit;            // Min combined H2D+D2H DMA bandwidth
    int a2aUnrollFactor;                 // GFX unroll used for the all-to-all test
    int a2aNumSubExecs;                  // CUs per Transfer for the all-to-all test
    double a2aCopyLimit;                 // Min per-pair all-to-all bandwidth
    int hbmBlockSize[NUM_HBM_TESTS];     // GFX block size per HBM test
    int hbmUnrollFactor[NUM_HBM_TESTS];  // GFX unroll factor per HBM test
    int hbmTemporalMode[NUM_HBM_TESTS];  // GFX temporal mode per HBM test
    double hbmLimit[NUM_HBM_TESTS];      // Min HBM bandwidth per HBM test
};
// Hardware models recognized by DetectModel(); values index TestConfigs.
// Kept as an unscoped enum so the enumerators remain unqualified at call
// sites. (Modernized from 'typedef enum' — unnecessary in C++.)
enum ModelEnum
{
    MODEL_08_GFX0942_304 = 0,  // 8 x gfx942, 304 CUs each
    MODEL_08_GFX0942_064 = 1,  // 8 x gfx942, 64 CUs each
    NUM_SUPPORTED_MODELS = 2
};
// All limits are scaled by this factor before pass/fail comparison.
// (Modernized from '#define' — an inline constexpr is typed, scoped, and
// header-safe in C++17 and later.)
inline constexpr double SFACTOR = 0.97;
// Criteria for an 8 x gfx942 system with 304 CUs per GPU.
// HBM arrays are indexed by HBM_READ/WRITE/COPY/ADD; limits in GB/s
// (scaled by SFACTOR at comparison time).
TestConfig Config_08_GFX0942_304 = {
    .unidirHostToDeviceCopyLimit = 50,
    .unidirDeviceToHostCopyLimit = 50,
    .bidirDmaCopyLimit = 90,
    .a2aUnrollFactor = 2,
    .a2aNumSubExecs = 8,
    .a2aCopyLimit = 45,
    .hbmBlockSize = {384, 256, 320, 256},
    .hbmUnrollFactor = {7, 4, 8, 7},
    .hbmTemporalMode = {3, 3, 3, 3},
    .hbmLimit = {4980, 4850, 2045, 1405},
};
// Criteria for an 8 x gfx942 system with 64 CUs per GPU.
// HBM arrays are indexed by HBM_READ/WRITE/COPY/ADD; limits in GB/s
// (scaled by SFACTOR at comparison time).
TestConfig Config_08_GFX0942_064 = {
    .unidirHostToDeviceCopyLimit = 50,
    .unidirDeviceToHostCopyLimit = 50,
    .bidirDmaCopyLimit = 90,
    .a2aUnrollFactor = 2,
    .a2aNumSubExecs = 8,
    .a2aCopyLimit = 45,
    .hbmBlockSize = {448, 448, 448, 384},
    .hbmUnrollFactor = {8, 3, 8, 7},
    .hbmTemporalMode = {3, 3, 3, 3},
    .hbmLimit = {4180, 2800, 1400, 1055},
};
// Lookup table of per-model criteria, indexed by ModelEnum.
// Order must match the ModelEnum enumerator values.
TestConfig TestConfigs[NUM_SUPPORTED_MODELS] = {
    Config_08_GFX0942_304,
    Config_08_GFX0942_064,
};
int DetectModel()
{
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
std::string archName = "";
int numSubExecutors = 0;
// Loop over all GPUs and determine if they are identical
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
// Check that arch name is identical
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, gpuId));
std::string fullName = prop.gcnArchName;
std::string currArchName = fullName.substr(0, fullName.find(':'));
if (archName != "" && archName != currArchName) {
printf(
"[WARN] healthcheck preset is currently only supported when all GPUs are "
"identical\n");
printf(" Detected both %s and %s\n", archName.c_str(), currArchName.c_str());
exit(1);
}
archName = currArchName;
// Check number of subexecutors
int currNumSubExecutors = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, gpuId});
if (numSubExecutors != 0 && numSubExecutors != currNumSubExecutors) {
printf(
"[WARN] healthcheck preset is currently only supported when all GPUs are "
"identical\n");
printf(" Detected different subexecutor counts: %d and %d\n",
numSubExecutors,
currNumSubExecutors);
exit(1);
}
numSubExecutors = currNumSubExecutors;
}
// Classify based on detected configuration
if (numGpuDevices == 8) {
if (archName == "gfx942") {
switch (numSubExecutors) {
case 304: return MODEL_08_GFX0942_304;
case 64: return MODEL_08_GFX0942_064;
}
}
}
printf("[WARN] healthcheck preset is currently not supported on this hardware\n");
printf(" Detected %d x [%s] with [%d] subexecutors per GPU\n",
numGpuDevices,
archName.c_str(),
numSubExecutors);
exit(1);
}
/// Verifies unidirectional DMA copy bandwidth between each GPU and its
/// closest CPU NUMA node, in both directions, against per-model limits.
/// The two direction passes were duplicated code; they now share one helper.
/// @param modelId Index into TestConfigs (see ModelEnum)
/// @param verbose When true, print per-GPU measurements instead of dots
/// @return 1 if any GPU failed its bandwidth criteria, else 0
int TestUnidir(int modelId, bool verbose)
{
    TestConfig const& testConfig = TestConfigs[modelId];
    TransferBench::ConfigOptions cfg;
    TransferBench::TestResults results;
    int hasFail = 0;
    int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
    cfg.dma.useHsaCopy = 1;

    // Runs one direction (host->device or device->host) across all GPUs and
    // prints PASS/FAIL against the SFACTOR-scaled limit
    auto runDirection = [&](char const* header, double rawLimit, bool hostToDevice) {
        printf("%s%c", header, verbose ? '\n' : ' ');
        double limit = rawLimit * SFACTOR;
        std::vector<std::pair<int, double>> fails;
        for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
            if (!verbose) { printf("."); }
            fflush(stdout);
            // Pin the host side of the copy to the NUMA node closest to this GPU
            int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
            if (memIndex == -1) {
                printf("[ERROR] Unable to detect closest CPU NUMA node to GPU %d\n", gpuId);
                exit(1);
            }
            std::vector<Transfer> transfers(1);
            Transfer& t = transfers[0];
            t.exeDevice = {EXE_GPU_DMA, gpuId};
            t.numBytes = 256 * 1024 * 1024;
            if (hostToDevice) {
                t.srcs = {{MEM_CPU, memIndex}};
                t.dsts = {{MEM_GPU, gpuId}};
            } else {
                t.srcs = {{MEM_GPU, gpuId}};
                t.dsts = {{MEM_CPU, memIndex}};
            }
            t.numSubExecs = 1;
            if (TransferBench::RunTransfers(cfg, transfers, results)) {
                double measuredBw = results.tfrResults[0].avgBandwidthGbPerSec;
                if (measuredBw < limit) { fails.push_back(std::make_pair(gpuId, measuredBw)); }
                if (verbose) {
                    printf("   GPU %02d: Measured %6.2f Limit %6.2f\n", gpuId, measuredBw, limit);
                }
            } else {
                PrintErrors(results.errResults);
            }
        }
        if (fails.empty()) {
            printf("PASS\n");
        } else {
            hasFail = 1;
            printf("FAIL (%zu test(s))\n", fails.size());
            for (auto const& p : fails) {
                printf("   GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n",
                       p.first,
                       p.second,
                       limit);
            }
        }
    };

    // Run unidirectional host to device copy
    runDirection("Testing unidirectional host to device copy",
                 testConfig.unidirHostToDeviceCopyLimit,
                 true);
    // Run unidirectional device to host copy
    runDirection("Testing unidirectional device to host copy",
                 testConfig.unidirDeviceToHostCopyLimit,
                 false);
    return hasFail;
}
/// Verifies bidirectional (simultaneous H2D + D2H) DMA copy bandwidth between
/// each GPU and its closest CPU NUMA node against the per-model limit.
/// @param modelId Index into TestConfigs (see ModelEnum)
/// @param verbose When true, print per-GPU measurements instead of dots
/// @return 1 if any GPU failed its bandwidth criteria, else 0
int TestBidir(int modelId, bool verbose)
{
    TestConfig const& testConfig = TestConfigs[modelId];
    TransferBench::ConfigOptions cfg;
    int hasFail = 0;
    int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
    printf("Testing bidirectional host<->device copies%c", verbose ? '\n' : ' ');
    {
        double limit = testConfig.bidirDmaCopyLimit * SFACTOR;
        std::vector<std::pair<int, double>> fails;
        for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
            if (!verbose) { printf("."); }
            fflush(stdout);
            // Pin the host side of the copies to the NUMA node closest to this GPU
            int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
            if (memIndex == -1) {
                printf("[ERROR] Unable to detect closest CPU NUMA node to GPU %d\n", gpuId);
                exit(1);
            }
            // Two simultaneous DMA transfers: t0 is D2H, t1 is H2D
            std::vector<Transfer> transfers(2);
            Transfer& t0 = transfers[0];
            Transfer& t1 = transfers[1];
            t0.exeDevice = {EXE_GPU_DMA, gpuId};
            t0.numBytes = 256 * 1024 * 1024;
            t0.srcs = {{MEM_GPU, gpuId}};
            t0.dsts = {{MEM_CPU, memIndex}};
            t0.numSubExecs = 1;
            t1.exeDevice = {EXE_GPU_DMA, gpuId};
            t1.numBytes = 256 * 1024 * 1024;
            t1.srcs = {{MEM_CPU, memIndex}};
            t1.dsts = {{MEM_GPU, gpuId}};
            t1.numSubExecs = 1;
            TransferBench::TestResults results;
            if (TransferBench::RunTransfers(cfg, transfers, results)) {
                // Pass/fail is judged on the combined bandwidth of both directions
                double measuredBw = (results.tfrResults[0].avgBandwidthGbPerSec +
                                     results.tfrResults[1].avgBandwidthGbPerSec);
                if (measuredBw < limit) { fails.push_back(std::make_pair(gpuId, measuredBw)); }
                if (verbose) {
                    printf("   GPU %02d: Measured %6.2f Limit %6.2f\n", gpuId, measuredBw, limit);
                }
            } else {
                PrintErrors(results.errResults);
            }
        }
        if (fails.empty()) {
            printf("PASS\n");
        } else {
            hasFail = 1;
            printf("FAIL (%zu test(s))\n", fails.size());
            for (auto const& p : fails) {
                printf("   GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n",
                       p.first,
                       p.second,
                       limit);
            }
        }
    }
    return hasFail;
}
/// Verifies all-to-all GFX copy bandwidth over XGMI between every ordered
/// pair of distinct GPUs against the per-model limit. All pair copies run
/// simultaneously.
/// @param modelId Index into TestConfigs (see ModelEnum)
/// @param verbose When true, print per-pair measurements instead of dots
/// @return 1 if any GPU pair failed its bandwidth criteria, else 0
int TestAllToAll(int modelId, bool verbose)
{
    TestConfig const& testConfig = TestConfigs[modelId];
    TransferBench::ConfigOptions cfg;
    cfg.gfx.unrollFactor = testConfig.a2aUnrollFactor;
    int numSubExecs = testConfig.a2aNumSubExecs;
    int hasFail = 0;
    int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
    printf("Testing all-to-all XGMI copies %c", verbose ? '\n' : ' ');
    fflush(stdout);
    {
        double limit = testConfig.a2aCopyLimit * SFACTOR;
        // One fine-grained copy per ordered pair of distinct GPUs, executed
        // by the source GPU
        std::vector<Transfer> transfers;
        for (int i = 0; i < numGpuDevices; i++) {
            for (int j = 0; j < numGpuDevices; j++) {
                if (i == j) { continue; }
                Transfer t;
                t.numBytes = 256 * 1024 * 1024;
                t.numSubExecs = numSubExecs;
                t.exeDevice = {EXE_GPU_GFX, i};
                t.srcs = {{MEM_GPU_FINE, i}};
                t.dsts = {{MEM_GPU_FINE, j}};
                transfers.push_back(t);
            }
        }
        std::vector<std::pair<std::pair<int, int>, double>> fails;
        TransferBench::TestResults results;
        if (TransferBench::RunTransfers(cfg, transfers, results)) {
            // Results appear in the same order the transfers were built
            int transferIdx = 0;
            for (int i = 0; i < numGpuDevices; i++) {
                if (!verbose) { printf("."); }
                fflush(stdout);
                for (int j = 0; j < numGpuDevices; j++) {
                    if (i == j) { continue; }
                    double bw = results.tfrResults[transferIdx].avgBandwidthGbPerSec;
                    if (bw < limit) { fails.push_back(std::make_pair(std::make_pair(i, j), bw)); }
                    if (verbose) {
                        // Fixed: message previously contained a duplicated colon (": :")
                        printf("   GPU %02d to GPU %02d: Measured %6.2f Limit %6.2f\n",
                               i,
                               j,
                               bw,
                               limit);
                    }
                    transferIdx++;
                }
            }
        }
        if (fails.empty()) {
            printf("PASS\n");
        } else {
            hasFail = 1;
            printf("FAIL (%zu test(s))\n", fails.size());
            for (auto const& p : fails) {
                printf("   GPU %02d to GPU %02d: %6.2f GB/s Criteria: %6.2f GB/s\n",
                       p.first.first,
                       p.first.second,
                       p.second,
                       limit);
            }
        }
    }
    return hasFail;
}
int TestHbmPerformance(int modelId, bool verbose)
{
    // Measures local HBM bandwidth on every GPU for each of the
    // NUM_HBM_TESTS read/write/copy configurations and checks the results
    // against the model's per-test limits.
    // modelId : index into TestConfigs for the detected hardware model
    // verbose : when true, prints per-GPU measurements instead of dots
    // Returns 1 if any GPU fell below its limit in any sub-test, else 0.
    TestConfig const& testConfig = TestConfigs[modelId];
    int hasFail       = 0;
    int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
    char testname[50];
    for (int testId = 0; testId < NUM_HBM_TESTS; testId++) {
        TransferBench::ConfigOptions cfg;
        cfg.general.numIterations = 1000;
        cfg.general.numWarmups    = 50;
        cfg.gfx.blockSize         = testConfig.hbmBlockSize[testId];
        cfg.gfx.unrollFactor      = testConfig.hbmUnrollFactor[testId];
        cfg.gfx.temporalMode      = testConfig.hbmTemporalMode[testId];
        // snprintf (not sprintf): a long HbmTestConfigs name must not be able
        // to overflow the fixed-size testname buffer
        snprintf(testname,
                 sizeof(testname),
                 "Testing HBM performance [%s]",
                 HbmTestConfigs[testId].name.c_str());
        if (verbose) {
            printf("[Blocksize: %d Unroll: %d TemporalMode: %d]\n",
                   cfg.gfx.blockSize,
                   cfg.gfx.unrollFactor,
                   cfg.gfx.temporalMode);
        }
        printf("%-42s%c", testname, verbose ? '\n' : ' ');
        fflush(stdout);
        int numInputs  = HbmTestConfigs[testId].numInputs;
        int numOutputs = HbmTestConfigs[testId].numOutputs;
        double limit   = testConfig.hbmLimit[testId] * SFACTOR;
        std::vector<std::pair<int, double>> fails;
        TransferBench::TestResults results;
        // One local (GPU -> same GPU) transfer per device, sized per CU so
        // every subexecutor moves 16 MiB
        std::vector<Transfer> transfers;
        for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
            Transfer t;
            t.numSubExecs = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, gpuId});
            t.numBytes    = 16777216ULL * t.numSubExecs;
            t.exeDevice   = {EXE_GPU_GFX, gpuId};
            for (int i = 0; i < numInputs; i++) { t.srcs.push_back({MEM_GPU, gpuId}); }
            for (int i = 0; i < numOutputs; i++) { t.dsts.push_back({MEM_GPU, gpuId}); }
            transfers.push_back(t);
        }
        if (TransferBench::RunTransfers(cfg, transfers, results)) {
            for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
                if (!verbose) { printf("."); }
                fflush(stdout);
                double measuredBw = results.tfrResults[gpuId].avgBandwidthGbPerSec;
                if (measuredBw < limit) { fails.emplace_back(gpuId, measuredBw); }
                if (verbose) {
                    printf(" GPU %02d: Measured %6.2f Limit %6.2f\n", gpuId, measuredBw, limit);
                }
            }
        } else {
            PrintErrors(results.errResults);
        }
        if (fails.empty()) {
            printf("PASS\n");
        } else {
            hasFail = 1;
            printf("FAIL (%lu test(s))\n", fails.size());
            for (auto const& p : fails) {
                printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n",
                       p.first,
                       p.second,
                       limit);
            }
        }
    }
    return hasFail;
}
// Runs a condensed set of bandwidth sanity tests (HBM, unidirectional,
// bidirectional, all-to-all) for the detected hardware model, then exits the
// process with status 1 if any suite reported a failure, 0 otherwise.
// All parameters are unused; they exist to match the PresetFunc signature.
void HealthCheckPreset([[maybe_unused]] EnvVars& ev,
                       [[maybe_unused]] size_t const numBytesPerTransfer,
                       [[maybe_unused]] std::string const presetName)
{
    // Check for supported platforms
#if defined(__NVCC__)
    printf("[WARN] healthcheck preset not supported on NVIDIA hardware\n");
    return;
#endif
    // Disclaimer banner (adjacent string literals concatenate, so the bytes
    // written to stdout are identical to separate printf calls)
    printf("Disclaimer:\n"
           "==================================================================\n"
           "NOTE: This is an experimental feature and may be subject to change\n"
           "      Failures do not necessarily indicate hardware issues, as other factors\n"
           "      such as simultaneous workloads may influence results\n"
           "\n");
    // Custom env var for this preset
    int const verboseMode = EnvVars::GetEnvVar("VERBOSE", 0);
    // Determine if this is a supported model
    int const detectedModel = DetectModel();
    // Each Test* call prints its own report and returns 1 on failure;
    // keep the calls as separate statements so suites run (and print) in order
    int failCount = 0;
    failCount += TestHbmPerformance(detectedModel, verboseMode);
    failCount += TestUnidir(detectedModel, verboseMode);
    failCount += TestBidir(detectedModel, verboseMode);
    failCount += TestAllToAll(detectedModel, verboseMode);
    exit(failCount ? 1 : 0);
}
#endif // HEALTH_CHECK_PRESET_HPP
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef ONE_TO_ALL_PRESET_HPP
#define ONE_TO_ALL_PRESET_HPP
#include "EnvVars.hpp"
#include <cstddef>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>
// Benchmarks every subset of simultaneous transfers between one "executor"
// GPU (EXE_INDEX) and the other GPUs, sweeping the number of participating
// peers from SWEEP_MIN to SWEEP_MAX and sweeping SRC/EXE/DST memory-type
// characters. One result row is printed per peer subset.
// ev                  - environment-variable wrapper / display helper
// numBytesPerTransfer - bytes moved by each individual Transfer
// presetName          - unused; present to match the PresetFunc signature
void OneToAllPreset(EnvVars& ev,
                    size_t const numBytesPerTransfer,
                    [[maybe_unused]] std::string const presetName)
{
    int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
    if (numDetectedGpus < 2) {
        printf("[ERROR] One-to-all benchmark requires machine with at least 2 GPUs\n");
        exit(1);
    }
    // Collect env vars for this preset
    // sweepDir 0 appears to mean "executor-side memory is the fixed endpoint"
    // and 1 the reverse -- TODO confirm against TransferBench docs
    int numGpuDevices    = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
    int numSubExecs      = EnvVars::GetEnvVar("NUM_GPU_SE", 4);
    int exeIndex         = EnvVars::GetEnvVar("EXE_INDEX", 0);
    int sweepDir         = EnvVars::GetEnvVar("SWEEP_DIR", 0);
    std::string sweepDst = EnvVars::GetEnvVar("SWEEP_DST", "G");
    std::string sweepExe = EnvVars::GetEnvVar("SWEEP_EXE", "G");
    std::string sweepSrc = EnvVars::GetEnvVar("SWEEP_SRC", "G");
    int sweepMin         = EnvVars::GetEnvVar("SWEEP_MIN", 1);
    int sweepMax         = EnvVars::GetEnvVar("SWEEP_MAX", numGpuDevices);
    // Display environment variables
    ev.DisplayEnvVars();
    if (!ev.hideEnv) {
        if (!ev.outputToCsv) { printf("[One-To-All Related]\n"); }
        ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices);
        ev.Print("NUM_GPU_SE", numSubExecs, "Using %d subExecutors/CUs per Transfer", numSubExecs);
        ev.Print("EXE_INDEX", exeIndex, "Executing on GPU %d", exeIndex);
        ev.Print("SWEEP_DIR", sweepDir, "Direction of transfer");
        ev.Print("SWEEP_DST", sweepDst.c_str(), "DST memory types to sweep");
        ev.Print("SWEEP_EXE", sweepExe.c_str(), "Executor type to use");
        ev.Print("SWEEP_MAX", sweepMax, "Maximum number of peers");
        ev.Print("SWEEP_MIN", sweepMin, "Minimum number of peers");
        ev.Print("SWEEP_SRC", sweepSrc.c_str(), "SRC memory types to sweep");
        printf("\n");
    }
    // Perform validation
    // Only GFX ('G') and DMA ('D') executors are meaningful for this preset
    for (auto ch : sweepExe) {
        if (ch != 'G' && ch != 'D') {
            printf("[ERROR] Unrecognized executor type '%c' specified\n", ch);
            exit(1);
        }
    }
    TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
    TransferBench::TestResults results;
    char const sep = (ev.outputToCsv ? ',' : ' ');
    for (char src : sweepSrc) {
        for (char exe : sweepExe) {
            for (char dst : sweepDst) {
                // Skip invalid configurations
                // ('N' = null/no memory; DMA needs both endpoints, and at
                // least one endpoint must exist)
                if ((exe == 'D' && (src == 'N' || dst == 'N')) || (src == 'N' && dst == 'N')) {
                    continue;
                }
                // NOTE: the std::to_string/std::string temporaries below stay
                // alive until the end of this full printf expression, so the
                // c_str() pointers are valid when printf reads them
                printf("Executing (%c%s -> %c%d -> %c%s)\n",
                       src,
                       src == 'N' ? ""
                                  : (sweepDir == 0 ? std::to_string(exeIndex).c_str()
                                                   : std::string("*").c_str()),
                       exe,
                       exeIndex,
                       dst,
                       dst == 'N' ? ""
                       : sweepDir == 0 ? std::string("*").c_str()
                                       : std::to_string(exeIndex).c_str());
                // Column header: one column per possible peer GPU
                for (int i = 0; i < numGpuDevices; i++) {
                    if (i == exeIndex) { continue; }
                    printf(" GPU %-3d %c", i, sep);
                }
                printf("\n");
                if (!ev.outputToCsv) {
                    for (int i = 0; i < numGpuDevices - 1; i++) { printf("-------------"); }
                    printf("\n");
                }
                // Enumerate all subsets of peers with exactly p members;
                // bit i of bitmask selects GPU i as a peer (the executor
                // itself is never a peer)
                for (int p = sweepMin; p <= sweepMax; p++) {
                    for (int bitmask = 0; bitmask < (1 << numGpuDevices); bitmask++) {
                        if (bitmask & (1 << exeIndex) || __builtin_popcount(bitmask) != p) {
                            continue;
                        }
                        // Build one Transfer per selected peer, all executed
                        // by the same executor device
                        std::vector<Transfer> transfers;
                        for (int i = 0; i < numGpuDevices; i++) {
                            if (bitmask & (1 << i)) {
                                Transfer t;
                                CheckForError(
                                    TransferBench::CharToExeType(exe, t.exeDevice.exeType));
                                t.exeDevice.exeIndex = exeIndex;
                                t.exeSubIndex        = -1;
                                t.numSubExecs        = numSubExecs;
                                t.numBytes           = numBytesPerTransfer;
                                if (src == 'N') {
                                    t.srcs.clear();
                                } else {
                                    t.srcs.resize(1);
                                    CheckForError(
                                        TransferBench::CharToMemType(src, t.srcs[0].memType));
                                    t.srcs[0].memIndex = sweepDir == 0 ? exeIndex : i;
                                }
                                if (dst == 'N') {
                                    t.dsts.clear();
                                } else {
                                    t.dsts.resize(1);
                                    CheckForError(
                                        TransferBench::CharToMemType(dst, t.dsts[0].memType));
                                    t.dsts[0].memIndex = sweepDir == 0 ? i : exeIndex;
                                }
                                transfers.push_back(t);
                            }
                        }
                        if (!TransferBench::RunTransfers(cfg, transfers, results)) {
                            PrintErrors(results.errResults);
                            exit(1);
                        }
                        // Print bandwidth under each participating peer's
                        // column; non-participating peers get a blank cell.
                        // tfrResults is indexed in the same ascending-GPU
                        // order used to build transfers above.
                        int counter = 0;
                        for (int i = 0; i < numGpuDevices; i++) {
                            if (bitmask & (1 << i)) {
                                printf(" %8.3f %c",
                                       results.tfrResults[counter++].avgBandwidthGbPerSec,
                                       sep);
                            } else if (i != exeIndex) {
                                printf(" %c", sep);
                            }
                        }
                        printf(" %d %d", p, numSubExecs);
                        // Trailing summary of each Transfer in (SRC EXE DST) form
                        for (auto i = std::size_t(0); i < transfers.size(); i++) {
                            printf(" (%s %c%d %s)",
                                   MemDevicesToStr(transfers[i].srcs).c_str(),
                                   ExeTypeStr[transfers[i].exeDevice.exeType],
                                   transfers[i].exeDevice.exeIndex,
                                   MemDevicesToStr(transfers[i].dsts).c_str());
                        }
                        printf("\n");
                    }
                }
            }
        }
    }
}
#endif // ONE_TO_ALL_PRESET_HPP
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef PEER_TO_PEER_PRESET_HPP
#define PEER_TO_PEER_PRESET_HPP
#include "EnvVars.hpp"
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>
// Measures peer-to-peer memory bandwidth between every (CPU|GPU) src/dst
// device pair, printing one bandwidth matrix per direction mode
// (unidirectional and/or bidirectional, selected by P2P_MODE), optional
// per-iteration min/max/stddev rows, and per-category (CPU/GPU) averages.
// ev                  - environment-variable wrapper / display helper
// numBytesPerTransfer - bytes moved per direction of each Transfer
// presetName          - unused; present to match the PresetFunc signature
void PeerToPeerPreset(EnvVars& ev,
                      size_t const numBytesPerTransfer,
                      [[maybe_unused]] std::string const presetName)
{
    int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
    int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
    // Collect env vars for this preset
    // useDmaCopy:    1 = execute GPU transfers on DMA engines instead of GFX
    // p2pMode:       0 = run both passes, 1 = unidirectional only, 2 = bidirectional only
    // useRemoteRead: 1 = the executor sits on the DST side (remote read)
    int useDmaCopy    = EnvVars::GetEnvVar("USE_GPU_DMA", 0);
    int numCpuDevices = EnvVars::GetEnvVar("NUM_CPU_DEVICES", numDetectedCpus);
    int numCpuSubExecs = EnvVars::GetEnvVar("NUM_CPU_SE", 4);
    int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
    // DMA engines only need one subexecutor; GFX defaults to all CUs of GPU 0
    int numGpuSubExecs = EnvVars::GetEnvVar(
        "NUM_GPU_SE", useDmaCopy ? 1 : TransferBench::GetNumSubExecutors({EXE_GPU_GFX, 0}));
    int p2pMode       = EnvVars::GetEnvVar("P2P_MODE", 0);
    int useFineGrain  = EnvVars::GetEnvVar("USE_FINE_GRAIN", 0);
    int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
    // Display environment variables
    ev.DisplayEnvVars();
    if (!ev.hideEnv) {
        int outputToCsv = ev.outputToCsv;
        if (!outputToCsv) { printf("[P2P Related]\n"); }
        ev.Print("NUM_CPU_DEVICES", numCpuDevices, "Using %d CPUs", numCpuDevices);
        ev.Print("NUM_CPU_SE", numCpuSubExecs, "Using %d CPU threads per Transfer", numCpuSubExecs);
        ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices);
        ev.Print("NUM_GPU_SE",
                 numGpuSubExecs,
                 "Using %d GPU subexecutors/CUs per Transfer",
                 numGpuSubExecs);
        ev.Print("P2P_MODE",
                 p2pMode,
                 "Running %s transfers",
                 p2pMode == 0 ? "Uni + Bi"
                 : p2pMode == 1 ? "Unidirectional"
                                : "Bidirectional");
        ev.Print("USE_FINE_GRAIN",
                 useFineGrain,
                 "Using %s-grained memory",
                 useFineGrain ? "fine" : "coarse");
        ev.Print(
            "USE_GPU_DMA", useDmaCopy, "Using GPU-%s as GPU executor", useDmaCopy ? "DMA" : "GFX");
        ev.Print("USE_REMOTE_READ",
                 useRemoteRead,
                 "Using %s as executor",
                 useRemoteRead ? "DST" : "SRC");
        printf("\n");
    }
    char const separator = ev.outputToCsv ? ',' : ' ';
    printf("Bytes Per Direction%c%lu\n", separator, numBytesPerTransfer);
    TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
    TransferBench::TestResults results;
    // Collect the number of available CPUs/GPUs on this machine
    // Devices are indexed CPUs first [0, numCpuDevices), then GPUs
    int const numDevices = numCpuDevices + numGpuDevices;
    // Perform unidirectional / bidirectional
    for (int isBidirectional = 0; isBidirectional <= 1; isBidirectional++) {
        if (((p2pMode == 1) && (isBidirectional == 1)) ||
            ((p2pMode == 2) && (isBidirectional == 0))) {
            continue;
        }
        printf("%sdirectional copy peak bandwidth GB/s [%s read / %s write] (GPU-Executor: %s)\n",
               isBidirectional ? "Bi" : "Uni",
               useRemoteRead ? "Remote" : "Local",
               useRemoteRead ? "Local" : "Remote",
               useDmaCopy ? "DMA" : "GFX");
        // Print header
        if (isBidirectional) {
            printf("%12s", "SRC\\DST");
        } else {
            if (useRemoteRead) {
                printf("%12s", "SRC\\EXE+DST");
            } else {
                printf("%12s", "SRC+EXE\\DST");
            }
        }
        if (ev.outputToCsv) { printf(","); }
        for (int i = 0; i < numCpuDevices; i++) {
            printf("%7s %02d", "CPU", i);
            if (ev.outputToCsv) { printf(","); }
        }
        if (numCpuDevices > 0) { printf(" "); }
        for (int i = 0; i < numGpuDevices; i++) {
            printf("%7s %02d", "GPU", i);
            if (ev.outputToCsv) { printf(","); }
        }
        printf("\n");
        // Per-category bandwidth accumulators for the final averages table.
        // NOTE(review): indexed directly by MemType values -- assumes
        // MEM_CPU and MEM_GPU are 0 and 1; confirm against the enum
        double avgBwSum[2][2] = {};
        int avgCount[2][2] = {};
        ExeType const gpuExeType = useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
        // Loop over all possible src/dst pairs
        for (int src = 0; src < numDevices; src++) {
            MemType const srcType = (src < numCpuDevices ? MEM_CPU : MEM_GPU);
            int const srcIndex = (srcType == MEM_CPU ? src : src - numCpuDevices);
            // Swap in the fine-grained variant of the memory type if requested
            MemType const srcTypeActual = ((useFineGrain && srcType == MEM_CPU) ? MEM_CPU_FINE
                                           : (useFineGrain && srcType == MEM_GPU) ? MEM_GPU_FINE
                                                                                  : srcType);
            // Per-direction result rows for this src (index 0: SRC->DST,
            // index 1: DST->SRC when bidirectional)
            std::vector<std::vector<double>> avgBandwidth(isBidirectional + 1);
            std::vector<std::vector<double>> minBandwidth(isBidirectional + 1);
            std::vector<std::vector<double>> maxBandwidth(isBidirectional + 1);
            std::vector<std::vector<double>> stdDev(isBidirectional + 1);
            // Blank line between the CPU-source and GPU-source row groups
            if (src == numCpuDevices && src != 0) { printf("\n"); }
            for (int dst = 0; dst < numDevices; dst++) {
                MemType const dstType = (dst < numCpuDevices ? MEM_CPU : MEM_GPU);
                int const dstIndex = (dstType == MEM_CPU ? dst : dst - numCpuDevices);
                MemType const dstTypeActual = ((useFineGrain && dstType == MEM_CPU) ? MEM_CPU_FINE
                                               : (useFineGrain && dstType == MEM_GPU) ? MEM_GPU_FINE
                                                                                      : dstType);
                // Prepare Transfers
                std::vector<Transfer> transfers(isBidirectional + 1);
                // SRC -> DST
                transfers[0].numBytes = numBytesPerTransfer;
                transfers[0].srcs.push_back({srcTypeActual, srcIndex});
                transfers[0].dsts.push_back({dstTypeActual, dstIndex});
                // Executor lives on DST side for remote read, SRC side otherwise
                transfers[0].exeDevice = {
                    IsGpuMemType(useRemoteRead ? dstType : srcType) ? gpuExeType : EXE_CPU,
                    (useRemoteRead ? dstIndex : srcIndex)};
                transfers[0].exeSubIndex = -1;
                transfers[0].numSubExecs = (transfers[0].exeDevice.exeType == gpuExeType)
                                               ? numGpuSubExecs
                                               : numCpuSubExecs;
                // DST -> SRC
                if (isBidirectional) {
                    transfers[1].numBytes = numBytesPerTransfer;
                    transfers[1].srcs.push_back({dstTypeActual, dstIndex});
                    transfers[1].dsts.push_back({srcTypeActual, srcIndex});
                    transfers[1].exeDevice = {
                        IsGpuMemType(useRemoteRead ? srcType : dstType) ? gpuExeType : EXE_CPU,
                        (useRemoteRead ? srcIndex : dstIndex)};
                    transfers[1].exeSubIndex = -1;
                    transfers[1].numSubExecs = (transfers[1].exeDevice.exeType == gpuExeType)
                                                   ? numGpuSubExecs
                                                   : numCpuSubExecs;
                }
                bool skipTest = false;
                // Abort if executing on NUMA node with no CPUs
                for (int i = 0; i <= isBidirectional; i++) {
                    if (transfers[i].exeDevice.exeType == EXE_CPU &&
                        TransferBench::GetNumSubExecutors(transfers[i].exeDevice) == 0) {
                        skipTest = true;
                        break;
                    }
#if defined(__NVCC__)
                    // NVIDIA platform cannot access GPU memory directly from CPU executors
                    if (transfers[i].exeDevice.exeType == EXE_CPU &&
                        (IsGpuMemType(srcType) || IsGpuMemType(dstType))) {
                        skipTest = true;
                        break;
                    }
#endif
                }
                // Bidirectional self-transfer is meaningless; skip it
                if (isBidirectional && srcType == dstType && srcIndex == dstIndex) {
                    skipTest = true;
                }
                if (!skipTest) {
                    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
                        for (auto const& err : results.errResults) {
                            printf("%s\n", err.errMsg.c_str());
                        }
                        exit(1);
                    }
                    for (int dir = 0; dir <= isBidirectional; dir++) {
                        double const avgBw = results.tfrResults[dir].avgBandwidthGbPerSec;
                        avgBandwidth[dir].push_back(avgBw);
                        // Self-transfers are excluded from the category averages
                        if (!(srcType == dstType && srcIndex == dstIndex)) {
                            avgBwSum[srcType][dstType] += avgBw;
                            avgCount[srcType][dstType]++;
                        }
                        if (ev.showIterations) {
                            // Derive min/max bandwidth and stddev from the
                            // per-iteration timings (msec -> GB/s)
                            double minTime = results.tfrResults[dir].perIterMsec[0];
                            double maxTime = minTime;
                            double varSum = 0;
                            for (auto i = std::size_t(0);
                                 i < results.tfrResults[dir].perIterMsec.size();
                                 i++) {
                                minTime = std::min(minTime, results.tfrResults[dir].perIterMsec[i]);
                                maxTime = std::max(maxTime, results.tfrResults[dir].perIterMsec[i]);
                                double const bw = (transfers[dir].numBytes / 1.0E9) /
                                                  results.tfrResults[dir].perIterMsec[i] * 1000.0f;
                                double const delta = (avgBw - bw);
                                varSum += delta * delta;
                            }
                            double const minBw = (transfers[dir].numBytes / 1.0E9) / maxTime *
                                                 1000.0f;
                            double const maxBw = (transfers[dir].numBytes / 1.0E9) / minTime *
                                                 1000.0f;
                            double const stdev = sqrt(varSum /
                                                      results.tfrResults[dir].perIterMsec.size());
                            minBandwidth[dir].push_back(minBw);
                            maxBandwidth[dir].push_back(maxBw);
                            stdDev[dir].push_back(stdev);
                        }
                    }
                } else {
                    // Keep the row vectors aligned with dst by recording
                    // sentinel values for skipped pairs (printed as "N/A")
                    for (int dir = 0; dir <= isBidirectional; dir++) {
                        avgBandwidth[dir].push_back(0);
                        minBandwidth[dir].push_back(0);
                        maxBandwidth[dir].push_back(0);
                        stdDev[dir].push_back(-1.0);
                    }
                }
            }
            // Print one row per direction for this src device
            for (int dir = 0; dir <= isBidirectional; dir++) {
                printf("%5s %02d %3s",
                       (srcType == MEM_CPU) ? "CPU" : "GPU",
                       srcIndex,
                       dir ? "<- " : " ->");
                if (ev.outputToCsv) { printf(","); }
                for (int dst = 0; dst < numDevices; dst++) {
                    if (dst == numCpuDevices && dst != 0) { printf(" "); }
                    double const avgBw = avgBandwidth[dir][dst];
                    if (avgBw == 0.0) {
                        printf("%10s", "N/A");
                    } else {
                        printf("%10.2f", avgBw);
                    }
                    if (ev.outputToCsv) { printf(","); }
                }
                printf("\n");
                if (ev.showIterations) {
                    // minBw
                    printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "min");
                    if (ev.outputToCsv) { printf(","); }
                    for (int i = 0; i < numDevices; i++) {
                        double const minBw = minBandwidth[dir][i];
                        if (i == numCpuDevices && i != 0) { printf(" "); }
                        if (minBw == 0.0) {
                            printf("%10s", "N/A");
                        } else {
                            printf("%10.2f", minBw);
                        }
                        if (ev.outputToCsv) { printf(","); }
                    }
                    printf("\n");
                    // maxBw
                    printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "max");
                    if (ev.outputToCsv) { printf(","); }
                    for (int i = 0; i < numDevices; i++) {
                        double const maxBw = maxBandwidth[dir][i];
                        if (i == numCpuDevices && i != 0) { printf(" "); }
                        if (maxBw == 0.0) {
                            printf("%10s", "N/A");
                        } else {
                            printf("%10.2f", maxBw);
                        }
                        if (ev.outputToCsv) { printf(","); }
                    }
                    printf("\n");
                    // stddev
                    printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, " sd");
                    if (ev.outputToCsv) { printf(","); }
                    for (int i = 0; i < numDevices; i++) {
                        double const sd = stdDev[dir][i];
                        if (i == numCpuDevices && i != 0) { printf(" "); }
                        if (sd == -1.0) {
                            printf("%10s", "N/A");
                        } else {
                            printf("%10.2f", sd);
                        }
                        if (ev.outputToCsv) { printf(","); }
                    }
                    printf("\n");
                }
                fflush(stdout);
            }
            // Combined (sum of both directions) row for bidirectional mode
            if (isBidirectional) {
                printf("%5s %02d %3s", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex, "<->");
                if (ev.outputToCsv) { printf(","); }
                for (int dst = 0; dst < numDevices; dst++) {
                    double const sumBw = avgBandwidth[0][dst] + avgBandwidth[1][dst];
                    if (dst == numCpuDevices && dst != 0) { printf(" "); }
                    if (sumBw == 0.0) {
                        printf("%10s", "N/A");
                    } else {
                        printf("%10.2f", sumBw);
                    }
                    if (ev.outputToCsv) { printf(","); }
                }
                printf("\n");
                if (src < numDevices - 1) { printf("\n"); }
            }
        }
        // Per-category (CPU->CPU, CPU->GPU, GPU->CPU, GPU->GPU) averages
        if (!ev.outputToCsv) {
            printf("                         ");
            for (int srcType : {MEM_CPU, MEM_GPU}) {
                for (int dstType : {MEM_CPU, MEM_GPU}) {
                    printf("  %cPU->%cPU",
                           srcType == MEM_CPU ? 'C' : 'G',
                           dstType == MEM_CPU ? 'C' : 'G');
                }
            }
            printf("\n");
            printf("Averages (During %s):", isBidirectional ? "  BiDir" : " UniDir");
            for (int srcType : {MEM_CPU, MEM_GPU}) {
                for (int dstType : {MEM_CPU, MEM_GPU}) {
                    if (avgCount[srcType][dstType]) {
                        printf("%10.2f", avgBwSum[srcType][dstType] / avgCount[srcType][dstType]);
                    } else {
                        printf("%10s", "N/A");
                    }
                }
            }
            printf("\n\n");
        }
    }
}
#endif // PEER_TO_PEER_PRESET_HPP
...@@ -32,43 +32,49 @@ THE SOFTWARE. ...@@ -32,43 +32,49 @@ THE SOFTWARE.
#include "Scaling.hpp" #include "Scaling.hpp"
#include "Schmoo.hpp" #include "Schmoo.hpp"
#include "Sweep.hpp" #include "Sweep.hpp"
#include <map> #include <map>
#include <utility>
typedef void (*PresetFunc)(EnvVars& ev, typedef void (*PresetFunc)(EnvVars& ev,
size_t const numBytesPerTransfer, size_t const numBytesPerTransfer,
std::string const presetName); std::string const presetName);
std::map<std::string, std::pair<PresetFunc, std::string>> presetFuncMap = std::map<std::string, std::pair<PresetFunc, std::string>> presetFuncMap = {
{ {"a2a", {AllToAllPreset, "Tests parallel transfers between all pairs of GPU devices"}},
{"a2a", {AllToAllPreset, "Tests parallel transfers between all pairs of GPU devices"}}, {"a2a_n",
{"a2a_n", {AllToAllRdmaPreset, "Tests parallel transfers between all pairs of GPU devices using Nearest NIC RDMA transfers"}}, {AllToAllRdmaPreset,
{"a2asweep", {AllToAllSweepPreset, "Test GFX-based all-to-all transfers swept across different CU and GFX unroll counts"}}, "Tests parallel transfers between all pairs of GPU devices using Nearest NIC RDMA "
{"healthcheck", {HealthCheckPreset, "Simple bandwidth health check (MI300X series only)"}}, "transfers"}},
{"one2all", {OneToAllPreset, "Test all subsets of parallel transfers from one GPU to all others"}}, {"a2asweep",
{"p2p" , {PeerToPeerPreset, " Peer-to-peer device memory bandwidth test"}}, {AllToAllSweepPreset,
{"rsweep", {SweepPreset, "Randomly sweep through sets of Transfers"}}, "Test GFX-based all-to-all transfers swept across different CU and GFX unroll counts"}},
{"scaling", {ScalingPreset, "Run scaling test from one GPU to other devices"}}, {"healthcheck", {HealthCheckPreset, "Simple bandwidth health check (MI300X series only)"}},
{"schmoo", {SchmooPreset, "Scaling tests for local/remote read/write/copy"}}, {"one2all",
{"sweep", {SweepPreset, "Ordered sweep through sets of Transfers"}}, {OneToAllPreset, "Test all subsets of parallel transfers from one GPU to all others"}},
{"p2p", {PeerToPeerPreset, " Peer-to-peer device memory bandwidth test"}},
{"rsweep", {SweepPreset, "Randomly sweep through sets of Transfers"}},
{"scaling", {ScalingPreset, "Run scaling test from one GPU to other devices"}},
{"schmoo", {SchmooPreset, "Scaling tests for local/remote read/write/copy"}},
{"sweep", {SweepPreset, "Ordered sweep through sets of Transfers"}},
}; };
void DisplayPresets() void DisplayPresets()
{ {
printf("\nAvailable Preset Benchmarks:\n"); printf("\nAvailable Preset Benchmarks:\n");
printf("============================\n"); printf("============================\n");
for (auto const& x : presetFuncMap) for (auto const& x : presetFuncMap) {
printf(" %15s - %s\n", x.first.c_str(), x.second.second.c_str()); printf(" %15s - %s\n", x.first.c_str(), x.second.second.c_str());
}
} }
int RunPreset(EnvVars& ev, int RunPreset(EnvVars& ev, size_t const numBytesPerTransfer, int const argc, char** const argv)
size_t const numBytesPerTransfer,
int const argc,
char** const argv)
{ {
std::string preset = (argc > 1 ? argv[1] : ""); std::string preset = (argc > 1 ? argv[1] : "");
if (presetFuncMap.count(preset)) { if (presetFuncMap.count(preset)) {
(presetFuncMap[preset].first)(ev, numBytesPerTransfer, preset); (presetFuncMap[preset].first)(ev, numBytesPerTransfer, preset);
return 1; return 1;
} }
return 0; return 0;
} }
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef SCALING_PRESET_HPP
#define SCALING_PRESET_HPP
#include "EnvVars.hpp"
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>
// Sweeps the number of GPU subexecutors (CUs) used by a single Transfer from
// one "local" GPU to every other device, reporting per-device bandwidth for
// each CU count plus the best (bandwidth, CU count) pair per device.
// ev                  - environment-variable wrapper / display helper
// numBytesPerTransfer - bytes moved by the Transfer
// presetName          - unused; present to match the PresetFunc signature
void ScalingPreset(EnvVars& ev,
                   size_t const numBytesPerTransfer,
                   [[maybe_unused]] std::string const presetName)
{
    int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
    int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
    // Collect env vars for this preset
    int localIdx      = EnvVars::GetEnvVar("LOCAL_IDX", 0);
    int numCpuDevices = EnvVars::GetEnvVar("NUM_CPU_DEVICES", numDetectedCpus);
    int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
    int sweepMax      = EnvVars::GetEnvVar("SWEEP_MAX", 32);
    int sweepMin      = EnvVars::GetEnvVar("SWEEP_MIN", 1);
    // NOTE(review): collected but never applied here -- SchmooPreset uses this
    // to select MEM_GPU_FINE; confirm whether scaling should honor it too
    [[maybe_unused]] int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN", 0);
    // Display environment variables
    ev.DisplayEnvVars();
    if (!ev.hideEnv) {
        int outputToCsv = ev.outputToCsv;
        // Fixed copy-paste label: this section previously printed "[Schmoo Related]"
        if (!outputToCsv) { printf("[Scaling Related]\n"); }
        ev.Print("LOCAL_IDX", localIdx, "Local GPU index");
        ev.Print("SWEEP_MAX", sweepMax, "Max number of subExecutors to use");
        ev.Print("SWEEP_MIN", sweepMin, "Min number of subExecutors to use");
        printf("\n");
    }
    // Validate env vars
    if (localIdx >= numDetectedGpus) {
        printf("[ERROR] Cannot execute scaling test with local GPU device %d\n", localIdx);
        exit(1);
    }
    TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
    TransferBench::TestResults results;
    char separator = (ev.outputToCsv ? ',' : ' ');
    int numDevices = numCpuDevices + numGpuDevices;
    printf("GPU-GFX Scaling benchmark:\n");
    printf("==========================\n");
    printf("- Copying %lu bytes from GPU %d to other devices\n", numBytesPerTransfer, localIdx);
    printf("- All numbers reported as GB/sec\n\n");
    printf("NumCUs");
    // Column header: CPUs are listed first; GPU indices restart after the CPUs
    for (int i = 0; i < numDevices; i++) {
        printf("%c %s%02d ",
               separator,
               i < numCpuDevices ? "CPU" : "GPU",
               i < numCpuDevices ? i : i - numCpuDevices);
    }
    printf("\n");
    // bestResult[i] = (best bandwidth seen for device i, CU count achieving it);
    // value-initialized to (0.0, 0) so any real measurement replaces it
    std::vector<std::pair<double, int>> bestResult(numDevices);
    std::vector<Transfer> transfers(1);
    Transfer& t   = transfers[0];
    t.exeDevice   = {EXE_GPU_GFX, localIdx};
    t.exeSubIndex = -1;
    t.numBytes    = numBytesPerTransfer;
    t.srcs        = {{MEM_GPU, localIdx}};
    for (int numSubExec = sweepMin; numSubExec <= sweepMax; numSubExec++) {
        t.numSubExecs = numSubExec;
        printf("%4d ", numSubExec);
        for (int i = 0; i < numDevices; i++) {
            t.dsts = {
                {i < numCpuDevices ? MEM_CPU : MEM_GPU, i < numCpuDevices ? i : i - numCpuDevices}};
            // Qualified call for consistency with the other presets
            if (!TransferBench::RunTransfers(cfg, transfers, results)) {
                PrintErrors(results.errResults);
                exit(1);
            }
            double bw = results.tfrResults[0].avgBandwidthGbPerSec;
            printf("%c%7.2f ", separator, bw);
            if (bw > bestResult[i].first) {
                bestResult[i].first  = bw;
                bestResult[i].second = numSubExec;
            }
        }
        printf("\n");
    }
    printf(" Best ");
    for (int i = 0; i < numDevices; i++) {
        printf("%c%7.2f(%3d)", separator, bestResult[i].first, bestResult[i].second);
    }
    printf("\n");
}
#endif // SCALING_PRESET_HPP
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef SCHMOO_PRESET_HPP
#define SCHMOO_PRESET_HPP
#include "EnvVars.hpp"
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>
// Sweeps the number of CUs used by a single Transfer across six patterns
// (local/remote x read/write/copy) between a local and a remote GPU,
// printing one bandwidth table row per CU count.
// ev                  - environment-variable wrapper / display helper
// numBytesPerTransfer - bytes moved by the Transfer
// presetName          - unused; present to match the PresetFunc signature
void SchmooPreset(EnvVars& ev,
                  size_t const numBytesPerTransfer,
                  [[maybe_unused]] std::string const presetName)
{
    int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
    if (numDetectedGpus < 2) {
        printf("[ERROR] Schmoo benchmark requires at least 2 GPUs\n");
        exit(1);
    }
    // Collect env vars for this preset
    int localIdx     = EnvVars::GetEnvVar("LOCAL_IDX", 0);
    int remoteIdx    = EnvVars::GetEnvVar("REMOTE_IDX", 1);
    int sweepMax     = EnvVars::GetEnvVar("SWEEP_MAX", 32);
    int sweepMin     = EnvVars::GetEnvVar("SWEEP_MIN", 1);
    int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN", 0);
    // Display environment variables
    ev.DisplayEnvVars();
    if (!ev.hideEnv) {
        int outputToCsv = ev.outputToCsv;
        if (!outputToCsv) { printf("[Schmoo Related]\n"); }
        ev.Print("LOCAL_IDX", localIdx, "Local GPU index");
        ev.Print("REMOTE_IDX", remoteIdx, "Remote GPU index");
        ev.Print("SWEEP_MAX", sweepMax, "Max number of subExecutors to use");
        ev.Print("SWEEP_MIN", sweepMin, "Min number of subExecutors to use");
        ev.Print("USE_FINE_GRAIN",
                 useFineGrain,
                 "Using %s-grained memory",
                 useFineGrain ? "fine" : "coarse");
        printf("\n");
    }
    // Validate env vars
    if (localIdx >= numDetectedGpus || remoteIdx >= numDetectedGpus) {
        printf(
            "[ERROR] Cannot execute schmoo test with local GPU device %d, remote GPU device %d\n",
            localIdx,
            remoteIdx);
        exit(1);
    }
    TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
    TransferBench::TestResults results;
    char memChar = useFineGrain ? 'F' : 'G';
    printf("Bytes to transfer: %lu Local GPU: %d Remote GPU: %d\n",
           numBytesPerTransfer,
           localIdx,
           remoteIdx);
    printf(
        " | Local Read | Local Write | Local Copy | Remote Read | Remote Write| Remote "
        "Copy |\n");
    printf(
        " #CUs "
        "|%c%02d->G%02d->N00|N00->G%02d->%c%02d|%c%02d->G%02d->%c%02d|%c%02d->G%02d->N00|N00->G%"
        "02d->%c%02d|%c%02d->G%02d->%c%02d|\n",
        memChar,
        localIdx,
        localIdx,
        localIdx,
        memChar,
        localIdx,
        memChar,
        localIdx,
        localIdx,
        memChar,
        localIdx,
        memChar,
        remoteIdx,
        localIdx,
        localIdx,
        memChar,
        remoteIdx,
        memChar,
        localIdx,
        localIdx,
        memChar,
        remoteIdx);
    printf(
        "|------|-------------|-------------|-------------|-------------|-------------|------------"
        "-|\n");
    std::vector<Transfer> transfers(1);
    Transfer& t   = transfers[0];
    t.exeDevice   = {EXE_GPU_GFX, localIdx};
    t.exeSubIndex = -1;
    t.numBytes    = numBytesPerTransfer;
    MemType memType = (useFineGrain ? MEM_GPU_FINE : MEM_GPU);
    // Runs the currently configured Transfer and returns its bandwidth;
    // aborts the preset on any library error
    auto measureBw = [&]() -> double {
        if (!RunTransfers(cfg, transfers, results)) {
            PrintErrors(results.errResults);
            exit(1);
        }
        return results.tfrResults[0].avgBandwidthGbPerSec;
    };
    for (int numCUs = sweepMin; numCUs <= sweepMax; numCUs++) {
        t.numSubExecs = numCUs;
        // Local Read (no destination)
        t.srcs = {{memType, localIdx}};
        t.dsts = {};
        double const localRead = measureBw();
        // Local Write (no source)
        t.srcs = {};
        t.dsts = {{memType, localIdx}};
        double const localWrite = measureBw();
        // Local Copy
        // BUGFIX: the previous code immediately overwrote this src/dst setup
        // with the Local Write configuration, so the value reported in the
        // "Local Copy" column was actually a second local write measurement
        t.srcs = {{memType, localIdx}};
        t.dsts = {{memType, localIdx}};
        double const localCopy = measureBw();
        // Remote Read
        t.srcs = {{memType, remoteIdx}};
        t.dsts = {};
        double const remoteRead = measureBw();
        // Remote Write
        t.srcs = {};
        t.dsts = {{memType, remoteIdx}};
        double const remoteWrite = measureBw();
        // Remote Copy
        t.srcs = {{memType, localIdx}};
        t.dsts = {{memType, remoteIdx}};
        double const remoteCopy = measureBw();
        printf(" %3d %11.3f %11.3f %11.3f %11.3f %11.3f %11.3f \n",
               numCUs,
               localRead,
               localWrite,
               localCopy,
               remoteRead,
               remoteWrite,
               remoteCopy);
    }
}
#endif // SCHMOO_PRESET_HPP
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef SWEEP_PRESET_HPP
#define SWEEP_PRESET_HPP
#include "EnvVars.hpp"
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>
/// @brief Appends one test's Transfer list to the sweep configuration file
///
/// Writes a "# Test N" header line followed by a single config line describing
/// every Transfer in this test, then flushes so the file stays usable even if
/// the sweep is interrupted. No-op when @p fp is null (file could not be opened).
///
/// @param fp        Open configuration file, or nullptr to skip logging
/// @param testNum   1-based test number used in the header comment
/// @param transfers Transfers executed for this test
void LogTransfers(FILE* fp, int const testNum, std::vector<Transfer> const& transfers)
{
    if (!fp) { return; }

    fprintf(fp, "# Test %d\n", testNum);
    // Leading negative count — presumably flags the "advanced" config-line
    // format to the TransferBench config parser; confirm against the parser
    fprintf(fp, "%d", -1 * (int)transfers.size());
    for (auto const& t : transfers) {
        fprintf(fp,
                " (%s->%c%d->%s %d %lu)",
                MemDevicesToStr(t.srcs).c_str(),
                ExeTypeStr[t.exeDevice.exeType],
                t.exeDevice.exeIndex,
                MemDevicesToStr(t.dsts).c_str(),
                t.numSubExecs,
                t.numBytes);
    }
    fprintf(fp, "\n");
    fflush(fp);
}
/// @brief Runs the "sweep" / "rsweep" preset
///
/// Builds the list of every valid (SRC -> EXE -> DST) triplet permitted by the
/// SWEEP_* environment variables (memory/executor type filters plus XGMI hop
/// restrictions), then repeatedly executes subsets of those triplets as
/// simultaneous Transfers:
///   - "sweep"  walks subsets deterministically via bitmask permutations,
///              growing the subset size from SWEEP_MIN up to SWEEP_MAX
///   - "rsweep" picks a random subset size and random members each iteration
/// Each executed configuration is also appended to SWEEP_FILE for later replay.
///
/// @param ev                  Environment variable state (config options / printing)
/// @param numBytesPerTransfer Bytes per Transfer (upper bound when SWEEP_RAND_BYTES=1)
/// @param presetName          "rsweep" selects randomized sweeping; any other
///                            value runs the deterministic sweep
void SweepPreset(EnvVars& ev, size_t const numBytesPerTransfer, std::string const presetName)
{
    bool const isRandom = (presetName == "rsweep");

    int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
    int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);

    // Collect env vars and set defaults
    int continueOnErr     = EnvVars::GetEnvVar("CONTINUE_ON_ERROR", 0);
    int numCpuDevices     = EnvVars::GetEnvVar("NUM_CPU_DEVICES", numDetectedCpus);
    int numCpuSubExecs    = EnvVars::GetEnvVar("NUM_CPU_SE", 4);
    int numGpuDevices     = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
    int numGpuSubExecs    = EnvVars::GetEnvVar("NUM_GPU_SE", 4);
    std::string sweepDst  = EnvVars::GetEnvVar("SWEEP_DST", "CG");
    std::string sweepExe  = EnvVars::GetEnvVar("SWEEP_EXE", "CDG");
    std::string sweepFile = EnvVars::GetEnvVar("SWEEP_FILE", "/tmp/lastSweep.cfg");
    int sweepMax          = EnvVars::GetEnvVar("SWEEP_MAX", 24);
    int sweepMin          = EnvVars::GetEnvVar("SWEEP_MIN", 1);
    int sweepRandBytes    = EnvVars::GetEnvVar("SWEEP_RAND_BYTES", 0);
    int sweepSeed         = EnvVars::GetEnvVar("SWEEP_SEED", time(NULL));
    std::string sweepSrc  = EnvVars::GetEnvVar("SWEEP_SRC", "CG");
    int sweepTestLimit    = EnvVars::GetEnvVar("SWEEP_TEST_LIMIT", 0);
    int sweepTimeLimit    = EnvVars::GetEnvVar("SWEEP_TIME_LIMIT", 0);
    int sweepXgmiMin      = EnvVars::GetEnvVar("SWEEP_XGMI_MIN", 0);
    int sweepXgmiMax      = EnvVars::GetEnvVar("SWEEP_XGMI_MAX", -1);

    // Own the RNG by value; it was previously heap-allocated with a raw 'new'
    // and never deleted (memory leak)
    std::default_random_engine generator(sweepSeed);

    // Display env var settings
    ev.DisplayEnvVars();
    if (!ev.hideEnv) {
        int outputToCsv = ev.outputToCsv;
        if (!outputToCsv) { printf("[Sweep Related]\n"); }
        ev.Print("CONTINUE_ON_ERROR",
                 continueOnErr,
                 continueOnErr ? "Continue on mismatch error" : "Stop after first error");
        ev.Print("NUM_CPU_DEVICES", numCpuDevices, "Using %d CPUs", numCpuDevices);
        ev.Print("NUM_CPU_SE",
                 numCpuSubExecs,
                 "Using %d CPU threads per CPU executed Transfer",
                 numCpuSubExecs);
        ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Using %d GPUs", numGpuDevices);
        ev.Print("NUM_GPU_SE",
                 numGpuSubExecs,
                 "Using %d subExecutors/CUs per GPU executed Transfer",
                 numGpuSubExecs);
        ev.Print("SWEEP_DST", sweepDst.c_str(), "Destination Memory Types to sweep");
        ev.Print("SWEEP_EXE", sweepExe.c_str(), "Executor Types to sweep");
        ev.Print(
            "SWEEP_FILE", sweepFile.c_str(), "File to store the executing sweep configuration");
        ev.Print("SWEEP_MAX", sweepMax, "Max simultaneous transfers (0 = no limit)");
        // (typo fix: previously printed "simultaenous")
        ev.Print("SWEEP_MIN", sweepMin, "Min simultaneous transfers");
        ev.Print("SWEEP_RAND_BYTES",
                 sweepRandBytes,
                 "Using %s number of bytes per Transfer",
                 (sweepRandBytes ? "random" : "constant"));
        ev.Print("SWEEP_SEED", sweepSeed, "Random seed set to %d", sweepSeed);
        ev.Print("SWEEP_SRC", sweepSrc.c_str(), "Source Memory Types to sweep");
        ev.Print("SWEEP_TEST_LIMIT",
                 sweepTestLimit,
                 "Max number of tests to run during sweep (0 = no limit)");
        ev.Print("SWEEP_TIME_LIMIT",
                 sweepTimeLimit,
                 "Max number of seconds to run sweep for (0 = no limit)");
        ev.Print("SWEEP_XGMI_MAX",
                 sweepXgmiMax,
                 "Max number of XGMI hops for Transfers (-1 = no limit)");
        ev.Print("SWEEP_XGMI_MIN", sweepXgmiMin, "Min number of XGMI hops for Transfers");
        printf("\n");
    }

    // Validate env vars: every type character must be recognized and appear only once
    for (auto ch : sweepSrc) {
        if (!strchr(MemTypeStr, ch)) {
            printf("[ERROR] Unrecognized memory type '%c' specified for sweep source\n", ch);
            exit(1);
        }
        if (strchr(sweepSrc.c_str(), ch) != strrchr(sweepSrc.c_str(), ch)) {
            printf("[ERROR] Duplicate memory type '%c' specified for sweep source\n", ch);
            exit(1);
        }
    }
    for (auto ch : sweepDst) {
        if (!strchr(MemTypeStr, ch)) {
            printf("[ERROR] Unrecognized memory type '%c' specified for sweep destination\n", ch);
            exit(1);
        }
        if (strchr(sweepDst.c_str(), ch) != strrchr(sweepDst.c_str(), ch)) {
            printf("[ERROR] Duplicate memory type '%c' specified for sweep destination\n", ch);
            exit(1);
        }
    }
    for (auto ch : sweepExe) {
        if (!strchr(ExeTypeStr, ch)) {
            printf("[ERROR] Unrecognized executor type '%c' specified for sweep executor\n", ch);
            exit(1);
        }
        if (strchr(sweepExe.c_str(), ch) != strrchr(sweepExe.c_str(), ch)) {
            printf("[ERROR] Duplicate executor type '%c' specified for sweep executor\n", ch);
            exit(1);
        }
    }

    TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
    TransferBench::TestResults results;

    // Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets)
    std::vector<ExeDevice> exeList;
    for (auto exe : sweepExe) {
        ExeType exeType;
        CharToExeType(exe, exeType);
        if (IsGpuExeType(exeType)) {
            for (int exeIndex = 0; exeIndex < numGpuDevices; ++exeIndex) {
                exeList.push_back({exeType, exeIndex});
            }
        } else if (IsCpuExeType(exeType)) {
            for (int exeIndex = 0; exeIndex < numCpuDevices; ++exeIndex) {
                // Skip NUMA nodes that have no CPUs (e.g. CXL)
                if (TransferBench::GetNumSubExecutors({EXE_CPU, exeIndex}) == 0) { continue; }
                exeList.push_back({exeType, exeIndex});
            }
        }
    }
    int numExes = exeList.size();

    std::vector<MemDevice> srcList;
    for (auto src : sweepSrc) {
        MemType srcType;
        CharToMemType(src, srcType);
        // MEM_NULL contributes a single "no source" placeholder entry
        int const numDevices = (srcType == MEM_NULL)  ? 1
                               : IsGpuMemType(srcType) ? numGpuDevices
                                                       : numCpuDevices;
        for (int srcIndex = 0; srcIndex < numDevices; ++srcIndex) {
            srcList.push_back({srcType, srcIndex});
        }
    }
    int numSrcs = srcList.size();

    std::vector<MemDevice> dstList;
    for (auto dst : sweepDst) {
        MemType dstType;
        CharToMemType(dst, dstType);
        int const numDevices = (dstType == MEM_NULL)  ? 1
                               : IsGpuMemType(dstType) ? numGpuDevices
                                                       : numCpuDevices;
        for (int dstIndex = 0; dstIndex < numDevices; ++dstIndex) {
            dstList.push_back({dstType, dstIndex});
        }
    }
    int numDsts = dstList.size();

    // Build array of possibilities, respecting any additional restrictions (e.g. XGMI hop count)
    struct TransferInfo
    {
        MemDevice srcMem;
        ExeDevice exeDevice;
        MemDevice dstMem;
    };

    // If either XGMI minimum is non-zero, or XGMI maximum is specified and non-zero then both
    // links must be XGMI
    bool const useXgmiOnly = (sweepXgmiMin > 0 || sweepXgmiMax > 0);

    std::vector<TransferInfo> possibleTransfers;
    TransferInfo tinfo;
    for (int i = 0; i < numExes; ++i) {
        // Skip CPU executors if XGMI link must be used
        if (useXgmiOnly && !IsGpuExeType(exeList[i].exeType)) { continue; }
        tinfo.exeDevice = exeList[i];

        bool isXgmiSrc = false;
        int numHopsSrc = 0;
        for (int j = 0; j < numSrcs; ++j) {
            if (IsGpuExeType(exeList[i].exeType) && IsGpuMemType(srcList[j].memType)) {
                if (exeList[i].exeIndex != srcList[j].memIndex) {
#if defined(__NVCC__)
                    isXgmiSrc = false;
#else
                    uint32_t exeToSrcLinkType, exeToSrcHopCount;
                    HIP_CALL(hipExtGetLinkTypeAndHopCount(exeList[i].exeIndex,
                                                          srcList[j].memIndex,
                                                          &exeToSrcLinkType,
                                                          &exeToSrcHopCount));
                    isXgmiSrc = (exeToSrcLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
                    if (isXgmiSrc) { numHopsSrc = exeToSrcHopCount; }
#endif
                } else {
                    // Same-device access counts as XGMI with zero hops
                    isXgmiSrc  = true;
                    numHopsSrc = 0;
                }
                // Skip this SRC if it is not XGMI but only XGMI links may be used
                if (useXgmiOnly && !isXgmiSrc) { continue; }
                // Skip this SRC if XGMI distance is already past limit
                if (sweepXgmiMax >= 0 && isXgmiSrc && numHopsSrc > sweepXgmiMax) { continue; }
            } else if (srcList[j].memType != MEM_NULL && useXgmiOnly) {
                continue;
            }
            tinfo.srcMem = srcList[j];

            bool isXgmiDst = false;
            int numHopsDst = 0;
            for (int k = 0; k < numDsts; ++k) {
                if (IsGpuExeType(exeList[i].exeType) && IsGpuMemType(dstList[k].memType)) {
                    if (exeList[i].exeIndex != dstList[k].memIndex) {
#if defined(__NVCC__)
                        // No XGMI link info on NVIDIA platform
                        // (bug fix: this previously cleared isXgmiSrc instead of isXgmiDst)
                        isXgmiDst = false;
#else
                        uint32_t exeToDstLinkType, exeToDstHopCount;
                        HIP_CALL(hipExtGetLinkTypeAndHopCount(exeList[i].exeIndex,
                                                              dstList[k].memIndex,
                                                              &exeToDstLinkType,
                                                              &exeToDstHopCount));
                        isXgmiDst = (exeToDstLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
                        if (isXgmiDst) { numHopsDst = exeToDstHopCount; }
#endif
                    } else {
                        isXgmiDst  = true;
                        numHopsDst = 0;
                    }
                }
                // Skip this DST if it is not XGMI but only XGMI links may be used
                if (dstList[k].memType != MEM_NULL && useXgmiOnly && !isXgmiDst) { continue; }
                // Skip this DST if total XGMI distance (SRC + DST) is less than min limit
                if (sweepXgmiMin > 0 && (numHopsSrc + numHopsDst < sweepXgmiMin)) { continue; }
                // Skip this DST if total XGMI distance (SRC + DST) is greater than max limit
                if (sweepXgmiMax >= 0 && (numHopsSrc + numHopsDst) > sweepXgmiMax) { continue; }
#if defined(__NVCC__)
                // Skip CPU executors on GPU memory on NVIDIA platform
                // (bug fix: the SRC operand previously indexed dstList with the SRC index j,
                // which tested the wrong device and could read out of bounds)
                if (IsCpuExeType(exeList[i].exeType) &&
                    (IsGpuMemType(srcList[j].memType) || IsGpuMemType(dstList[k].memType))) {
                    continue;
                }
#endif
                tinfo.dstMem = dstList[k];
                // Skip if there is no src and dst
                if (tinfo.srcMem.memType == MEM_NULL && tinfo.dstMem.memType == MEM_NULL) {
                    continue;
                }
                possibleTransfers.push_back(tinfo);
            }
        }
    }
    int const numPossible    = (int)possibleTransfers.size();
    int maxParallelTransfers = (sweepMax == 0 ? numPossible : sweepMax);
    if (sweepMin > numPossible) {
        printf("No valid test configurations exist\n");
        return;
    }

    if (ev.outputToCsv) {
        printf(
            "\nTest#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),"
            "ExeToSrcLinkType,ExeToDstLinkType,SrcAddr,DstAddr\n");
    }

    int numTestsRun = 0;
    int M           = sweepMin;  // Current number of simultaneous Transfers per test
    std::uniform_int_distribution<int> randSize(1, numBytesPerTransfer / sizeof(float));
    std::uniform_int_distribution<int> distribution(sweepMin, maxParallelTransfers);

    // Log sweep to configuration file
    char absPath[1024];
    auto const res = realpath(sweepFile.c_str(), absPath);
    FILE* fp       = fopen(sweepFile.c_str(), "w");
    if (!fp) {
        printf("[WARN] Unable to open %s. Skipping output of sweep configuration file\n",
               res ? absPath : sweepFile.c_str());
    } else {
        printf("Sweep configuration saved to: %s\n", res ? absPath : sweepFile.c_str());
    }

    // Create bitmask of numPossible triplets, of which M will be chosen
    std::string bitmask(M, 1);
    bitmask.resize(numPossible, 0);

    auto cpuStart = std::chrono::high_resolution_clock::now();
    while (1) {
        if (isRandom) {
            // Pick random number of simultaneous transfers to execute
            // NOTE: This currently skews distribution due to some #s having more possibilities
            // than others
            M = distribution(generator);
            // Generate a random bitmask
            for (int i = 0; i < numPossible; i++) { bitmask[i] = (i < M) ? 1 : 0; }
            std::shuffle(bitmask.begin(), bitmask.end(), generator);
        }

        // Convert bitmask to list of Transfers
        std::vector<Transfer> transfers;
        for (int value = 0; value < numPossible; ++value) {
            if (bitmask[value]) {
                // Convert integer value to (SRC->EXE->DST) triplet
                Transfer transfer;
                if (possibleTransfers[value].srcMem.memType != MEM_NULL) {
                    transfer.srcs.push_back(possibleTransfers[value].srcMem);
                }
                transfer.exeDevice = possibleTransfers[value].exeDevice;
                if (possibleTransfers[value].dstMem.memType != MEM_NULL) {
                    transfer.dsts.push_back(possibleTransfers[value].dstMem);
                }
                transfer.exeSubIndex = -1;
                transfer.numSubExecs = IsGpuExeType(transfer.exeDevice.exeType) ? numGpuSubExecs
                                                                                : numCpuSubExecs;
                transfer.numBytes = sweepRandBytes ? randSize(generator) * sizeof(float)
                                                   : numBytesPerTransfer;
                transfers.push_back(transfer);
            }
        }

        LogTransfers(fp, ++numTestsRun, transfers);

        if (!TransferBench::RunTransfers(cfg, transfers, results)) {
            PrintErrors(results.errResults);
            if (!continueOnErr) { exit(1); }
        } else {
            PrintResults(ev, numTestsRun, transfers, results);
        }

        // Check for test limit
        if (numTestsRun == sweepTestLimit) {
            printf("Sweep Test limit reached\n");
            break;
        }

        // Check for time limit
        auto cpuDelta       = std::chrono::high_resolution_clock::now() - cpuStart;
        double totalCpuTime = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta)
                                  .count();
        if (sweepTimeLimit && totalCpuTime > sweepTimeLimit) {
            printf("Sweep Time limit exceeded\n");
            break;
        }

        // Increment bitmask if not random sweep
        if (!isRandom && !std::prev_permutation(bitmask.begin(), bitmask.end())) {
            M++;
            // Check for completion
            if (M > maxParallelTransfers) {
                printf("Sweep complete\n");
                break;
            }
            for (int i = 0; i < numPossible; i++) { bitmask[i] = (i < M) ? 1 : 0; }
        }
    }
    if (fp) { fclose(fp); }
}
#endif // SWEEP_PRESET_HPP
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment