Unverified Commit 023ce416 authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

TransferBench v1.63 (#193)



* Fixing issue with P memory type and use of DMA subexecutor
* CMake builds require explicit opt-in by setting NIC_EXEC_ENABLE=1
* Removing self-GPU check for DMA engine copies
* [BUILD] Add new GPU targets and switch to amdclang++ (#187)
* [BUILD] Add gfx950, gfx1150, and gfx1151 targets
* [BUILD] Modify CMake to use amdclang++
* [BUILD] Modify Makefile to use amdclang++
* [GIT] Updated CHANGELOG and .gitignore
* Adding HBM testing to healthcheck preset
* Tweaking HBM tests to occur first, and provide more info during VERBOSE=1
* Fixing timing reporting issues with NUM_SUBITERATIONS
* [BUILD] Simplify Makefile (#190)
* Combines steps for compilation and linking
* Does not rebuild if no change to source code

* Updating CHANGELOG

---------
Co-authored-by: Nilesh M Negi <Nilesh.Negi@amd.com>
parent a4709f4b
......@@ -7,3 +7,4 @@ _templates/
_toc.yml
docBin/
TransferBench
*.o
......@@ -3,6 +3,19 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.63.00
### Added
- Added `gfx950`, `gfx1150`, and `gfx1151` to default GPU targets list in CMake builds
### Modified
- Removing self-GPU check for DMA engine copies
- Switched to amdclang++ as primary compiler
- healthcheck preset adds HBM testing and support for more MI3XX variants
### Fixed
- Fixed issue when using "P" memory type and specific DMA subengines
- Fixed issue with subiteration timing reports
## v1.62.00
### Added
- Adding GFX_TEMPORAL to allow for use of non-temporal loads/stores
......
# Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
if (DEFINED ENV{ROCM_PATH})
set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE STRING "ROCm install directory")
else()
set(ROCM_PATH "/opt/rocm" CACHE STRING "ROCm install directory")
cmake_minimum_required(VERSION 3.5 FATAL_ERROR)
# CMake Toolchain file to define compilers and path to ROCm
#==================================================================================================
if (NOT CMAKE_TOOLCHAIN_FILE)
set(CMAKE_TOOLCHAIN_FILE "${CMAKE_CURRENT_SOURCE_DIR}/toolchain-linux.cmake")
message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}")
endif()
cmake_minimum_required(VERSION 3.5)
project(TransferBench VERSION 1.62.00 LANGUAGES CXX)
set(VERSION_STRING "1.63.00")
project(TransferBench VERSION ${VERSION_STRING} LANGUAGES CXX)
## Load CMake modules
#==================================================================================================
include(CheckIncludeFiles)
include(CheckSymbolExists)
include(cmake/Dependencies.cmake) # rocm-cmake, rocm_local_targets
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
# Build options
#==================================================================================================
option(BUILD_LOCAL_GPU_TARGET_ONLY "Build only for GPUs detected on this machine" OFF)
option(ENABLE_NIC_EXEC "Enable RDMA NIC Executor in TransferBench" OFF)
# Default GPU architectures to build
#==================================================================================================
......@@ -16,15 +32,18 @@ set(DEFAULT_GPUS
gfx908
gfx90a
gfx942
gfx950
gfx1030
gfx1100
gfx1101
gfx1102
gfx1150
gfx1151
gfx1200
gfx1201)
# Build only for local GPU architecture
if (BUILD_LOCAL_GPU_TARGET_ONLY)
## Build only for local GPU architecture
if(BUILD_LOCAL_GPU_TARGET_ONLY)
message(STATUS "Building only for local GPU target")
if (COMMAND rocm_local_targets)
rocm_local_targets(DEFAULT_GPUS)
......@@ -33,10 +52,10 @@ if (BUILD_LOCAL_GPU_TARGET_ONLY)
endif()
endif()
# Determine which GPU architectures to build for
## Determine which GPU architectures to build for
set(GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if GPU_TARGETS is not defined.")
# Check if clang compiler can offload to GPU_TARGETS
## Check if clang compiler can offload to GPU_TARGETS
if (COMMAND rocm_check_target_ids)
message(STATUS "Checking for ROCm support for GPU targets: " "${GPU_TARGETS}")
rocm_check_target_ids(SUPPORTED_GPUS TARGETS ${GPU_TARGETS})
......@@ -45,50 +64,124 @@ else()
set(SUPPORTED_GPUS ${DEFAULT_GPUS})
endif()
set(COMPILING_TARGETS "${SUPPORTED_GPUS}" CACHE STRING "GPU targets to compile for.")
message(STATUS "Compiling for ${COMPILING_TARGETS}")
set(GPU_TARGETS "${SUPPORTED_GPUS}")
message(STATUS "Compiling for ${GPU_TARGETS}")
foreach(target ${COMPILING_TARGETS})
list(APPEND static_link_flags --offload-arch=${target})
endforeach()
list(JOIN static_link_flags " " flags_str)
set( CMAKE_CXX_FLAGS "${flags_str} ${CMAKE_CXX_FLAGS}")
## NOTE: Reload rocm-cmake in order to update GPU_TARGETS
include(cmake/Dependencies.cmake) # Reloading to use desired GPU_TARGETS instead of defaults
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib")
include_directories(${ROCM_PATH}/include)
find_library(IBVERBS_LIBRARY ibverbs)
find_path(IBVERBS_INCLUDE_DIR infiniband/verbs.h)
if (DEFINED ENV{DISABLE_NIC_EXEC})
message(STATUS "Disabling NIC Executor support")
elseif(IBVERBS_LIBRARY AND IBVERBS_INCLUDE_DIR)
message(STATUS "Found ibverbs: ${IBVERBS_LIBRARY}. Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable")
add_definitions(-DNIC_EXEC_ENABLED)
link_libraries(ibverbs)
# Check for required dependencies
#==================================================================================================
## Try to establish ROCM_PATH (for find_package)
if(NOT DEFINED ROCM_PATH)
# Guess default location
set(ROCM_PATH "/opt/rocm")
message(WARNING "Unable to find ROCM_PATH: Falling back to ${ROCM_PATH}")
else()
if (NOT IBVERBS_LIBRARY)
message(STATUS "ROCM_PATH found: ${ROCM_PATH}")
endif()
set(ENV{ROCM_PATH} ${ROCM_PATH})
## Set CMAKE flags
if (NOT DEFINED CMAKE_CXX_STANDARD)
set(CMAKE_CXX_STANDARD 17)
endif()
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
list(APPEND CMAKE_PREFIX_PATH # Add ROCM_PATH to CMake search paths for finding HIP / HSA
${ROCM_PATH}
${ROCM_PATH}/llvm
${ROCM_PATH}/hip
/opt/rocm
/opt/rocm/llvm
/opt/rocm/hip)
## Check for HIP
find_package(hip REQUIRED CONFIG PATHS ${CMAKE_PREFIX_PATH})
message(STATUS "HIP compiler: ${HIP_COMPILER}")
## Ensuring that CXX compiler meets expectations
if(NOT (("${CMAKE_CXX_COMPILER}" MATCHES ".*hipcc") OR ("${CMAKE_CXX_COMPILER}" MATCHES ".*clang\\+\\+")))
message(FATAL_ERROR "On ROCm platform 'hipcc' or HIP-aware Clang must be used as C++ compiler.")
endif()
## Check for Threads
find_package(Threads REQUIRED)
set(THREADS_PREFER_PTHREAD_FLAG ON)
## Check for numa support
find_library(NUMA_LIBRARY numa)
find_path(NUMA_INCLUDE_DIR numa.h)
if(NUMA_LIBRARY AND NUMA_INCLUDE_DIR)
add_library(numa SHARED IMPORTED)
set_target_properties(numa PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${NUMA_INCLUDE_DIR}" IMPORTED_LOCATION "${NUMA_LIBRARY}" INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${NUMA_INCLUDE_DIR}")
endif()
## Check for hsa support
find_library(HSA_LIBRARY hsa-runtime64 PATHS ${ROCM_PATH} ${ROCM_PATH}/lib)
find_path(HSA_INCLUDE_DIR hsa.h PATHS ${ROCM_PATH}/include ${ROCM_PATH}/include/hsa)
if(HSA_LIBRARY AND HSA_INCLUDE_DIR)
add_library(hsa-runtime64 SHARED IMPORTED)
set_target_properties(hsa-runtime64 PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${HSA_INCLUDE_DIR}" IMPORTED_LOCATION "${HSA_LIBRARY}" INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HSA_INCLUDE_DIR}")
endif()
## Check for infiniband verbs support
if(DEFINED ENV{ENABLE_NIC_EXEC} OR DEFINED ENABLE_NIC_EXEC)
message(STATUS "For CMake builds, NIC executor also requires explicit opt-in by setting CMake flag -DENABLE_NIC_EXEC=1 or environment flag ENABLE_NIC_EXEC=1")
find_library(IBVERBS_LIBRARY ibverbs)
find_path(IBVERBS_INCLUDE_DIR infiniband/verbs.h)
if(IBVERBS_LIBRARY AND IBVERBS_INCLUDE_DIR)
add_library(ibverbs SHARED IMPORTED)
set_target_properties(ibverbs PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${IBVERBS_INCLUDE_DIR}" IMPORTED_LOCATION "${IBVERBS_LIBRARY}" INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${IBVERBS_INCLUDE_DIR}")
set(IBVERBS_FOUND 1)
message(STATUS "Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable")
else()
if(NOT IBVERBS_LIBRARY)
message(WARNING "IBVerbs library not found")
elseif (NOT IBVERBS_INCLUDE_DIR)
elseif(NOT IBVERBS_INCLUDE_DIR)
message(WARNING "infiniband/verbs.h not found")
endif()
message(WARNING "Building without NIC executor support. To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed")
endif()
else()
message(STATUS "Disabling NIC Executor support")
message(WARNING "For CMake builds, NIC executor requires explicit opt-in by setting ENABLE_NIC_EXEC=1")
endif()
link_libraries(numa hsa-runtime64 pthread)
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY .)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY .)
add_executable(TransferBench src/client/Client.cpp)
target_include_directories(TransferBench PRIVATE src/header src/client src/client/Presets)
find_package(ROCM 0.8 REQUIRED PATHS ${ROCM_PATH})
include(ROCMInstallTargets)
include(ROCMCreatePackage)
set(ROCMCHECKS_WARN_TOOLCHAIN_VAR OFF)
target_include_directories(TransferBench PRIVATE src/header)
target_include_directories(TransferBench PRIVATE src/client)
target_include_directories(TransferBench PRIVATE src/client/Presets)
target_include_directories(TransferBench PRIVATE ${NUMA_INCLUDE_DIR})
target_include_directories(TransferBench PRIVATE ${HSA_INCLUDE_DIR})
if(IBVERBS_FOUND)
target_include_directories(TransferBench PRIVATE ${IBVERBS_INCLUDE_DIR})
target_link_libraries(TransferBench PRIVATE ${IBVERBS_LIBRARY})
target_compile_definitions(TransferBench PRIVATE NIC_EXEC_ENABLED)
endif()
set(PACKAGE_NAME TB)
set(LIBRARY_NAME TransferBench)
target_link_libraries(TransferBench PRIVATE -fgpu-rdc) # Required when linking relocatable device code
target_link_libraries(TransferBench PRIVATE Threads::Threads)
target_link_libraries(TransferBench INTERFACE hip::host)
target_link_libraries(TransferBench PRIVATE hip::device)
target_link_libraries(TransferBench PRIVATE dl)
target_link_libraries(TransferBench PRIVATE ${NUMA_LIBRARY})
target_link_libraries(TransferBench PRIVATE ${HSA_LIBRARY})
rocm_install(TARGETS TransferBench COMPONENT devel)
rocm_setup_version(VERSION ${VERSION_STRING})
rocm_package_add_dependencies(DEPENDS numactl hsa-rocr)
# Package specific CPACK vars
rocm_package_add_dependencies(DEPENDS "numactl" "hsa-rocr")
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md")
set(CPACK_RPM_PACKAGE_LICENSE "MIT")
set(PACKAGE_NAME TB)
set(LIBRARY_NAME TransferBench)
rocm_create_package(
NAME ${LIBRARY_NAME}
......
......@@ -6,57 +6,81 @@
ROCM_PATH ?= /opt/rocm
CUDA_PATH ?= /usr/local/cuda
HIPCC=$(ROCM_PATH)/bin/hipcc
NVCC=$(CUDA_PATH)/bin/nvcc
HIPCC ?= $(ROCM_PATH)/bin/amdclang++
NVCC ?= $(CUDA_PATH)/bin/nvcc
# Compile TransferBenchCuda if nvcc detected
ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
# This can be a space separated string of multiple GPU targets
# Default is the native GPU target
GPU_TARGETS ?= native
DEBUG ?= 0
ifeq ($(filter clean,$(MAKECMDGOALS)),)
# Compile TransferBenchCuda if nvcc detected
ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
EXE=TransferBenchCuda
CXX=$(NVCC)
else
else
EXE=TransferBench
ifeq ("$(shell test -e $(HIPCC) && echo found)", "found")
CXX=$(HIPCC)
endif
else ifeq ("$(shell test -e $(ROCM_PATH)/bin/hipcc && echo found)", "found")
CXX=$(ROCM_PATH)/bin/hipcc
$(warning "Could not find $(HIPCC). Using fallback to $(CXX)")
else
$(error "Could not find $(HIPCC) or $(ROCM_PATH)/bin/hipcc. Check if the path is correct if you want to build $(EXE)")
endif
GPU_TARGETS_FLAGS = $(foreach target,$(GPU_TARGETS),"--offload-arch=$(target)")
endif
CXXFLAGS = -I$(ROCM_PATH)/include -I$(ROCM_PATH)/include/hip -I$(ROCM_PATH)/include/hsa
HIPLDFLAGS= -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64 -lamdhip64
HIPFLAGS = -x hip -D__HIP_PLATFORM_AMD__ -D__HIPCC__ $(GPU_TARGETS_FLAGS)
NVFLAGS = -x cu -lnuma -arch=native
ifeq ($(DEBUG), 0)
COMMON_FLAGS += -O3
else
COMMON_FLAGS += -O0 -g -ggdb3
endif
COMMON_FLAGS += -I./src/header -I./src/client -I./src/client/Presets
CXXFLAGS = -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
NVFLAGS = -x cu -lnuma -arch=native
COMMON_FLAGS = -O3 -I./src/header -I./src/client -I./src/client/Presets
LDFLAGS += -lpthread
# Compile RDMA executor if
# 1) DISABLE_NIC_EXEC is not set to 1
# 2) IBVerbs is found in the Dynamic Linker cache
# 3) infiniband/verbs.h is found in the default include path
NIC_ENABLED = 0
ifneq ($(DISABLE_NIC_EXEC),1)
LDFLAGS += -lpthread
# Compile RDMA executor if
# 1) DISABLE_NIC_EXEC is not set to 1
# 2) IBVerbs is found in the Dynamic Linker cache
# 3) infiniband/verbs.h is found in the default include path
DISABLE_NIC_EXEC ?= 0
ifneq ($(DISABLE_NIC_EXEC),1)
ifeq ("$(shell ldconfig -p | grep -c ibverbs)", "0")
$(info lib IBVerbs not found)
else ifeq ("$(shell echo '#include <infiniband/verbs.h>' | $(CXX) -E - 2>/dev/null | grep -c 'infiniband/verbs.h')", "0")
$(info infiniband/verbs.h not found)
else
LDFLAGS += -libverbs -DNIC_EXEC_ENABLED
NVFLAGS += -libverbs -DNIC_EXEC_ENABLED
CXXFLAGS += -DNIC_EXEC_ENABLED
LDFLAGS += -libverbs
NIC_ENABLED = 1
endif
ifeq ($(NIC_ENABLED), 0)
$(info Building without NIC executor support)
$(info To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed)
else
$(info Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable)
endif
endif
endif
.PHONY : all clean
all: $(EXE)
TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp") NicStatus
$(HIPCC) $(CXXFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)
TransferBench: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) $(COMMON_FLAGS) $< -o $@ $(HIPLDFLAGS) $(LDFLAGS)
TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp") NicStatus
TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
$(NVCC) $(NVFLAGS) $(COMMON_FLAGS) $< -o $@ $(LDFLAGS)
clean:
rm -f *.o ./TransferBench ./TransferBenchCuda
rm -f ./TransferBench ./TransferBenchCuda
NicStatus:
ifeq ($(NIC_ENABLED), 1)
$(info Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable)
else
$(info Building without NIC executor support)
endif
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# Test dependencies
include(FetchContent)
set(ROCM_WARN_TOOLCHAIN_VAR OFF CACHE BOOL "")
# Find or download/install rocm-cmake project
# Resolution order: installed ROCmCMakeBuildTools -> deprecated ROCM package
# -> download rocm-cmake from GitHub via FetchContent.
find_package(ROCmCMakeBuildTools 0.11.0 CONFIG QUIET PATHS "${ROCM_PATH}")
if((NOT ROCmCMakeBuildTools_FOUND) OR INSTALL_DEPENDENCIES)
message(STATUS "ROCmCMakeBuildTools not found. Checking for ROCM (deprecated)")
find_package(ROCM 0.7.3 CONFIG QUIET PATHS "${ROCM_PATH}") # deprecated fallback
if((NOT ROCM_FOUND) OR INSTALL_DEPENDENCIES)
message(STATUS "ROCM (deprecated) not found. Downloading and building ROCmCMakeBuildTools")
set(PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern)
set(rocm_cmake_tag "rocm-6.4.0" CACHE STRING "rocm-cmake tag to download")
FetchContent_Declare(
rocm-cmake
GIT_REPOSITORY https://github.com/ROCm/rocm-cmake.git
GIT_TAG ${rocm_cmake_tag}
# Bogus subdir on purpose: fetch sources only, do not add to the build
SOURCE_SUBDIR "DISABLE ADDING TO BUILD"
)
FetchContent_MakeAvailable(rocm-cmake)
message(STATUS "rocm-cmake_SOURCE_DIR: ${rocm-cmake_SOURCE_DIR}")
find_package(ROCmCMakeBuildTools CONFIG REQUIRED NO_DEFAULT_PATH PATHS "${rocm-cmake_SOURCE_DIR}")
message(STATUS "Found ROCmCmakeBuildTools version: ${ROCmCMakeBuildTools_VERSION}")
endif()
elseif(ROCmCMakeBuildTools_FOUND)
message(STATUS "Found ROCmCmakeBuildTools version: ${ROCmCMakeBuildTools_VERSION}")
endif()
# Find available local ROCM targets
# NOTE: This will eventually be part of ROCm-CMake and should be removed at that time
#
# Queries rocm_agent_enumerator for the GPU agents present on this machine.
#   VARIABLE - name of the caller-scope variable that receives the de-duplicated
#              list of gfx targets, or NOTFOUND when none could be detected.
function(rocm_local_targets VARIABLE)
  set(${VARIABLE} "NOTFOUND" PARENT_SCOPE)
  find_program(_rocm_agent_enumerator rocm_agent_enumerator HINTS /opt/rocm/bin ENV ROCM_PATH)
  # find_program() yields a false value on failure; no need to match "-NOTFOUND" by string
  if(_rocm_agent_enumerator)
    execute_process(
      COMMAND "${_rocm_agent_enumerator}"
      RESULT_VARIABLE _found_agents
      OUTPUT_VARIABLE _rocm_agents
      ERROR_QUIET
      # Without this the trailing newline becomes an empty list element below
      OUTPUT_STRIP_TRAILING_WHITESPACE
    )
    if(_found_agents EQUAL 0)
      string(REPLACE "\n" ";" _rocm_agents "${_rocm_agents}")
      unset(result)
      foreach(agent IN LISTS _rocm_agents)
        # gfx000 is the CPU agent and is not a compilable GPU target
        if(NOT agent STREQUAL "gfx000")
          list(APPEND result "${agent}")
        endif()
      endforeach()
      if(result)
        list(REMOVE_DUPLICATES result)
        set(${VARIABLE} "${result}" PARENT_SCOPE)
      endif()
    endif()
  endif()
endfunction()
include(ROCMSetupVersion)
include(ROCMCreatePackage)
include(ROCMInstallTargets)
include(ROCMPackageConfigHelpers)
include(ROCMInstallSymlinks)
include(ROCMCheckTargetIds)
include(ROCMClients)
include(ROCMHeaderWrapper)
......@@ -131,9 +131,8 @@ public:
int defaultGfxUnroll = 4;
if (archName == "gfx906") defaultGfxUnroll = 8;
else if (archName == "gfx90a") defaultGfxUnroll = 8;
else if (archName == "gfx940") defaultGfxUnroll = 6;
else if (archName == "gfx941") defaultGfxUnroll = 6;
else if (archName == "gfx942") defaultGfxUnroll = 4;
else if (archName == "gfx950") defaultGfxUnroll = 4;
alwaysValidate = GetEnvVar("ALWAYS_VALIDATE" , 0);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
......
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......@@ -20,57 +20,150 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
// Identifiers for the HBM bandwidth tests run by the healthcheck preset.
// NOTE(review): this declares a variable ("HbmTests") of an anonymous enum
// type; only the enumerator values appear to be used — confirm.
enum {
HBM_READ = 0,
HBM_WRITE = 1,
HBM_COPY = 2,
HBM_ADD = 3,
NUM_HBM_TESTS = 4
} HbmTests;
void HealthCheckPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
struct HbmTestConfig
{
// Check for supported platforms
#if defined(__NVCC__)
printf("[WARN] healthcheck preset not supported on NVIDIA hardware\n");
return;
#endif
bool hasFail = false;
// Force use of single stream
ev.useSingleStream = 1;
std::string name;
int numInputs;
int numOutputs;
};
// Per-test buffer layout, indexed by the HBM_* enumerators:
// display name, number of source buffers, number of destination buffers.
HbmTestConfig HbmTestConfigs[NUM_HBM_TESTS] =
{ {"READ", 1, 0},
{"WRITE", 0, 1},
{"COPY", 1, 1},
{"ADD", 2, 1}
};
// Pass/fail thresholds and kernel tuning parameters for one hardware model.
// All bandwidth limits are in GB/s and are multiplied by SFACTOR before use.
typedef struct
{
double unidirHostToDeviceCopyLimit;  // minimum host->device copy bandwidth
double unidirDeviceToHostCopyLimit;  // minimum device->host copy bandwidth
double bidirDmaCopyLimit;            // minimum summed bidirectional DMA bandwidth
int a2aUnrollFactor;                 // GFX kernel unroll for the all-to-all test
int a2aNumSubExecs;                  // subexecutors per all-to-all transfer
double a2aCopyLimit;                 // minimum per-pair all-to-all bandwidth
// HBM arrays are indexed by the HBM_* test ids (READ/WRITE/COPY/ADD)
int hbmBlockSize [NUM_HBM_TESTS];
int hbmUnrollFactor[NUM_HBM_TESTS];
int hbmTemporalMode[NUM_HBM_TESTS];
double hbmLimit [NUM_HBM_TESTS];
} TestConfig;
// Hardware models with tuned healthcheck thresholds: 8-GPU gfx942 systems
// with either 304 or 64 subexecutors (CUs) per GPU.
typedef enum
{
MODEL_08_GFX0942_304 = 0,
MODEL_08_GFX0942_064 = 1,
NUM_SUPPORTED_MODELS = 2
} ModelEnum;
// All limits are scaled by this factor
#define SFACTOR 0.97
// Thresholds for 8 x gfx942 with 304 subexecutors per GPU
// NOTE(review): presumably an MI300X-class part — confirm against DetectModel()
TestConfig Config_08_GFX0942_304 = {
.unidirHostToDeviceCopyLimit = 50,
.unidirDeviceToHostCopyLimit = 50,
.bidirDmaCopyLimit = 90,
.a2aUnrollFactor = 2,
.a2aNumSubExecs = 8,
.a2aCopyLimit = 45,
// HBM tuning/limits ordered READ, WRITE, COPY, ADD
.hbmBlockSize = { 384, 256, 320, 256},
.hbmUnrollFactor = { 7, 4, 8, 7},
.hbmTemporalMode = { 3, 3, 3, 3},
.hbmLimit = {4980, 4850, 2045, 1405},
};
// Thresholds for 8 x gfx942 with 64 subexecutors per GPU
// NOTE(review): presumably a smaller MI3XX variant — confirm
TestConfig Config_08_GFX0942_064 = {
.unidirHostToDeviceCopyLimit = 50,
.unidirDeviceToHostCopyLimit = 50,
.bidirDmaCopyLimit = 90,
.a2aUnrollFactor = 2,
.a2aNumSubExecs = 8,
.a2aCopyLimit = 45,
// HBM tuning/limits ordered READ, WRITE, COPY, ADD
.hbmBlockSize = { 448, 448, 448, 384},
.hbmUnrollFactor = { 8, 3, 8, 7},
.hbmTemporalMode = { 3, 3, 3, 3},
.hbmLimit = {4180, 2800, 1400, 1055},
};
// Model lookup table, indexed by ModelEnum values.
TestConfig TestConfigs[NUM_SUPPORTED_MODELS] =
{
Config_08_GFX0942_304,
Config_08_GFX0942_064,
};
TransferBench::TestResults results;
int DetectModel()
{
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
if (numGpuDevices != 8) {
printf("[WARN] healthcheck preset is currently only supported on 8-GPU MI300X hardware\n");
exit(1);
}
std::string archName = "";
int numSubExecutors = 0;
// Loop over all GPUs and determine if they are identical
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
// Check that arch name is identical
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, gpuId));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
if (!(archName == "gfx940" || archName == "gfx941" || archName == "gfx942"))
{
printf("[WARN] healthcheck preset is currently only supported on 8-GPU MI300X hardware\n");
std::string currArchName = fullName.substr(0, fullName.find(':'));
if (archName != "" && archName != currArchName) {
printf("[WARN] healthcheck preset is currently only supported when all GPUs are identical\n");
printf(" Detected both %s and %s\n", archName.c_str(), currArchName.c_str());
exit(1);
}
archName = currArchName;
// Check number of subexecutors
int currNumSubExecutors = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, gpuId});
if (numSubExecutors != 0 && numSubExecutors != currNumSubExecutors) {
printf("[WARN] healthcheck preset is currently only supported when all GPUs are identical\n");
printf(" Detected different subexecutor counts: %d and %d\n", numSubExecutors, currNumSubExecutors);
exit(1);
}
numSubExecutors = currNumSubExecutors;
}
// Pass limits
double udirLimit = getenv("LIMIT_UDIR") ? atof(getenv("LIMIT_UDIR")) : (int)(48 * 0.95);
double bdirLimit = getenv("LIMIT_BDIR") ? atof(getenv("LIMIT_BDIR")) : (int)(96 * 0.95);
double a2aLimit = getenv("LIMIT_A2A") ? atof(getenv("LIMIT_A2A")) : (int)(45 * 0.95);
// Classify based on detected configuration
if (numGpuDevices == 8) {
if (archName == "gfx942") {
switch (numSubExecutors) {
case 304: return MODEL_08_GFX0942_304;
case 64: return MODEL_08_GFX0942_064;
}
}
}
printf("[WARN] healthcheck preset is currently not supported on this hardware\n");
printf(" Detected %d x [%s] with [%d] subexecutors per GPU\n", numGpuDevices, archName.c_str(), numSubExecutors);
exit(1);
}
int TestUnidir(int modelId, bool verbose)
{
TestConfig const& testConfig = TestConfigs[modelId];
TransferBench::ConfigOptions cfg;
TransferBench::TestResults results;
// Run CPU to GPU
int hasFail = 0;
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
cfg.dma.useHsaCopy = 1;
// Run unidirectional read from CPU to GPU
printf("Testing unidirectional reads from CPU ");
// Run unidirectional host to device copy
printf("Testing unidirectional host to device copy%c", verbose ? '\n' : ' ');
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
double limit = testConfig.unidirHostToDeviceCopyLimit * SFACTOR;
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
if (!verbose) printf(".");
fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
......@@ -80,48 +173,43 @@ void HealthCheckPreset(EnvVars& ev,
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeDevice = {EXE_GPU_GFX, gpuId};
t.numBytes = 64*1024*1024;
t.exeDevice = {EXE_GPU_DMA, gpuId};
t.numBytes = 256*1024*1024;
t.srcs = {{MEM_CPU, memIndex}};
t.dsts = {};
t.dsts = {{MEM_GPU, gpuId}};
t.numSubExecs = 1;
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t.numSubExecs = cu;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
bestResult = std::max(bestResult, results.tfrResults[0].avgBandwidthGbPerSec);
double measuredBw = results.tfrResults[0].avgBandwidthGbPerSec;
if (measuredBw < limit) {
fails.push_back(std::make_pair(gpuId, measuredBw));
}
if (verbose) printf(" GPU %02d: Measured %6.2f Limit %6.2f\n", gpuId, measuredBw, limit);
} else {
PrintErrors(results.errResults);
}
if (results.tfrResults[0].avgBandwidthGbPerSec >= udirLimit) {
passed = true;
break;
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
hasFail = 1;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, limit);
}
}
}
// Run unidirectional write from GPU to CPU
printf("Testing unidirectional writes to CPU ");
// Run unidirectional device to host copy
printf("Testing unidirectional device to host copy%c", verbose ? '\n' : ' ');
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
double limit = testConfig.unidirDeviceToHostCopyLimit * SFACTOR;
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
if (!verbose) printf(".");
fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
......@@ -131,48 +219,53 @@ void HealthCheckPreset(EnvVars& ev,
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeDevice = {EXE_GPU_GFX, gpuId};
t.numBytes = 64*1024*1024;
t.srcs = {};
t.exeDevice = {EXE_GPU_DMA, gpuId};
t.numBytes = 256*1024*1024;
t.srcs = {{MEM_GPU, gpuId}};
t.dsts = {{MEM_CPU, memIndex}};
t.numSubExecs = 1;
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t.numSubExecs = cu;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
bestResult = std::max(bestResult, results.tfrResults[0].avgBandwidthGbPerSec);
double measuredBw = results.tfrResults[0].avgBandwidthGbPerSec;
if (measuredBw < limit) {
fails.push_back(std::make_pair(gpuId, measuredBw));
}
if (verbose) printf(" GPU %02d: Measured %6.2f Limit %6.2f\n", gpuId, measuredBw, limit);
} else {
PrintErrors(results.errResults);
}
if (results.tfrResults[0].avgBandwidthGbPerSec >= udirLimit) {
passed = true;
break;
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
hasFail = 1;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, limit);
}
}
}
return hasFail;
}
int TestBidir(int modelId, bool verbose)
{
TestConfig const& testConfig = TestConfigs[modelId];
TransferBench::ConfigOptions cfg;
int hasFail = 0;
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
// Run bidirectional tests
printf("Testing bidirectional reads + writes ");
printf("Testing bidirectional host<->device copies%c", verbose ? '\n' : ' ');
{
ev.gfxUnroll = 4;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
double limit = testConfig.bidirDmaCopyLimit * SFACTOR;
std::vector<std::pair<int, double>> fails;
for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
printf("."); fflush(stdout);
if (!verbose) printf(".");
fflush(stdout);
int memIndex = TransferBench::GetClosestCpuNumaToGpu(gpuId);
if (memIndex == -1) {
......@@ -184,62 +277,65 @@ void HealthCheckPreset(EnvVars& ev,
Transfer& t0 = transfers[0];
Transfer& t1 = transfers[1];
t0.exeDevice = {EXE_GPU_GFX, gpuId};
t0.numBytes = 64*1024*1024;
t0.srcs = {{MEM_CPU, memIndex}};
t0.dsts = {};
t0.exeDevice = {EXE_GPU_DMA, gpuId};
t0.numBytes = 256*1024*1024;
t0.srcs = {{MEM_GPU, gpuId}};
t0.dsts = {{MEM_CPU, memIndex}};
t0.numSubExecs = 1;
t1.exeDevice = {EXE_GPU_GFX, gpuId};
t1.numBytes = 64*1024*1024;
t1.srcs = {};
t1.dsts = {{MEM_CPU, memIndex}};
// Loop over number of CUs to use
bool passed = false;
double bestResult = 0;
for (int cu = 7; cu <= 10; cu++) {
t0.numSubExecs = cu;
t1.numSubExecs = cu;
t1.exeDevice = {EXE_GPU_DMA, gpuId};
t1.numBytes = 256*1024*1024;
t1.srcs = {{MEM_CPU, memIndex}};
t1.dsts = {{MEM_GPU, gpuId}};
t1.numSubExecs = 1;
TransferBench::TestResults results;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
double sum = (results.tfrResults[0].avgBandwidthGbPerSec +
double measuredBw = (results.tfrResults[0].avgBandwidthGbPerSec +
results.tfrResults[1].avgBandwidthGbPerSec);
bestResult = std::max(bestResult, sum);
if (sum >= bdirLimit) {
passed = true;
break;
if (measuredBw < limit) {
fails.push_back(std::make_pair(gpuId, measuredBw));
}
if (verbose) printf(" GPU %02d: Measured %6.2f Limit %6.2f\n", gpuId, measuredBw, limit);
} else {
PrintErrors(results.errResults);
}
}
if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
}
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
hasFail = 1;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, bdirLimit);
printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, limit);
}
}
}
return hasFail;
}
// Run XGMI tests:
printf("Testing all-to-all XGMI copies "); fflush(stdout);
int TestAllToAll(int modelId, bool verbose)
{
TestConfig const& testConfig = TestConfigs[modelId];
TransferBench::ConfigOptions cfg;
cfg.gfx.unrollFactor = testConfig.a2aUnrollFactor;
int numSubExecs = testConfig.a2aNumSubExecs;
int hasFail = 0;
int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);
printf("Testing all-to-all XGMI copies %c", verbose ? '\n' : ' '); fflush(stdout);
{
// Force GFX unroll to 2 for MI300
ev.gfxUnroll = 2;
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
double limit = testConfig.a2aCopyLimit * SFACTOR;
std::vector<Transfer> transfers;
for (int i = 0; i < numGpuDevices; i++) {
for (int j = 0; j < numGpuDevices; j++) {
if (i == j) continue;
Transfer t;
t.numBytes = 64*1024*1024;
t.numSubExecs = 8;
t.numBytes = 256*1024*1024;
t.numSubExecs = numSubExecs;
t.exeDevice = {EXE_GPU_GFX, i};
t.srcs = {{MEM_GPU_FINE, i}};
t.dsts = {{MEM_GPU_FINE, j}};
......@@ -247,17 +343,18 @@ void HealthCheckPreset(EnvVars& ev,
}
}
std::vector<std::pair<std::pair<int,int>, double>> fails;
TransferBench::TestResults results;
if (TransferBench::RunTransfers(cfg, transfers, results)) {
int transferIdx = 0;
for (int i = 0; i < numGpuDevices; i++) {
printf("."); fflush(stdout);
if (!verbose) printf("."); fflush(stdout);
for (int j = 0; j < numGpuDevices; j++) {
if (i == j) continue;
double bw = results.tfrResults[transferIdx].avgBandwidthGbPerSec;
if (bw < a2aLimit) {
if (bw < limit) {
fails.push_back(std::make_pair(std::make_pair(i,j), bw));
}
if (verbose) printf(" GPU %02d to GPU %02d: : Measured %6.2f Limit %6.2f\n", i, j, bw, limit);
transferIdx++;
}
}
......@@ -265,12 +362,111 @@ void HealthCheckPreset(EnvVars& ev,
if (fails.size() == 0) {
printf("PASS\n");
} else {
hasFail = true;
hasFail = 1;
printf("FAIL (%lu test(s))\n", fails.size());
for (auto p : fails) {
printf(" GPU %02d to GPU %02d: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first.first, p.first.second, p.second, a2aLimit);
printf(" GPU %02d to GPU %02d: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first.first, p.first.second, p.second, limit);
}
}
}
exit(hasFail ? 1 : 0);
return hasFail;
}
// Runs the four local HBM bandwidth tests (READ/WRITE/COPY/ADD) on every GPU
// and compares each measured bandwidth against the per-model limit.
//
//   modelId - index into TestConfigs (a ModelEnum value)
//   verbose - print per-GPU measurements and kernel tuning info instead of dots
// Returns 1 if any GPU fell below its limit on any test, otherwise 0.
int TestHbmPerformance(int modelId, bool verbose)
{
  TestConfig const& testConfig = TestConfigs[modelId];
  int hasFail = 0;
  int numGpuDevices = TransferBench::GetNumExecutors(EXE_GPU_GFX);

  char testname[50];
  for (int testId = 0; testId < NUM_HBM_TESTS; testId++) {
    // Apply per-test kernel tuning from the model's config
    TransferBench::ConfigOptions cfg;
    cfg.general.numIterations = 1000;
    cfg.general.numWarmups    = 50;
    cfg.gfx.blockSize         = testConfig.hbmBlockSize[testId];
    cfg.gfx.unrollFactor      = testConfig.hbmUnrollFactor[testId];
    cfg.gfx.temporalMode      = testConfig.hbmTemporalMode[testId];

    // snprintf (not sprintf) so a long test name can never overflow testname
    snprintf(testname, sizeof(testname), "Testing HBM performance [%s]",
             HbmTestConfigs[testId].name.c_str());
    if (verbose) printf("[Blocksize: %d Unroll: %d TemporalMode: %d]\n",
                        cfg.gfx.blockSize, cfg.gfx.unrollFactor, cfg.gfx.temporalMode);
    printf("%-42s%c", testname, verbose ? '\n' : ' ');
    fflush(stdout);

    int    numInputs  = HbmTestConfigs[testId].numInputs;
    int    numOutputs = HbmTestConfigs[testId].numOutputs;
    double limit      = testConfig.hbmLimit[testId] * SFACTOR;

    // One GPU-local transfer per device, sized proportionally to its CU count
    std::vector<std::pair<int, double>> fails;
    TransferBench::TestResults results;
    std::vector<Transfer> transfers;
    for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
      Transfer t;
      t.numSubExecs = TransferBench::GetNumSubExecutors({EXE_GPU_GFX, gpuId});
      t.numBytes    = 16777216ULL * t.numSubExecs;  // 16 MiB per subexecutor
      t.exeDevice   = {EXE_GPU_GFX, gpuId};
      for (int i = 0; i < numInputs; i++)  t.srcs.push_back({MEM_GPU, gpuId});
      for (int i = 0; i < numOutputs; i++) t.dsts.push_back({MEM_GPU, gpuId});
      transfers.push_back(t);
    }

    if (TransferBench::RunTransfers(cfg, transfers, results)) {
      for (int gpuId = 0; gpuId < numGpuDevices; gpuId++) {
        if (!verbose) printf(".");
        fflush(stdout);
        double measuredBw = results.tfrResults[gpuId].avgBandwidthGbPerSec;
        if (measuredBw < limit) {
          fails.push_back(std::make_pair(gpuId, measuredBw));
        }
        if (verbose) printf(" GPU %02d: Measured %6.2f Limit %6.2f\n", gpuId, measuredBw, limit);
      }
    } else {
      PrintErrors(results.errResults);
    }

    if (fails.size() == 0) {
      printf("PASS\n");
    } else {
      hasFail = 1;
      printf("FAIL (%lu test(s))\n", fails.size());
      for (auto p : fails) {
        printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, limit);
      }
    }
  }
  return hasFail;
}
void HealthCheckPreset(EnvVars& ev,
                       size_t const numBytesPerTransfer,
                       std::string const presetName)
{
  // Runs the experimental healthcheck suite (HBM, unidirectional,
  // bidirectional, and all-to-all bandwidth tests) and terminates the process:
  // exit code 1 when any test group reported a failure, 0 otherwise.
#if defined(__NVCC__)
  // Healthcheck relies on AMD-specific paths; bail out on NVIDIA builds
  printf("[WARN] healthcheck preset not supported on NVIDIA hardware\n");
  return;
#endif
  // Experimental-feature disclaimer banner
  printf("Disclaimer:\n");
  printf("==================================================================\n");
  printf("NOTE: This is an experimental feature and may be subject to change\n");
  printf(" Failures do not necessarily indicate hardware issues, as other factors\n");
  printf(" such as simultaneous workloads may influence results\n");
  printf("\n");

  // VERBOSE=1 makes the individual tests print per-GPU measurement details
  int const verboseMode = EnvVars::GetEnvVar("VERBOSE", 0);

  // Identify the hardware model so each test picks the matching limits
  int const modelId = DetectModel();

  // Run every test group and accumulate the number of failing groups
  int failCount = 0;
  failCount += TestHbmPerformance(modelId, verboseMode);
  failCount += TestUnidir(modelId, verboseMode);
  failCount += TestBidir(modelId, verboseMode);
  failCount += TestAllToAll(modelId, verboseMode);

  exit(failCount == 0 ? 0 : 1);
}
......@@ -66,7 +66,7 @@ namespace TransferBench
using std::set;
using std::vector;
constexpr char VERSION[] = "1.62";
constexpr char VERSION[] = "1.63";
/**
* Enumeration of supported Executor types
......@@ -516,7 +516,7 @@ namespace TransferBench
//==========================================================================================
// Macro for collecting CU/SM GFX kernel is running on
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1200__) || defined(__gfx1201__)
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1150__) || defined(__gfx1151__) || defined(__gfx1200__) || defined(__gfx1201__)
#define GetHwId(hwId) hwId = 0
#elif defined(__NVCC__)
#define GetHwId(hwId) asm("mov.u32 %0, %smid;" : "=r"(hwId))
......@@ -525,7 +525,7 @@ namespace TransferBench
#endif
// Macro for collecting XCC GFX kernel is running on
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx950__)
#if defined(__gfx942__) || defined(__gfx950__)
#define GetXccId(val) asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val));
#else
#define GetXccId(val) val = 0
......@@ -755,7 +755,7 @@ namespace {
#if defined (__NVCC__)
return {ERR_FATAL, "Fine-grained CPU memory not supported on NVIDIA platform"};
#else
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser));
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocCoherent));
#endif
} else if (memType == MEM_CPU || memType == MEM_CPU_CLOSEST) {
#if defined (__NVCC__)
......@@ -895,6 +895,8 @@ namespace {
// Get the hsa_agent_t associated with a MemDevice
static ErrResult GetHsaAgent(MemDevice const& memDevice, hsa_agent_t& agent)
{
if (memDevice.memType == MEM_CPU_CLOSEST)
return GetHsaAgent({EXE_CPU, GetClosestCpuNumaToGpu(memDevice.memIndex)}, agent);
if (IsCpuMemType(memDevice.memType)) return GetHsaAgent({EXE_CPU, memDevice.memIndex}, agent);
if (IsGpuMemType(memDevice.memType)) return GetHsaAgent({EXE_GPU_GFX, memDevice.memIndex}, agent);
return {ERR_FATAL,
......@@ -1191,6 +1193,8 @@ namespace {
if (err.errType == ERR_FATAL) break;
}
// Skip check of engine Id mask for self copies
if (srcAgent.handle != dstAgent.handle) {
uint32_t engineIdMask = 0;
err = hsa_amd_memory_copy_engine_status(dstAgent, srcAgent, &engineIdMask);
if (err.errType != ERR_NONE) {
......@@ -1203,6 +1207,7 @@ namespace {
"Transfer %d: DMA %d.%d does not exist or cannot copy between src/dst",
i, t.exeDevice.exeIndex, t.exeSubIndex});
}
}
#endif
}
......@@ -2624,7 +2629,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
childThreads.clear();
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
double deltaMsec = (std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0) / cfg.general.numSubIterations;
if (iteration >= 0) {
rss.totalDurationMsec += deltaMsec;
......@@ -2654,7 +2659,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
asyncTransfer.join();
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0)
exeInfo.totalDurationMsec += deltaMsec;
return ERR_NONE;
......@@ -2692,20 +2698,24 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
if (numaNode != -1)
numa_run_on_node(numaNode);
}
auto transferCount = exeInfo.resources.size();
std::vector<double> totalTimeMsec(transferCount, 0.0);
int subIterations = 0;
do {
auto cpuStart = std::chrono::high_resolution_clock::now();
size_t completedTransfers = 0;
auto transferCount = exeInfo.resources.size();
std::vector<uint8_t> receivedQPs(transferCount);
std::vector<std::chrono::high_resolution_clock::time_point> transferTimers(transferCount);
do {
std::vector<uint8_t> receivedQPs(transferCount, 0);
// post the sends
for (auto i = 0; i < transferCount; i++) {
transferTimers[i] = std::chrono::high_resolution_clock::now();
ERR_CHECK(ExecuteNicTransfer(iteration, cfg, exeIndex, exeInfo.resources[i]));
}
// poll for completions
do {
size_t completedTransfers = 0;
while (completedTransfers < transferCount) {
for (auto i = 0; i < transferCount; i++) {
if(receivedQPs[i] < exeInfo.resources[i].qpCount) {
auto& rss = exeInfo.resources[i];
......@@ -2725,20 +2735,28 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
auto cpuDelta = std::chrono::high_resolution_clock::now() - transferTimers[i];
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0) {
rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration)
rss.perIterMsec.push_back(deltaMsec);
totalTimeMsec[i] += deltaMsec;
}
completedTransfers++;
}
}
}
} while(completedTransfers < transferCount);
}
} while(++subIterations < cfg.general.numSubIterations);
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0)
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) {
exeInfo.totalDurationMsec += deltaMsec;
} while(++subIterations < cfg.general.numSubIterations);
for (int i = 0; i < transferCount; i++) {
auto& rss = exeInfo.resources[i];
double transferTimeMsec = totalTimeMsec[i] / cfg.general.numSubIterations;
rss.totalDurationMsec += transferTimeMsec;
if (cfg.general.recordPerIteration)
rss.perIterMsec.push_back(transferTimeMsec);
}
}
return ERR_NONE;
}
#endif
......@@ -3077,14 +3095,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
ERR_CHECK(hipStreamSynchronize(stream));
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) {
double deltaMsec = cpuDeltaMsec;
if (startEvent != NULL) {
float gpuDeltaMsec;
ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
deltaMsec = gpuDeltaMsec;
deltaMsec = gpuDeltaMsec / cfg.general.numSubIterations;
}
rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) {
......@@ -3154,12 +3172,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
ERR_CHECK(hipStreamSynchronize(stream));
}
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0
/ cfg.general.numSubIterations;
if (iteration >= 0) {
if (cfg.gfx.useHipEvents && !cfg.gfx.useMultiStream) {
float gpuDeltaMsec;
ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, exeInfo.startEvents[0], exeInfo.stopEvents[0]));
gpuDeltaMsec /= cfg.general.numSubIterations;
exeInfo.totalDurationMsec += gpuDeltaMsec;
} else {
exeInfo.totalDurationMsec += cpuDeltaMsec;
......@@ -3182,7 +3202,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
}
}
double deltaMsec = (maxStopCycle - minStartCycle) / (double)(exeInfo.wallClockRate);
deltaMsec /= cfg.general.numSubIterations;
rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration) {
rss.perIterMsec.push_back(deltaMsec);
......@@ -3249,14 +3269,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
#endif
}
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
double cpuDeltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0) {
double deltaMsec = cpuDeltaMsec;
if (!useSubIndices && !cfg.dma.useHsaCopy && cfg.dma.useHipEvents) {
float gpuDeltaMsec;
ERR_CHECK(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
deltaMsec = gpuDeltaMsec;
deltaMsec = gpuDeltaMsec / cfg.general.numSubIterations;
}
resources.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration)
......@@ -3291,7 +3311,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
ERR_CHECK(asyncTransfer.get());
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0 / cfg.general.numSubIterations;
if (iteration >= 0)
exeInfo.totalDurationMsec += deltaMsec;
return ERR_NONE;
......@@ -3493,7 +3513,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Stop CPU timing for this iteration
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() / cfg.general.numSubIterations;
if (cfg.data.alwaysValidate) {
ERR_APPEND(ValidateAllTransfers(cfg, transfers, transferResources, dstReference, outputBuffer),
......@@ -3528,7 +3548,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
results.tfrResults.resize(transfers.size());
results.numTimedIterations = numTimedIterations;
results.totalBytesTransferred = 0;
results.avgTotalDurationMsec = (totalCpuTimeSec * 1000.0) / (numTimedIterations * cfg.general.numSubIterations);
results.avgTotalDurationMsec = (totalCpuTimeSec * 1000.0) / numTimedIterations;
results.overheadMsec = results.avgTotalDurationMsec;
for (auto& exeInfoPair : executorMap) {
ExeDevice const& exeDevice = exeInfoPair.first;
......@@ -3537,7 +3557,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Copy over executor results
ExeResult& exeResult = results.exeResults[exeDevice];
exeResult.numBytes = exeInfo.totalBytes;
exeResult.avgDurationMsec = exeInfo.totalDurationMsec / (numTimedIterations * cfg.general.numSubIterations);
exeResult.avgDurationMsec = exeInfo.totalDurationMsec / numTimedIterations;
exeResult.avgBandwidthGbPerSec = (exeResult.numBytes / 1.0e6) / exeResult.avgDurationMsec;
exeResult.sumBandwidthGbPerSec = 0.0;
exeResult.transferIdx.clear();
......
# Locate the ROCm installation.  The ROCM_PATH environment variable overrides
# the default of /opt/rocm; the result is cached so users may also set it via
# -DROCM_PATH=... on the command line.
if (DEFINED ENV{ROCM_PATH})
  set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE PATH "Path to the ROCm installation.")
else()
  set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to the ROCm installation.")
endif()
# FIX: derive the tools directory from the (possibly pre-cached) ROCM_PATH value.
# A set(... CACHE ...) does not override an existing cache entry, so computing
# rocm_bin directly from $ENV{ROCM_PATH} / the literal default could diverge
# from the ROCM_PATH actually in effect.
set(rocm_bin "${ROCM_PATH}/bin")

# Compiler selection: honor $CXX when set; otherwise prefer amdclang++ from the
# ROCm bin directory, then the ROCm llvm tree, and finally plain clang++.
if (DEFINED ENV{CXX})
  set(CMAKE_CXX_COMPILER "$ENV{CXX}" CACHE PATH "Path to the C++ compiler")
elseif(EXISTS "${rocm_bin}/amdclang++")
  set(CMAKE_CXX_COMPILER "${rocm_bin}/amdclang++" CACHE PATH "Path to the C++ compiler")
elseif(EXISTS "${ROCM_PATH}/llvm/bin/amdclang++")
  set(rocm_bin "${ROCM_PATH}/llvm/bin")
  set(CMAKE_CXX_COMPILER "${rocm_bin}/amdclang++" CACHE PATH "Path to the C++ compiler")
elseif(EXISTS "${ROCM_PATH}/llvm/bin/clang++")
  set(rocm_bin "${ROCM_PATH}/llvm/bin")
  set(CMAKE_CXX_COMPILER "${rocm_bin}/clang++" CACHE PATH "Path to the C++ compiler")
endif()

# Default per-configuration optimization flags unless the user supplied CXXFLAGS.
if (NOT DEFINED ENV{CXXFLAGS})
  set(CMAKE_CXX_FLAGS_DEBUG "-g -O1")
  set(CMAKE_CXX_FLAGS_RELEASE "-O3")
endif()

# Default to a Release build when none was requested, and advertise the valid
# choices to cmake-gui/ccmake via the STRINGS cache property.
if(NOT CMAKE_BUILD_TYPE)
  message(STATUS "Setting build type to 'Release' as none was specified.")
  set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE)
endif()
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "RelWithDebInfo" "MinSizeRel")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment