TransferBench v1.61 (#174)

Co-authored-by: Mustafa Abduljabbar <mustafa.abduljabbar@amd.com>

TransferBench v1.61 (#174)
Co-authored-by: Mustafa Abduljabbar <mustafa.abduljabbar@amd.com>
cd80b3a3 · gilbertlee-amd · GitHub · 856e3445 · cd80b3a3 · cd80b3a3
Unverified Commit cd80b3a3 authored Feb 28, 2025 by gilbertlee-amd Committed by GitHub Feb 28, 2025
9 changed files
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,22 @@
 Documentation for TransferBench is available at
 [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).

+## v1.61.00
+### Added
+- Added a2a_n preset which conducts all-to-all GPU-to-GPU transfers over nearest NIC executors
+- Re-implemented GFX_BLOCK_ORDER which allows for control over how threadblocks of multiple transfers are ordered
+  - 0 = sequential, 1 = interleaved, 2 = random
+- Added a2asweep preset which tries various CU/unroll options for GFX-executed all-to-all
+- Rewrote the main GID index detection logic
+- Show the GID index and description in the topology table. It is helpful for debugging purposes
+- Added GFX_WORD_SIZE to allow for different packed float sizes to use for GFX kernel.  Must be either 4 (default), 2 or 1
+
+
+### Fixed
+- Avoid build errors for CMake and Makefile if infiniband/verbs.h header is not present and disable NIC executor in such case
+- Have a priority list of which GID entry to go for instead of hardcoding choices based on underdocumented user input (such as RoCE version and IP address family)
+- Use link-local when it is the only choice (i.e. when routing information is not available beyond local link)
+
 ## v1.60.00
 ### Modified
 - Reverted GFX_SINGLE_TEAM default back to 1

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -57,17 +57,22 @@ set( CMAKE_CXX_FLAGS "${flags_str} ${CMAKE_CXX_FLAGS}")
 set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib")
 include_directories(${ROCM_PATH}/include)
 find_library(IBVERBS_LIBRARY ibverbs)
-if (IBVERBS_LIBRARY)
-    if (DEFINED ENV{DISABLE_NIC_EXEC})
+find_path(IBVERBS_INCLUDE_DIR infiniband/verbs.h)
+if (DEFINED ENV{DISABLE_NIC_EXEC})
  message(STATUS "Disabling NIC Executor support")
-    else()
+elseif(IBVERBS_LIBRARY AND IBVERBS_INCLUDE_DIR)
  message(STATUS "Found ibverbs: ${IBVERBS_LIBRARY}. Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable")
  add_definitions(-DNIC_EXEC_ENABLED)
  link_libraries(ibverbs)
-    endif()
 else()
-    message(WARNING "IBVerbs library not found.  Building without NIC executor support")
+  if (NOT IBVERBS_LIBRARY)
+    message(WARNING "IBVerbs library not found")
+  elseif (NOT IBVERBS_INCLUDE_DIR)
+    message(WARNING "infiniband/verbs.h not found")
+  endif()
+  message(WARNING "Building without NIC executor support. To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed")
 endif()
+
 link_libraries(numa hsa-runtime64 pthread)
 set (CMAKE_RUNTIME_OUTPUT_DIRECTORY .)
 add_executable(TransferBench src/client/Client.cpp)

--- a/Makefile
+++ b/Makefile
 #
-# Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
 #

 # Configuration options
@@ -12,8 +12,10 @@ NVCC=$(CUDA_PATH)/bin/nvcc
 # Compile TransferBenchCuda if nvcc detected
 ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
  EXE=TransferBenchCuda
+  CXX=$(NVCC)
 else
  EXE=TransferBench
+  CXX=$(HIPCC)
 endif

 CXXFLAGS = -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
@@ -21,14 +23,24 @@ NVFLAGS  = -x cu -lnuma -arch=native
 COMMON_FLAGS = -O3 -I./src/header -I./src/client -I./src/client/Presets
 LDFLAGS += -lpthread

-# Compile RDMA executor if IBVerbs is found in the Dynamic Linker cache
+# Compile RDMA executor if
+# 1) DISABLE_NIC_EXEC is not set to 1
+# 2) IBVerbs is found in the Dynamic Linker cache
+# 3) infiniband/verbs.h is found in the default include path
 NIC_ENABLED = 0
 ifneq ($(DISABLE_NIC_EXEC),1)
-  ifneq ("$(shell ldconfig -p | grep -c ibverbs)", "0")
+  ifeq ("$(shell ldconfig -p | grep -c ibverbs)", "0")
+    $(info lib IBVerbs not found)
+  else ifeq ("$(shell echo '#include <infiniband/verbs.h>' | $(CXX) -E - 2>/dev/null | grep -c 'infiniband/verbs.h')", "0")
+    $(info infiniband/verbs.h not found)
+  else
    LDFLAGS += -libverbs -DNIC_EXEC_ENABLED
    NVFLAGS += -libverbs -DNIC_EXEC_ENABLED
    NIC_ENABLED = 1
  endif
+  ifeq ($(NIC_ENABLED), 0)
+    $(info To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed)
+  endif
 endif

 all: $(EXE)

--- a/src/client/EnvVars.hpp
+++ b/src/client/EnvVars.hpp
 /*
-Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All rights reserved.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -84,6 +84,7 @@ public:
  int useHsaDma;                     // Use hsa_amd_async_copy instead of hipMemcpy for non-targetted DMA executions

  // GFX options
+  int gfxBlockOrder;                 // How threadblocks for multiple Transfers are ordered 0=sequential 1=interleaved 2=random
  int gfxBlockSize;                  // Size of each threadblock (must be multiple of 64)
  vector<uint32_t> cuMask;           // Bit-vector representing the CU mask
  vector<vector<int>> prefXccTable;  // Specifies XCC to use for given exe->dst pair
@@ -92,6 +93,7 @@ public:
  int useSingleStream;               // Use a single stream per GPU GFX executor instead of stream per Transfer
  int gfxSingleTeam;                 // Team all subExecutors across the data array
  int gfxWaveOrder;                  // GFX-kernel wavefront ordering
+  int gfxWordSize;                   // GFX-kernel packed data size (4=DWORDx4, 2=DWORDx2, 1=DWORDx1)

  // Client options
  int hideEnv;                       // Skip printing environment variable
@@ -135,10 +137,12 @@ public:
    alwaysValidate    = GetEnvVar("ALWAYS_VALIDATE"     , 0);
    blockBytes        = GetEnvVar("BLOCK_BYTES"         , 256);
    byteOffset        = GetEnvVar("BYTE_OFFSET"         , 0);
+    gfxBlockOrder     = GetEnvVar("GFX_BLOCK_ORDER"     , 0);
    gfxBlockSize      = GetEnvVar("GFX_BLOCK_SIZE"      , 256);
    gfxSingleTeam     = GetEnvVar("GFX_SINGLE_TEAM"     , 1);
    gfxUnroll         = GetEnvVar("GFX_UNROLL"          , defaultGfxUnroll);
    gfxWaveOrder      = GetEnvVar("GFX_WAVE_ORDER"      , 0);
+    gfxWordSize       = GetEnvVar("GFX_WORD_SIZE"       , 4);
    hideEnv           = GetEnvVar("HIDE_ENV"            , 0);
    minNumVarSubExec  = GetEnvVar("MIN_VAR_SUBEXEC"     , 1);
    maxNumVarSubExec  = GetEnvVar("MAX_VAR_SUBEXEC"     , 0);
@@ -286,13 +290,23 @@ public:
    }
  }

+  // Joins the given integers into one comma-separated string, e.g. {1,2,3} -> "1,2,3".
+  // An empty vector yields an empty string.
+  static std::string ToStr(std::vector<int> const& values) {
+    std::string joined;
+    for (size_t idx = 0; idx < values.size(); idx++) {
+      if (idx > 0) joined += ",";
+      joined += std::to_string(values[idx]);
+    }
+    return joined;
+  }
+
  // Display info on the env vars that can be used
  static void DisplayUsage()
  {
    printf("Environment variables:\n");
    printf("======================\n");
    printf(" ALWAYS_VALIDATE   - Validate after each iteration instead of once after all iterations\n");
-    printf(" BLOCK_SIZE        - # of threads per threadblock (Must be multiple of 64)\n");
    printf(" BLOCK_BYTES       - Controls granularity of how work is divided across subExecutors\n");
    printf(" BYTE_OFFSET       - Initial byte-offset for memory allocations.  Must be multiple of 4\n");
 #if NIC_EXEC_ENABLED
@@ -300,9 +314,12 @@ public:
 #endif
    printf(" CU_MASK           - CU mask for streams. Can specify ranges e.g '5,10-12,14'\n");
    printf(" FILL_PATTERN      - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n");
+    printf(" GFX_BLOCK_ORDER   - How blocks for transfers are ordered. 0=sequential, 1=interleaved, 2=random\n");
+    printf(" GFX_BLOCK_SIZE    - # of threads per threadblock (Must be multiple of 64)\n");
    printf(" GFX_UNROLL        - Unroll factor for GFX kernel (0=auto), must be less than %d\n", TransferBench::GetIntAttribute(ATR_GFX_MAX_UNROLL));
    printf(" GFX_SINGLE_TEAM   - Have subexecutors work together on full array instead of working on disjoint subarrays\n");
    printf(" GFX_WAVE_ORDER    - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n");
+    printf(" GFX_WORD_SIZE     - GFX kernel packed data size (4=DWORDx4, 2=DWORDx2, 1=DWORDx1)\n");
    printf(" HIDE_ENV          - Hide environment variable value listing\n");
 #if NIC_EXEC_ENABLED
    printf(" IB_GID_INDEX      - Required for RoCE NICs (default=-1/auto)\n");
@@ -383,6 +400,8 @@ public:
          "%s", (cuMask.size() ? GetCuMaskDesc().c_str() : "All"));
    Print("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
          "%s", (fillPattern.size() ? getenv("FILL_PATTERN") : TransferBench::GetStrAttribute(ATR_SRC_PREP_DESCRIPTION).c_str()));
+    Print("GFX_BLOCK_ORDER", gfxBlockOrder,  // 0=sequential, 1=interleaved, 2=random (previously 2 was reported as interleaved)
+          "Thread block ordering: %s", gfxBlockOrder == 0 ? "Sequential" : gfxBlockOrder == 1 ? "Interleaved" : "Random");
    Print("GFX_BLOCK_SIZE", gfxBlockSize,
          "Threadblock size of %d", gfxBlockSize);
    Print("GFX_SINGLE_TEAM", gfxSingleTeam,
@@ -397,6 +416,9 @@ public:
                                            gfxWaveOrder == 3 ? "Wavefront,CU,Unroll" :
                                            gfxWaveOrder == 4 ? "CU,Unroll,Wavefront" :
                                                                "CU,Wavefront,Unroll"));
+    Print("GFX_WORD_SIZE", gfxWordSize,
+          "Using GFX word size of %d (DWORDx%d)", gfxWordSize, gfxWordSize);
+
 #if NIC_EXEC_ENABLED
    Print("IP_ADDRESS_FAMILY", ipAddressFamily,
          "IP address family is set to IPv%d", ipAddressFamily);
@@ -462,6 +484,31 @@ public:
    return defaultValue;
  }

+  // Parses a comma-separated list of ints / inclusive ranges (e.g. "1,4-6") from env var <varname>.
+  // Returns the sorted, de-duplicated values, or defaultValue when the variable is unset.
+  static std::vector<int> GetEnvVarArray(std::string const& varname, std::vector<int> const& defaultValue)
+  {
+    char const* envStr = getenv(varname.c_str());
+    if (!envStr) return defaultValue;
+
+    // Tokenize a private copy: strtok writes NULs into its input, and getenv() returns a
+    // pointer into the live environment, so tokenizing in place would truncate the variable
+    std::string buffer(envStr);
+    std::set<int> values;
+    for (char* token = strtok(&buffer[0], ","); token; token = strtok(NULL, ",")) {
+      int start, end;
+      if (sscanf(token, "%d-%d", &start, &end) == 2) {
+        for (int i = start; i <= end; i++) values.insert(i);
+      } else if (sscanf(token, "%d", &start) == 1) {
+        values.insert(start);
+      } else {
+        printf("[ERROR] Unrecognized token [%s]\n", token);
+        exit(1);
+      }
+    }
+    return std::vector<int>(values.begin(), values.end());
+  }
+
  static std::string GetEnvVar(std::string const& varname, std::string const& defaultValue)
  {
    if (getenv(varname.c_str()))
@@ -524,6 +571,7 @@ public:
    cfg.dma.useHipEvents           = useHipEvents;
    cfg.dma.useHsaCopy             = useHsaDma;

+    cfg.gfx.blockOrder             = gfxBlockOrder;
    cfg.gfx.blockSize              = gfxBlockSize;
    cfg.gfx.cuMask                 = cuMask;
    cfg.gfx.prefXccTable           = prefXccTable;
@@ -532,6 +580,7 @@ public:
    cfg.gfx.useMultiStream         = !useSingleStream;
    cfg.gfx.useSingleTeam          = gfxSingleTeam;
    cfg.gfx.waveOrder              = gfxWaveOrder;
+    cfg.gfx.wordSize               = gfxWordSize;

    cfg.nic.ibGidIndex             = ibGidIndex;
    cfg.nic.ibPort                 = ibPort;

--- a/src/client/Presets/AllToAllN.hpp
+++ b/src/client/Presets/AllToAllN.hpp
+/*
+Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "EnvVars.hpp"
+
+// Preset "a2a_n": all-to-all GPU-to-GPU benchmark in which every Transfer is executed by
+// an EXE_NIC_NEAREST executor indexed by the source GPU.
+//
+// ev                  - environment-variable settings; displayed and converted to run options
+// numBytesPerTransfer - number of bytes copied by each individual Transfer
+// presetName          - name the preset was invoked with (not used by this preset)
+//
+// Preset-specific environment variables:
+//   NUM_GPU_DEVICES - number of GPUs to include (default: number of detected GPUs)
+//   NUM_QUEUE_PAIRS - value assigned to numSubExecs of every Transfer (default: 1)
+//   USE_FINE_GRAIN  - use MEM_GPU_FINE instead of MEM_GPU (default: 1)
+//
+// Exits with status 1 on an invalid GPU count or when the Transfers fail to run.
+void AllToAllRdmaPreset(EnvVars&           ev,
+                        size_t      const  numBytesPerTransfer,
+                        std::string const  presetName)
+{
+  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
+
+  // Collect env vars for this preset
+  int numGpus       = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
+  int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 1);
+  int useFineGrain  = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);
+
+  // Print off environment variables
+  ev.DisplayEnvVars();
+  if (!ev.hideEnv) {
+    if (!ev.outputToCsv) printf("[AllToAll Network Related]\n");
+    ev.Print("NUM_GPU_DEVICES", numGpus      , "Using %d GPUs", numGpus);
+    ev.Print("NUM_QUEUE_PAIRS", numQueuePairs, "Using %d queue pairs for NIC transfers", numQueuePairs);
+    ev.Print("USE_FINE_GRAIN" , useFineGrain , "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
+    printf("\n");
+  }
+
+  // Validate env vars
+  if (numGpus < 0 || numGpus > numDetectedGpus) {
+    printf("[ERROR] Cannot use %d GPUs.  Detected %d GPUs\n", numGpus, numDetectedGpus);
+    exit(1);
+  }
+
+  MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
+
+  // Build one Transfer per ordered (src, dst) GPU pair, including src == dst.
+  // reIndex maps (src, dst) to the Transfer's position so the summary table can look it up
+  std::map<std::pair<int, int>, int> reIndex;
+  std::vector<Transfer> transfers;
+  for (int i = 0; i < numGpus; i++) {
+    for (int j = 0; j < numGpus; j++) {
+      // Build Transfer and add it to list
+      TransferBench::Transfer transfer;
+      transfer.numBytes = numBytesPerTransfer;
+      transfer.srcs.push_back({memType, i});
+      transfer.dsts.push_back({memType, j});
+      transfer.exeDevice = {EXE_NIC_NEAREST, i};
+      transfer.exeSubIndex = j;
+      transfer.numSubExecs = numQueuePairs;
+
+      reIndex[std::make_pair(i,j)] = transfers.size();
+      transfers.push_back(transfer);
+    }
+  }
+
+  printf("GPU-RDMA All-To-All benchmark:\n");
+  printf("==========================\n");
+  // %zu is the portable conversion for size_t arguments
+  printf("- Copying %zu bytes between all pairs of GPUs using %d QPs per Transfer (%zu Transfers)\n",
+         numBytesPerTransfer, numQueuePairs, transfers.size());
+  if (transfers.size() == 0) return;
+
+  // Execute Transfers
+  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
+  TransferBench::TestResults results;
+  if (!TransferBench::RunTransfers(cfg, transfers, results)) {
+    for (auto const& err : results.errResults)
+      printf("%s\n", err.errMsg.c_str());
+    // A failed run must not report success to the calling shell / scripts
+    exit(1);
+  }
+  PrintResults(ev, 1, transfers, results);
+
+  // Print results
+  char separator = (ev.outputToCsv ? ',' : ' ');
+  printf("\nSummary: [%zu bytes per Transfer]\n", numBytesPerTransfer);
+  printf("==========================================================\n");
+  printf("SRC\\DST ");
+  for (int dst = 0; dst < numGpus; dst++)
+    printf("%cGPU %02d    ", separator, dst);
+  printf("   %cSTotal     %cActual\n", separator, separator);
+
+  double totalBandwidthGpu = 0.0;
+  double minActualBandwidth = std::numeric_limits<double>::max();
+  double maxActualBandwidth = 0.0;
+  // Slots [0, numGpus) hold per-destination column totals; slot numGpus+1 holds the grand total
+  std::vector<double> colTotalBandwidth(numGpus+2, 0.0);
+  for (int src = 0; src < numGpus; src++) {
+    double rowTotalBandwidth = 0;
+    int    transferCount = 0;
+    double minBandwidth = std::numeric_limits<double>::max();
+    printf("GPU %02d", src);
+    for (int dst = 0; dst < numGpus; dst++) {
+      if (reIndex.count(std::make_pair(src, dst))) {
+        int const transferIdx = reIndex[std::make_pair(src,dst)];
+        TransferBench::TransferResult const& r = results.tfrResults[transferIdx];
+        colTotalBandwidth[dst]  += r.avgBandwidthGbPerSec;
+        rowTotalBandwidth       += r.avgBandwidthGbPerSec;
+        totalBandwidthGpu       += r.avgBandwidthGbPerSec;
+        minBandwidth             = std::min(minBandwidth, r.avgBandwidthGbPerSec);
+        transferCount++;
+        printf("%c%8.3f  ", separator, r.avgBandwidthGbPerSec);
+      } else {
+        printf("%c%8s  ", separator, "N/A");
+      }
+    }
+    // "Actual" = slowest Transfer of the row multiplied by the number of Transfers in the row
+    double actualBandwidth = minBandwidth * transferCount;
+    printf("   %c%8.3f   %c%8.3f\n", separator, rowTotalBandwidth, separator, actualBandwidth);
+    minActualBandwidth = std::min(minActualBandwidth, actualBandwidth);
+    maxActualBandwidth = std::max(maxActualBandwidth, actualBandwidth);
+    colTotalBandwidth[numGpus+1] += rowTotalBandwidth;
+  }
+  printf("\nRTotal");
+  for (int dst = 0; dst < numGpus; dst++) {
+    printf("%c%8.3f  ", separator, colTotalBandwidth[dst]);
+  }
+  printf("   %c%8.3f   %c%8.3f   %c%8.3f\n", separator, colTotalBandwidth[numGpus+1],
+         separator, minActualBandwidth, separator, maxActualBandwidth);
+  printf("\n");
+
+  printf("Average   bandwidth (Tx Thread Timed): %8.3f GB/s\n", totalBandwidthGpu / transfers.size());
+  printf("Aggregate bandwidth (Tx Thread Timed): %8.3f GB/s\n", totalBandwidthGpu);
+  printf("Aggregate bandwidth       (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
+
+  PrintErrors(results.errResults);
+}
--- a/src/client/Presets/AllToAllSweep.hpp
+++ b/src/client/Presets/AllToAllSweep.hpp
+/*
+Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "EnvVars.hpp"
+
+// Preset "a2asweep": runs a GFX-executed all-to-all once for every combination of CU count
+// (NUM_CUS) and GFX unroll factor (UNROLLS) and prints the minimum / maximum per-executor
+// bandwidth of each combination as a table.
+//
+// ev                  - environment-variable settings; useSingleStream is forced to 1 and
+//                       gfxUnroll is overwritten while sweeping
+// numBytesPerTransfer - number of bytes copied by each individual Transfer
+// presetName          - name the preset was invoked with (not used by this preset)
+//
+// Preset-specific environment variables:
+//   A2A_DIRECT      - only use GPU pairs with a hop count of 1 (default: 1)
+//   A2A_LOCAL       - include transfers where src GPU == dst GPU (default: 0)
+//   A2A_MODE        - 0=copy, 1=read-only, 2=write-only, or custom "numSrcs:numDsts" (default: 0)
+//   NUM_CUS         - list/ranges of CU counts to sweep (default: 4,8,12,16,24,32)
+//   NUM_GPU_DEVICES - number of GPUs to include (default: number of detected GPUs)
+//   UNROLLS         - list/ranges of unroll factors to sweep (default: 1,2,3,4,6,8)
+//   USE_FINE_GRAIN  - use MEM_GPU_FINE instead of MEM_GPU (default: 1)
+//   USE_REMOTE_READ - use the destination GPU as executor instead of the source (default: 0)
+//   USE_SPRAY       - one Transfer per GPU that covers all of its targets (default: 0)
+//   VERBOSE         - also print the full results of every sweep point (default: 0)
+//
+// Exits with status 1 on invalid settings.
+void AllToAllSweepPreset(EnvVars&           ev,
+                         size_t      const  numBytesPerTransfer,
+                         std::string const  presetName)
+{
+  enum
+  {
+    A2A_COPY       = 0,
+    A2A_READ_ONLY  = 1,
+    A2A_WRITE_ONLY = 2,
+    A2A_CUSTOM     = 3,
+  };
+  char a2aModeStr[4][20] = {"Copy", "Read-Only", "Write-Only", "Custom"};
+
+  // Force single-stream mode for all-to-all benchmark
+  ev.useSingleStream = 1;
+
+  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
+
+  // Collect env vars for this preset
+  int a2aDirect     = EnvVars::GetEnvVar("A2A_DIRECT"     , 1);
+  int a2aLocal      = EnvVars::GetEnvVar("A2A_LOCAL"      , 0);
+  int numGpus       = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
+  int useFineGrain  = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);
+  int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
+  int useSpray      = EnvVars::GetEnvVar("USE_SPRAY",       0);
+  int verbose       = EnvVars::GetEnvVar("VERBOSE",         0);
+
+  std::vector<int> unrollList = EnvVars::GetEnvVarArray("UNROLLS", {1,2,3,4,6,8});
+  std::vector<int> numCusList = EnvVars::GetEnvVarArray("NUM_CUS", {4,8,12,16,24,32});
+
+  // A2A_MODE may be 0,1,2 or else custom numSrcs:numDsts
+  int numSrcs, numDsts;
+  int a2aMode = 0;
+  if (getenv("A2A_MODE") && sscanf(getenv("A2A_MODE"), "%d:%d", &numSrcs, &numDsts) == 2) {
+    a2aMode = A2A_CUSTOM;
+    // Negative counts are meaningless: the src loop would add nothing while a negative
+    // numDsts would still add one destination, contradicting what gets displayed
+    if (numSrcs < 0 || numDsts < 0) {
+      printf("[ERROR] A2A_MODE numSrcs:numDsts must both be non-negative\n");
+      exit(1);
+    }
+  } else {
+    a2aMode = EnvVars::GetEnvVar("A2A_MODE", 0);
+    if (a2aMode < 0 || a2aMode > 2) {
+      printf("[ERROR] a2aMode must be between 0 and 2, or else numSrcs:numDsts\n");
+      exit(1);
+    }
+    numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1);
+    numDsts = (a2aMode == A2A_READ_ONLY  ? 0 : 1);
+  }
+
+  // Print off environment variables
+  ev.DisplayEnvVars();
+  if (!ev.hideEnv) {
+    if (!ev.outputToCsv) printf("[AllToAll Related]\n");
+    ev.Print("A2A_DIRECT"     , a2aDirect        , a2aDirect ? "Only using direct links" : "Full all-to-all");
+    ev.Print("A2A_LOCAL"      , a2aLocal         , "%s local transfers", a2aLocal ? "Include" : "Exclude");
+    ev.Print("A2A_MODE"       , (a2aMode == A2A_CUSTOM) ?  std::to_string(numSrcs) + ":" + std::to_string(numDsts) : std::to_string(a2aMode),
+                                (a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " +
+                                                           std::to_string(numDsts) + " write(s)").c_str(): a2aModeStr[a2aMode]);
+    ev.Print("NUM_CUS"        , numCusList.size(), EnvVars::ToStr(numCusList).c_str());
+    ev.Print("NUM_GPU_DEVICES", numGpus          , "Using %d GPUs", numGpus);
+    ev.Print("UNROLLS"        , unrollList.size(), EnvVars::ToStr(unrollList).c_str());
+    ev.Print("USE_FINE_GRAIN" , useFineGrain     , "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
+    ev.Print("USE_REMOTE_READ", useRemoteRead    , "Using %s as executor", useRemoteRead ? "DST" : "SRC");
+    ev.Print("USE_SPRAY"      , useSpray         , "%s per CU", useSpray ? "All targets" : "One target");
+    ev.Print("VERBOSE"        , verbose          , verbose ? "Display test results" : "Display summary only");
+    printf("\n");
+  }
+
+  // Validate env vars
+  if (numGpus < 0 || numGpus > numDetectedGpus) {
+    printf("[ERROR] Cannot use %d GPUs.  Detected %d GPUs\n", numGpus, numDetectedGpus);
+    exit(1);
+  }
+
+  if (useSpray && numDsts > 1) {
+    printf("[ERROR] Cannot use USE_SPRAY with multiple destination buffers\n");
+    exit(1);
+  }
+
+  // Collect the number of GPU devices to use
+  MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
+  ExeType exeType = EXE_GPU_GFX;
+
+  std::vector<Transfer> transfers;
+
+  // NOTE(review): targetCount ends up holding the target count of the LAST GPU processed and
+  // is later applied to every Transfer in spray mode - confirm all GPUs have equal target counts
+  int targetCount = 0;
+  if (!useSpray) {
+    // Each CU will work on just one target
+    for (int i = 0; i < numGpus; i++) {
+      targetCount = 0;
+      for (int j = 0; j < numGpus; j++) {
+        // Check whether or not to execute this pair
+        if (i == j) {
+          if (!a2aLocal) continue;
+        } else if (a2aDirect) {
+#if !defined(__NVCC__)
+          uint32_t linkType, hopCount;
+          HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
+          if (hopCount != 1) continue;
+#endif
+        }
+
+        // Build Transfer and add it to list
+        TransferBench::Transfer transfer;
+        targetCount++;
+        transfer.numBytes = numBytesPerTransfer;
+        for (int x = 0; x < numSrcs; x++) transfer.srcs.push_back({memType, i});
+
+        // When using multiple destinations, the additional destinations are "local"
+        if (numDsts) transfer.dsts.push_back({memType, j});
+        for (int x = 1; x < numDsts; x++) transfer.dsts.push_back({memType, i});
+        transfer.exeDevice = {exeType, (useRemoteRead ? j : i)};
+        transfer.exeSubIndex = -1;
+        transfers.push_back(transfer);
+      }
+    }
+  } else {
+    // Each CU will work on all targets
+    for (int i = 0; i < numGpus; i++) {
+      TransferBench::Transfer transfer;
+      transfer.numBytes = numBytesPerTransfer;
+      transfer.exeDevice = {exeType, i};
+      transfer.exeSubIndex = -1;
+      targetCount = 0;
+      for (int j = 0; j < numGpus; j++) {
+        // Check whether or not to transfer to this GPU
+        if (i == j) {
+          if (!a2aLocal) continue;
+        } else if (a2aDirect) {
+#if !defined(__NVCC__)
+          uint32_t linkType, hopCount;
+          HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
+          if (hopCount != 1) continue;
+#endif
+        }
+        targetCount++;
+        for (int x = 0; x < numSrcs; x++) transfer.srcs.push_back({memType, useRemoteRead ? j : i});
+
+        if (numDsts) transfer.dsts.push_back({memType, j});
+        for (int x = 1; x < numDsts; x++) transfer.dsts.push_back({memType, i});
+      }
+      transfers.push_back(transfer);
+    }
+  }
+
+  printf("GPU-GFX All-To-All Sweep benchmark:\n");
+  printf("==========================\n");
+  // %zu is the portable conversion for size_t arguments
+  printf("- Copying %zu bytes between %s pairs of GPUs\n", numBytesPerTransfer, a2aDirect ? "directly connected" : "all");
+  if (transfers.size() == 0) {
+    printf("[WARN] No transfers requested. Try adjusting A2A_DIRECT or A2A_LOCAL\n");
+    return;
+  }
+
+  // Execute Transfers
+  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
+
+  // Applies one (CU count, unroll factor) sweep point to the Transfers and run options
+  auto applySweepPoint = [&](int const numCus, int const unroll) {
+    ev.gfxUnroll = cfg.gfx.unrollFactor = unroll;
+    for (auto& transfer : transfers)
+      transfer.numSubExecs = useSpray ? (numCus * targetCount) : numCus;
+  };
+
+  // Results of the successful runs, keyed by (CU count, unroll factor)
+  std::map<std::pair<int, int>, TransferBench::TestResults> results;
+
+  // Display summary
+  printf("#CUs\\Unroll");
+  for (int u : unrollList) {
+    printf("  %d(Min) ", u);
+    printf("  %d(Max) ", u);
+  }
+  printf("\n");
+  for (int c : numCusList) {
+    printf("   %5d   ", c);  fflush(stdout);
+    for (int u : unrollList) {
+      applySweepPoint(c, u);
+
+      double minBandwidth = std::numeric_limits<double>::max();
+      // Seed the running maximum with 0.0: numeric_limits<double>::min() is the smallest
+      // positive normalized value, not a lower bound
+      double maxBandwidth = 0.0;
+      TransferBench::TestResults result;
+      if (TransferBench::RunTransfers(cfg, transfers, result)) {
+        for (auto const& exeResult : result.exeResults) {
+          minBandwidth = std::min(minBandwidth, exeResult.second.avgBandwidthGbPerSec);
+          maxBandwidth = std::max(maxBandwidth, exeResult.second.avgBandwidthGbPerSec);
+        }
+        if (useSpray) {
+          minBandwidth *= targetCount;
+          maxBandwidth *= targetCount;
+        }
+        results[std::make_pair(c,u)] = result;
+      } else {
+        // Failed runs are shown as 0.00 / 0.00 and are not stored
+        minBandwidth = 0.0;
+        maxBandwidth = 0.0;
+      }
+      printf(" %7.2f  %7.2f ", minBandwidth, maxBandwidth); fflush(stdout);
+    }
+    printf("\n"); fflush(stdout);
+  }
+
+  if (verbose) {
+    int testNum = 0;
+    for (int c : numCusList) {
+      for (int u : unrollList) {
+        printf("CUs: %d Unroll %d\n", c, u);
+        ++testNum;
+
+        // Look up instead of using operator[], which would default-construct an empty
+        // TestResults for a failed run and hand it to PrintResults
+        auto const found = results.find(std::make_pair(c,u));
+        if (found == results.end()) {
+          printf("[WARN] Run failed - no results to display\n");
+          continue;
+        }
+
+        // Restore the settings this sweep point ran with, so that anything PrintResults
+        // reads from ev / transfers matches the results being printed
+        applySweepPoint(c, u);
+        PrintResults(ev, testNum, transfers, found->second);
+      }
+    }
+  }
+}
--- a/src/client/Presets/Presets.hpp
+++ b/src/client/Presets/Presets.hpp
@@ -24,6 +24,8 @@ THE SOFTWARE.

 // Included after EnvVars and Executors
 #include "AllToAll.hpp"
+#include "AllToAllN.hpp"
+#include "AllToAllSweep.hpp"
 #include "HealthCheck.hpp"
 #include "OneToAll.hpp"
 #include "PeerToPeer.hpp"
@@ -39,9 +41,11 @@ typedef void (*PresetFunc)(EnvVars&          ev,
 std::map<std::string, std::pair<PresetFunc, std::string>> presetFuncMap =
 {
  {"a2a",         {AllToAllPreset,      "Tests parallel transfers between all pairs of GPU devices"}},
-  {"healthcheck", {HealthCheckPreset,"Simple bandwidth health check (MI300X series only)"}},
+  {"a2a_n",       {AllToAllRdmaPreset,  "Tests parallel transfers between all pairs of GPU devices using Nearest NIC RDMA transfers"}},
+  {"a2asweep",    {AllToAllSweepPreset, "Test GFX-based all-to-all transfers swept across different CU and GFX unroll counts"}},
+  {"healthcheck", {HealthCheckPreset,   "Simple bandwidth health check (MI300X series only)"}},
  {"one2all",     {OneToAllPreset,      "Test all subsets of parallel transfers from one GPU to all others"}},
-  {"p2p"   ,      {PeerToPeerPreset, "Peer-to-peer device memory bandwidth test"}},
+  {"p2p"   ,      {PeerToPeerPreset,    "Peer-to-peer device memory bandwidth test"}},
  {"rsweep",      {SweepPreset,         "Randomly sweep through sets of Transfers"}},
  {"scaling",     {ScalingPreset,       "Run scaling test from one GPU to other devices"}},
  {"schmoo",      {SchmooPreset,        "Scaling tests for local/remote read/write/copy"}},

--- a/src/client/Topology.hpp
+++ b/src/client/Topology.hpp
@@ -41,9 +41,9 @@ static int RemappedCpuIndex(int origIdx)
 static void PrintNicToGPUTopo(bool outputToCsv)
 {
 #ifdef NIC_EXEC_ENABLED
-  printf(" NIC | Device Name | Active | PCIe Bus ID  | NUMA | Closest GPU(s)\n");
+  printf(" NIC | Device Name | Active | PCIe Bus ID  | NUMA | Closest GPU(s) | GID Index | GID Descriptor\n");
  if(!outputToCsv)
-    printf("-----+-------------+--------+--------------+------+---------------\n");
+    printf("-----+-------------+--------+--------------+------+----------------+-----------+-------------------\n");

  int numGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
  auto const& ibvDeviceList = GetIbvDeviceList();
@@ -57,12 +57,15 @@ static void PrintNicToGPUTopo(bool outputToCsv)
      }
    }

-    printf(" %-3d | %-11s | %-6s | %-12s | %-4d | %-20s\n",
+    printf(" %-3d | %-11s | %-6s | %-12s | %-4d | %-14s | %-9s | %-20s\n",
           i, ibvDeviceList[i].name.c_str(),
           ibvDeviceList[i].hasActivePort ? "Yes" : "No",
           ibvDeviceList[i].busId.c_str(),
           ibvDeviceList[i].numaNode,
-           closestGpusStr.c_str());
+           closestGpusStr.c_str(),
+           ibvDeviceList[i].isRoce && ibvDeviceList[i].hasActivePort?  std::to_string(ibvDeviceList[i].gidIndex).c_str() : "N/A",
+           ibvDeviceList[i].isRoce && ibvDeviceList[i].hasActivePort?  ibvDeviceList[i].gidDescriptor.c_str() : "N/A"
+          );
  }
  printf("\n");
 #endif

--- a/src/header/TransferBench.hpp
+++ b/src/header/TransferBench.hpp