Unverified Commit cd80b3a3 authored by gilbertlee-amd, committed by GitHub

TransferBench v1.61 (#174)


Co-authored-by: Mustafa Abduljabbar <mustafa.abduljabbar@amd.com>
parent 856e3445
......@@ -3,6 +3,22 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.61.00
### Added
- Added a2a_n preset, which performs all-to-all GPU-to-GPU transfers over nearest-NIC executors
- Re-implemented GFX_BLOCK_ORDER, which controls how the threadblocks of multiple Transfers are ordered
  - 0 = sequential, 1 = interleaved, 2 = random (illustrated in the sketch after this changelog entry)
- Added a2asweep preset, which sweeps various CU/unroll options for GFX-executed all-to-all
- Rewrote the main GID index detection logic
- Show the GID index and description in the topology table, to help with debugging
- Added GFX_WORD_SIZE to select the packed-float word size used by the GFX kernel. Must be 4 (default), 2, or 1
### Fixed
- Avoid CMake and Makefile build errors when the infiniband/verbs.h header is not present, and disable the NIC executor in that case
- Select the GID entry from a priority list instead of hardcoding choices based on under-documented user input (such as RoCE version and IP address family)
- Use the link-local GID when it is the only choice (i.e., when no routing information is available beyond the local link)
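
The following minimal sketch (not part of TransferBench; transfer shapes are hypothetical) illustrates how the three GFX_BLOCK_ORDER modes arrange the threadblocks of multiple Transfers:

```cpp
// Sketch of GFX_BLOCK_ORDER: how (transfer, block) pairs are launch-ordered.
#include <algorithm>
#include <cstdio>
#include <random>
#include <utility>
#include <vector>

int main() {
  std::vector<int> numBlocks = {3, 2};    // hypothetical: Transfer 0 uses 3 blocks, Transfer 1 uses 2
  std::vector<std::pair<int,int>> order;  // (transfer, block) pairs in launch order

  // GFX_BLOCK_ORDER=0 (sequential): all blocks of Transfer 0, then all of Transfer 1
  for (int t = 0; t < (int)numBlocks.size(); t++)
    for (int b = 0; b < numBlocks[t]; b++)
      order.push_back({t, b});            // -> T0B0 T0B1 T0B2 T1B0 T1B1

  // GFX_BLOCK_ORDER=1 (interleaved): round-robin across Transfers
  order.clear();
  for (int b = 0; ; b++) {
    bool any = false;
    for (int t = 0; t < (int)numBlocks.size(); t++)
      if (b < numBlocks[t]) { order.push_back({t, b}); any = true; }
    if (!any) break;                      // -> T0B0 T1B0 T0B1 T1B1 T0B2
  }

  // GFX_BLOCK_ORDER=2 (random): shuffle the full (transfer, block) list
  std::shuffle(order.begin(), order.end(),
               std::default_random_engine(std::random_device{}()));

  for (auto& p : order) printf("T%dB%d ", p.first, p.second);
  printf("\n");
  return 0;
}
```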
## v1.60.00
### Modified
- Reverted GFX_SINGLE_TEAM default back to 1
......
......@@ -57,17 +57,22 @@ set( CMAKE_CXX_FLAGS "${flags_str} ${CMAKE_CXX_FLAGS}")
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -L${ROCM_PATH}/lib")
include_directories(${ROCM_PATH}/include)
find_library(IBVERBS_LIBRARY ibverbs)
if (IBVERBS_LIBRARY)
if (DEFINED ENV{DISABLE_NIC_EXEC})
message(STATUS "Disabling NIC Executor support")
else()
message(STATUS "Found ibverbs: ${IBVERBS_LIBRARY}. Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable")
add_definitions(-DNIC_EXEC_ENABLED)
link_libraries(ibverbs)
endif()
find_path(IBVERBS_INCLUDE_DIR infiniband/verbs.h)
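# NIC executor support requires both the ibverbs library and its headers; the
# DISABLE_NIC_EXEC environment variable takes precedence over detection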
if (DEFINED ENV{DISABLE_NIC_EXEC})
message(STATUS "Disabling NIC Executor support")
elseif(IBVERBS_LIBRARY AND IBVERBS_INCLUDE_DIR)
message(STATUS "Found ibverbs: ${IBVERBS_LIBRARY}. Building with NIC executor support. Can set DISABLE_NIC_EXEC=1 to disable")
add_definitions(-DNIC_EXEC_ENABLED)
link_libraries(ibverbs)
else()
message(WARNING "IBVerbs library not found. Building without NIC executor support")
if (NOT IBVERBS_LIBRARY)
message(WARNING "IBVerbs library not found")
elseif (NOT IBVERBS_INCLUDE_DIR)
message(WARNING "infiniband/verbs.h not found")
endif()
message(WARNING "Building without NIC executor support. To use the TransferBench RDMA executor, check if your system has NICs, the NIC drivers are installed, and libibverbs-dev is installed")
endif()
link_libraries(numa hsa-runtime64 pthread)
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY .)
add_executable(TransferBench src/client/Client.cpp)
......
#
# Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Configuration options
......@@ -12,8 +12,10 @@ NVCC=$(CUDA_PATH)/bin/nvcc
# Compile TransferBenchCuda if nvcc detected
ifeq ("$(shell test -e $(NVCC) && echo found)", "found")
EXE=TransferBenchCuda
CXX=$(NVCC)
else
EXE=TransferBench
CXX=$(HIPCC)
endif
CXXFLAGS = -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
......@@ -21,14 +23,24 @@ NVFLAGS = -x cu -lnuma -arch=native
COMMON_FLAGS = -O3 -I./src/header -I./src/client -I./src/client/Presets
LDFLAGS += -lpthread
# Compile RDMA executor if IBVerbs is found in the Dynamic Linker cache
# Compile RDMA executor if
# 1) DISABLE_NIC_EXEC is not set to 1
# 2) IBVerbs is found in the Dynamic Linker cache
# 3) infiniband/verbs.h is found in the default include path
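# Note: the header probe below pipes an #include line through the compiler's
# preprocessor; if infiniband/verbs.h cannot be resolved, the grep finds no match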
NIC_ENABLED = 0
ifneq ($(DISABLE_NIC_EXEC),1)
ifneq ("$(shell ldconfig -p | grep -c ibverbs)", "0")
ifeq ("$(shell ldconfig -p | grep -c ibverbs)", "0")
$(info lib IBVerbs not found)
else ifeq ("$(shell echo '#include <infiniband/verbs.h>' | $(CXX) -E - 2>/dev/null | grep -c 'infiniband/verbs.h')", "0")
$(info infiniband/verbs.h not found)
else
LDFLAGS += -libverbs -DNIC_EXEC_ENABLED
NVFLAGS += -libverbs -DNIC_EXEC_ENABLED
NIC_ENABLED = 1
endif
ifeq ($(NIC_ENABLED), 0)
$(info To use the TransferBench RDMA executor, check that your system has NICs, that the NIC drivers are installed, and that libibverbs-dev is installed)
endif
endif
all: $(EXE)
......
/*
Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......@@ -84,6 +84,7 @@ public:
int useHsaDma; // Use hsa_amd_async_copy instead of hipMemcpy for non-targeted DMA executions
// GFX options
int gfxBlockOrder; // How threadblocks for multiple Transfers are ordered 0=sequential 1=interleaved 2=random
int gfxBlockSize; // Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask; // Bit-vector representing the CU mask
vector<vector<int>> prefXccTable; // Specifies XCC to use for given exe->dst pair
......@@ -92,6 +93,7 @@ public:
int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer
int gfxSingleTeam; // Team all subExecutors across the data array
int gfxWaveOrder; // GFX-kernel wavefront ordering
int gfxWordSize; // GFX-kernel packed data size (4=DWORDx4, 2=DWORDx2, 1=DWORDx1)
// Client options
int hideEnv; // Skip printing the environment variable listing
......@@ -135,10 +137,12 @@ public:
alwaysValidate = GetEnvVar("ALWAYS_VALIDATE" , 0);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
gfxBlockOrder = GetEnvVar("GFX_BLOCK_ORDER" , 0);
gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256);
gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 1);
gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll);
gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER" , 0);
gfxWordSize = GetEnvVar("GFX_WORD_SIZE" , 4);
hideEnv = GetEnvVar("HIDE_ENV" , 0);
minNumVarSubExec = GetEnvVar("MIN_VAR_SUBEXEC" , 1);
maxNumVarSubExec = GetEnvVar("MAX_VAR_SUBEXEC" , 0);
......@@ -286,13 +290,23 @@ public:
}
}
static std::string ToStr(std::vector<int> const& values) {
std::string result = "";
bool isFirst = true;
for (int v : values) {
if (isFirst) isFirst = false;
else result += ",";
result += std::to_string(v);
}
return result;
}
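// e.g., ToStr({1, 2, 3}) returns "1,2,3"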
// Display info on the env vars that can be used
static void DisplayUsage()
{
printf("Environment variables:\n");
printf("======================\n");
printf(" ALWAYS_VALIDATE - Validate after each iteration instead of once after all iterations\n");
printf(" BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n");
printf(" BLOCK_BYTES - Controls granularity of how work is divided across subExecutors\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4\n");
#if NIC_EXEC_ENABLED
......@@ -300,9 +314,12 @@ public:
#endif
printf(" CU_MASK - CU mask for streams. Can specify ranges e.g '5,10-12,14'\n");
printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n");
printf(" GFX_BLOCK_ORDER - How blocks for transfers are ordered. 0=sequential, 1=interleaved\n");
printf(" GFX_BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n");
printf(" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d\n", TransferBench::GetIntAttribute(ATR_GFX_MAX_UNROLL));
printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays\n");
printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n");
printf(" GFX_WORD_SIZE - GFX kernel packed data size (4=DWORDx4, 2=DWORDx2, 1=DWORDx1)\n");
printf(" HIDE_ENV - Hide environment variable value listing\n");
#if NIC_EXEC_ENABLED
printf(" IB_GID_INDEX - Required for RoCE NICs (default=-1/auto)\n");
......@@ -383,6 +400,8 @@ public:
"%s", (cuMask.size() ? GetCuMaskDesc().c_str() : "All"));
Print("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
"%s", (fillPattern.size() ? getenv("FILL_PATTERN") : TransferBench::GetStrAttribute(ATR_SRC_PREP_DESCRIPTION).c_str()));
Print("GFX_BLOCK_ORDER", gfxBlockOrder,
"Thread block ordering: %s", gfxBlockOrder == 0 ? "Sequential" : "Interleaved");
Print("GFX_BLOCK_SIZE", gfxBlockSize,
"Threadblock size of %d", gfxBlockSize);
Print("GFX_SINGLE_TEAM", gfxSingleTeam,
......@@ -397,6 +416,9 @@ public:
gfxWaveOrder == 3 ? "Wavefront,CU,Unroll" :
gfxWaveOrder == 4 ? "CU,Unroll,Wavefront" :
"CU,Wavefront,Unroll"));
Print("GFX_WORD_SIZE", gfxWordSize,
"Using GFX word size of %d (DWORDx%d)", gfxWordSize, gfxWordSize);
#if NIC_EXEC_ENABLED
Print("IP_ADDRESS_FAMILY", ipAddressFamily,
"IP address family is set to IPv%d", ipAddressFamily);
......@@ -462,6 +484,31 @@ public:
return defaultValue;
}
static std::vector<int> GetEnvVarArray(std::string const& varname, std::vector<int> const& defaultValue)
{
if (getenv(varname.c_str())) {
char* rangeStr = getenv(varname.c_str());
std::set<int> values;
char* token = strtok(rangeStr, ",");
while (token) {
int start, end;
if (sscanf(token, "%d-%d", &start, &end) == 2) {
for (int i = start; i <= end; i++) values.insert(i);
} else if (sscanf(token, "%d", &start) == 1) {
values.insert(start);
} else {
printf("[ERROR] Unrecognized token [%s]\n", token);
exit(1);
}
token = strtok(NULL, ",");
}
std::vector<int> result;
for (auto v : values) result.push_back(v);
return result;
}
return defaultValue;
}
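// Example (hypothetical values): UNROLLS="1-4,8" parses to {1,2,3,4,8};
// the std::set de-duplicates entries and returns them in sorted order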
static std::string GetEnvVar(std::string const& varname, std::string const& defaultValue)
{
if (getenv(varname.c_str()))
......@@ -524,6 +571,7 @@ public:
cfg.dma.useHipEvents = useHipEvents;
cfg.dma.useHsaCopy = useHsaDma;
cfg.gfx.blockOrder = gfxBlockOrder;
cfg.gfx.blockSize = gfxBlockSize;
cfg.gfx.cuMask = cuMask;
cfg.gfx.prefXccTable = prefXccTable;
......@@ -532,12 +580,13 @@ public:
cfg.gfx.useMultiStream = !useSingleStream;
cfg.gfx.useSingleTeam = gfxSingleTeam;
cfg.gfx.waveOrder = gfxWaveOrder;
cfg.gfx.wordSize = gfxWordSize;
cfg.nic.ibGidIndex = ibGidIndex;
cfg.nic.ibPort = ibPort;
cfg.nic.ipAddressFamily = ipAddressFamily;
cfg.nic.useRelaxedOrder = nicRelaxedOrder;
cfg.nic.roceVersion = roceVersion;
cfg.nic.ibGidIndex = ibGidIndex;
cfg.nic.ibPort = ibPort;
cfg.nic.ipAddressFamily = ipAddressFamily;
cfg.nic.useRelaxedOrder = nicRelaxedOrder;
cfg.nic.roceVersion = roceVersion;
std::vector<int> closestNics;
if(closestNicStr != "") {
......
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "EnvVars.hpp"
void AllToAllRdmaPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
// Collect env vars for this preset
int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int numQueuePairs = EnvVars::GetEnvVar("NUM_QUEUE_PAIRS", 1);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);
// Print out environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
if (!ev.outputToCsv) printf("[AllToAll Network Related]\n");
ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus);
ev.Print("NUM_QUEUE_PAIRS", numQueuePairs, "Using %d queue pairs for NIC transfers", numQueuePairs);
ev.Print("USE_FINE_GRAIN" , useFineGrain , "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
printf("\n");
}
// Validate env vars
if (numGpus < 0 || numGpus > numDetectedGpus) {
printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
exit(1);
}
MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
std::map<std::pair<int, int>, int> reIndex;
std::vector<Transfer> transfers;
for (int i = 0; i < numGpus; i++) {
for (int j = 0; j < numGpus; j++) {
// Build Transfer and add it to list
TransferBench::Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
transfer.srcs.push_back({memType, i});
transfer.dsts.push_back({memType, j});
transfer.exeDevice = {EXE_NIC_NEAREST, i};
transfer.exeSubIndex = j;
transfer.numSubExecs = numQueuePairs;
reIndex[std::make_pair(i,j)] = transfers.size();
transfers.push_back(transfer);
}
}
printf("GPU-RDMA All-To-All benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes between all pairs of GPUs using %d QPs per Transfer (%lu Transfers)\n",
numBytesPerTransfer, numQueuePairs, transfers.size());
if (transfers.size() == 0) return;
// Execute Transfers
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
TransferBench::TestResults results;
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
for (auto const& err : results.errResults)
printf("%s\n", err.errMsg.c_str());
exit(0);
} else {
PrintResults(ev, 1, transfers, results);
}
// Print results
char separator = (ev.outputToCsv ? ',' : ' ');
printf("\nSummary: [%lu bytes per Transfer]\n", numBytesPerTransfer);
printf("==========================================================\n");
printf("SRC\\DST ");
for (int dst = 0; dst < numGpus; dst++)
printf("%cGPU %02d ", separator, dst);
printf(" %cSTotal %cActual\n", separator, separator);
double totalBandwidthGpu = 0.0;
double minActualBandwidth = std::numeric_limits<double>::max();
double maxActualBandwidth = 0.0;
std::vector<double> colTotalBandwidth(numGpus+2, 0.0);
for (int src = 0; src < numGpus; src++) {
double rowTotalBandwidth = 0;
int transferCount = 0;
double minBandwidth = std::numeric_limits<double>::max();
printf("GPU %02d", src);
for (int dst = 0; dst < numGpus; dst++) {
if (reIndex.count(std::make_pair(src, dst))) {
int const transferIdx = reIndex[std::make_pair(src,dst)];
TransferBench::TransferResult const& r = results.tfrResults[transferIdx];
colTotalBandwidth[dst] += r.avgBandwidthGbPerSec;
rowTotalBandwidth += r.avgBandwidthGbPerSec;
totalBandwidthGpu += r.avgBandwidthGbPerSec;
minBandwidth = std::min(minBandwidth, r.avgBandwidthGbPerSec);
transferCount++;
printf("%c%8.3f ", separator, r.avgBandwidthGbPerSec);
} else {
printf("%c%8s ", separator, "N/A");
}
}
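// "Actual" models the row as gated by its slowest Transfer: the minimum
// per-Transfer bandwidth scaled by the number of Transfers in the row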
double actualBandwidth = minBandwidth * transferCount;
printf(" %c%8.3f %c%8.3f\n", separator, rowTotalBandwidth, separator, actualBandwidth);
minActualBandwidth = std::min(minActualBandwidth, actualBandwidth);
maxActualBandwidth = std::max(maxActualBandwidth, actualBandwidth);
colTotalBandwidth[numGpus+1] += rowTotalBandwidth;
}
printf("\nRTotal");
for (int dst = 0; dst < numGpus; dst++) {
printf("%c%8.3f ", separator, colTotalBandwidth[dst]);
}
printf(" %c%8.3f %c%8.3f %c%8.3f\n", separator, colTotalBandwidth[numGpus+1],
separator, minActualBandwidth, separator, maxActualBandwidth);
printf("\n");
printf("Average bandwidth (Tx Thread Timed): %8.3f GB/s\n", totalBandwidthGpu / transfers.size());
printf("Aggregate bandwidth (Tx Thread Timed): %8.3f GB/s\n", totalBandwidthGpu);
printf("Aggregate bandwidth (CPU Timed): %8.3f GB/s\n", results.avgTotalBandwidthGbPerSec);
PrintErrors(results.errResults);
}
/*
Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#include "EnvVars.hpp"
void AllToAllSweepPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
enum
{
A2A_COPY = 0,
A2A_READ_ONLY = 1,
A2A_WRITE_ONLY = 2,
A2A_CUSTOM = 3,
};
char a2aModeStr[4][20] = {"Copy", "Read-Only", "Write-Only", "Custom"};
// Force single-stream mode for all-to-all benchmark
ev.useSingleStream = 1;
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
// Collect env vars for this preset
int a2aDirect = EnvVars::GetEnvVar("A2A_DIRECT" , 1);
int a2aLocal = EnvVars::GetEnvVar("A2A_LOCAL" , 0);
int numGpus = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus);
int useFineGrain = EnvVars::GetEnvVar("USE_FINE_GRAIN" , 1);
int useRemoteRead = EnvVars::GetEnvVar("USE_REMOTE_READ", 0);
int useSpray = EnvVars::GetEnvVar("USE_SPRAY", 0);
int verbose = EnvVars::GetEnvVar("VERBOSE", 0);
std::vector<int> unrollList = EnvVars::GetEnvVarArray("UNROLLS", {1,2,3,4,6,8});
std::vector<int> numCusList = EnvVars::GetEnvVarArray("NUM_CUS", {4,8,12,16,24,32});
// A2A_MODE may be 0,1,2 or else custom numSrcs:numDsts
int numSrcs, numDsts;
int a2aMode = 0;
if (getenv("A2A_MODE") && sscanf(getenv("A2A_MODE"), "%d:%d", &numSrcs, &numDsts) == 2) {
a2aMode = A2A_CUSTOM;
} else {
a2aMode = EnvVars::GetEnvVar("A2A_MODE", 0);
if (a2aMode < 0 || a2aMode > 2) {
printf("[ERROR] a2aMode must be between 0 and 2, or else numSrcs:numDsts\n");
exit(1);
}
numSrcs = (a2aMode == A2A_WRITE_ONLY ? 0 : 1);
numDsts = (a2aMode == A2A_READ_ONLY ? 0 : 1);
}
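// e.g., A2A_MODE=1 selects read-only; A2A_MODE="2:1" (hypothetical) selects a
// custom mode with 2 source reads and 1 destination write per Transfer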
// Print out environment variables
ev.DisplayEnvVars();
if (!ev.hideEnv) {
if (!ev.outputToCsv) printf("[AllToAll Related]\n");
ev.Print("A2A_DIRECT" , a2aDirect , a2aDirect ? "Only using direct links" : "Full all-to-all");
ev.Print("A2A_LOCAL" , a2aLocal , "%s local transfers", a2aLocal ? "Include" : "Exclude");
ev.Print("A2A_MODE" , (a2aMode == A2A_CUSTOM) ? std::to_string(numSrcs) + ":" + std::to_string(numDsts) : std::to_string(a2aMode),
(a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " +
std::to_string(numDsts) + " write(s)").c_str(): a2aModeStr[a2aMode]);
ev.Print("NUM_CUS" , numCusList.size(), EnvVars::ToStr(numCusList).c_str());
ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus);
ev.Print("UNROLLS" , unrollList.size(), EnvVars::ToStr(unrollList).c_str());
ev.Print("USE_FINE_GRAIN" , useFineGrain , "Using %s-grained memory", useFineGrain ? "fine" : "coarse");
ev.Print("USE_REMOTE_READ", useRemoteRead , "Using %s as executor", useRemoteRead ? "DST" : "SRC");
ev.Print("USE_SPRAY" , useSpray , "%s per CU", useSpray ? "All targets" : "One target");
ev.Print("VERBOSE" , verbose , verbose ? "Display test results" : "Display summary only");
printf("\n");
}
// Validate env vars
if (numGpus < 0 || numGpus > numDetectedGpus) {
printf("[ERROR] Cannot use %d GPUs. Detected %d GPUs\n", numGpus, numDetectedGpus);
exit(1);
}
if (useSpray && numDsts > 1) {
printf("[ERROR] Cannot use USE_SPRAY with multiple destination buffers\n");
exit(1);
}
// Collect the number of GPU devices to use
MemType memType = useFineGrain ? MEM_GPU_FINE : MEM_GPU;
ExeType exeType = EXE_GPU_GFX;
std::vector<Transfer> transfers;
int targetCount = 0;
if (!useSpray) {
// Each CU will work on just one target
for (int i = 0; i < numGpus; i++) {
targetCount = 0;
for (int j = 0; j < numGpus; j++) {
// Check whether or not to execute this pair
if (i == j) {
if (!a2aLocal) continue;
} else if (a2aDirect) {
#if !defined(__NVCC__)
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
if (hopCount != 1) continue;
#endif
}
// Build Transfer and add it to list
TransferBench::Transfer transfer;
targetCount++;
transfer.numBytes = numBytesPerTransfer;
for (int x = 0; x < numSrcs; x++) transfer.srcs.push_back({memType, i});
// When using multiple destinations, the additional destinations are "local"
if (numDsts) transfer.dsts.push_back({memType, j});
for (int x = 1; x < numDsts; x++) transfer.dsts.push_back({memType, i});
transfer.exeDevice = {exeType, (useRemoteRead ? j : i)};
transfer.exeSubIndex = -1;
transfers.push_back(transfer);
}
}
} else {
// Each CU will work on all targets
for (int i = 0; i < numGpus; i++) {
TransferBench::Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
transfer.exeDevice = {exeType, i};
transfer.exeSubIndex = -1;
targetCount = 0;
for (int j = 0; j < numGpus; j++) {
// Check whether or not to transfer to this GPU
if (i == j) {
if (!a2aLocal) continue;
} else if (a2aDirect) {
#if !defined(__NVCC__)
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
if (hopCount != 1) continue;
#endif
}
targetCount++;
for (int x = 0; x < numSrcs; x++) transfer.srcs.push_back({memType, useRemoteRead ? j : i});
if (numDsts) transfer.dsts.push_back({memType, j});
for (int x = 1; x < numDsts; x++) transfer.dsts.push_back({memType, i});
}
transfers.push_back(transfer);
}
}
printf("GPU-GFX All-To-All Sweep benchmark:\n");
printf("==========================\n");
printf("- Copying %lu bytes between %s pairs of GPUs\n", numBytesPerTransfer, a2aDirect ? "directly connected" : "all");
if (transfers.size() == 0) {
printf("[WARN} No transfers requested. Try adjusting A2A_DIRECT or A2A_LOCAL\n");
return;
}
// Execute Transfers
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
// Run tests
std::map<std::pair<int, int>, TransferBench::TestResults> results;
// Display summary
printf("#CUs\\Unroll");
for (int u : unrollList) {
printf(" %d(Min) ", u);
printf(" %d(Max) ", u);
}
printf("\n");
for (int c : numCusList) {
printf(" %5d ", c); fflush(stdout);
for (int u : unrollList) {
ev.gfxUnroll = cfg.gfx.unrollFactor = u;
for (auto& transfer : transfers)
transfer.numSubExecs = useSpray ? (c * targetCount) : c;
double minBandwidth = std::numeric_limits<double>::max();
double maxBandwidth = std::numeric_limits<double>::lowest();
TransferBench::TestResults result;
if (TransferBench::RunTransfers(cfg, transfers, result)) {
for (auto const& exeResult : result.exeResults) {
minBandwidth = std::min(minBandwidth, exeResult.second.avgBandwidthGbPerSec);
maxBandwidth = std::max(maxBandwidth, exeResult.second.avgBandwidthGbPerSec);
}
if (useSpray) {
minBandwidth *= targetCount;
maxBandwidth *= targetCount;
}
results[std::make_pair(c,u)] = result;
} else {
minBandwidth = 0.0;
maxBandwidth = 0.0;
}
printf(" %7.2f %7.2f ", minBandwidth, maxBandwidth); fflush(stdout);
}
printf("\n"); fflush(stdout);
}
if (verbose) {
int testNum = 0;
for (int c : numCusList) {
for (int u : unrollList) {
printf("CUs: %d Unroll %d\n", c, u);
PrintResults(ev, ++testNum, transfers, results[std::make_pair(c,u)]);
}
}
}
}
......@@ -24,6 +24,8 @@ THE SOFTWARE.
// Included after EnvVars and Executors
#include "AllToAll.hpp"
#include "AllToAllN.hpp"
#include "AllToAllSweep.hpp"
#include "HealthCheck.hpp"
#include "OneToAll.hpp"
#include "PeerToPeer.hpp"
......@@ -38,14 +40,16 @@ typedef void (*PresetFunc)(EnvVars& ev,
std::map<std::string, std::pair<PresetFunc, std::string>> presetFuncMap =
{
{"a2a", {AllToAllPreset, "Tests parallel transfers between all pairs of GPU devices"}},
{"healthcheck", {HealthCheckPreset,"Simple bandwidth health check (MI300X series only)"}},
{"one2all", {OneToAllPreset, "Test all subsets of parallel transfers from one GPU to all others"}},
{"p2p" , {PeerToPeerPreset, "Peer-to-peer device memory bandwidth test"}},
{"rsweep", {SweepPreset, "Randomly sweep through sets of Transfers"}},
{"scaling", {ScalingPreset, "Run scaling test from one GPU to other devices"}},
{"schmoo", {SchmooPreset, "Scaling tests for local/remote read/write/copy"}},
{"sweep", {SweepPreset, "Ordered sweep through sets of Transfers"}},
{"a2a", {AllToAllPreset, "Tests parallel transfers between all pairs of GPU devices"}},
{"a2a_n", {AllToAllRdmaPreset, "Tests parallel transfers between all pairs of GPU devices using Nearest NIC RDMA transfers"}},
{"a2asweep", {AllToAllSweepPreset, "Test GFX-based all-to-all transfers swept across different CU and GFX unroll counts"}},
{"healthcheck", {HealthCheckPreset, "Simple bandwidth health check (MI300X series only)"}},
{"one2all", {OneToAllPreset, "Test all subsets of parallel transfers from one GPU to all others"}},
{"p2p" , {PeerToPeerPreset, " Peer-to-peer device memory bandwidth test"}},
{"rsweep", {SweepPreset, "Randomly sweep through sets of Transfers"}},
{"scaling", {ScalingPreset, "Run scaling test from one GPU to other devices"}},
{"schmoo", {SchmooPreset, "Scaling tests for local/remote read/write/copy"}},
{"sweep", {SweepPreset, "Ordered sweep through sets of Transfers"}},
};
void DisplayPresets()
......
......@@ -41,9 +41,9 @@ static int RemappedCpuIndex(int origIdx)
static void PrintNicToGPUTopo(bool outputToCsv)
{
#ifdef NIC_EXEC_ENABLED
printf(" NIC | Device Name | Active | PCIe Bus ID | NUMA | Closest GPU(s)\n");
printf(" NIC | Device Name | Active | PCIe Bus ID | NUMA | Closest GPU(s) | GID Index | GID Descriptor\n");
if(!outputToCsv)
printf("-----+-------------+--------+--------------+------+---------------\n");
printf("-----+-------------+--------+--------------+------+----------------+-----------+-------------------\n");
int numGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
auto const& ibvDeviceList = GetIbvDeviceList();
......@@ -57,12 +57,15 @@ static void PrintNicToGPUTopo(bool outputToCsv)
}
}
printf(" %-3d | %-11s | %-6s | %-12s | %-4d | %-20s\n",
printf(" %-3d | %-11s | %-6s | %-12s | %-4d | %-14s | %-9s | %-20s\n",
i, ibvDeviceList[i].name.c_str(),
ibvDeviceList[i].hasActivePort ? "Yes" : "No",
ibvDeviceList[i].busId.c_str(),
ibvDeviceList[i].numaNode,
closestGpusStr.c_str());
closestGpusStr.c_str(),
ibvDeviceList[i].isRoce && ibvDeviceList[i].hasActivePort ? std::to_string(ibvDeviceList[i].gidIndex).c_str() : "N/A",
ibvDeviceList[i].isRoce && ibvDeviceList[i].hasActivePort ? ibvDeviceList[i].gidDescriptor.c_str() : "N/A"
);
}
printf("\n");
#endif
......
......@@ -22,11 +22,13 @@ THE SOFTWARE.
/// @cond
#pragma once
#include <algorithm>
#include <cstring>
#include <future>
#include <map>
#include <numa.h> // If not found, try installing libnuma-dev (e.g apt-get install libnuma-dev)
#include <numaif.h>
#include <random>
#include <set>
#include <sstream>
#include <stdarg.h>
......@@ -64,7 +66,7 @@ namespace TransferBench
using std::set;
using std::vector;
constexpr char VERSION[] = "1.60";
constexpr char VERSION[] = "1.61";
/**
* Enumeration of supported Executor types
......@@ -173,6 +175,7 @@ namespace TransferBench
*/
struct GfxOptions
{
int blockOrder = 0; ///< Determines how threadblocks are ordered (0=sequential, 1=interleaved, 2=random)
int blockSize = 256; ///< Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask = {}; ///< Bit-vector representing the CU mask
vector<vector<int>> prefXccTable = {}; ///< 2D table with preferred XCD to use for a specific [src][dst] GPU device
......@@ -181,6 +184,7 @@ namespace TransferBench
int useMultiStream = 0; ///< Use multiple streams for GFX
int useSingleTeam = 0; ///< Team all subExecutors across the data array
int waveOrder = 0; ///< GFX-kernel wavefront ordering
int wordSize = 4; ///< GFX-kernel packed data size (4=dwordx4, 2=dwordx2, 1=dwordx1)
};
/**
......@@ -233,6 +237,31 @@ namespace TransferBench
ERR_FATAL = 2, ///< Fatal error - results are invalid
};
/**
* Enumeration of GID priority
*
* @note These are the GID types ordered in priority from lowest (0) to highest
*/
enum GidPriority
{
UNKNOWN = -1, ///< Default
ROCEV1_LINK_LOCAL = 0, ///< RoCEv1 Link-local
ROCEV2_LINK_LOCAL = 1, ///< RoCEv2 Link-local fe80::/10
ROCEV1_IPV6 = 2, ///< RoCEv1 IPv6
ROCEV2_IPV6 = 3, ///< RoCEv2 IPv6
ROCEV1_IPV4 = 4, ///< RoCEv1 IPv4-mapped IPv6
ROCEV2_IPV4 = 5, ///< RoCEv2 IPv4-mapped IPv6 ::ffff:192.168.x.x
};
const char* GidPriorityStr[] = {
"RoCEv1 Link-local",
"RoCEv2 Link-local",
"RoCEv1 IPv6",
"RoCEv2 IPv6",
"RoCEv1 IPv4-mapped IPv6",
"RoCEv2 IPv4-mapped IPv6"
};
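// Indexed by GidPriority (>= 0); e.g., a RoCEv2 IPv4-mapped GID (priority 5)
// is preferred over a RoCEv1 link-local GID (priority 0)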
/**
* ErrResult consists of error type and error message
*/
......@@ -463,6 +492,14 @@ namespace TransferBench
#define hipStreamDestroy cudaStreamDestroy
#define hipStreamSynchronize cudaStreamSynchronize
// Define float2 addition operator for NVIDIA platform
__device__ inline float2& operator +=(float2& a, const float2& b)
{
a.x += b.x;
a.y += b.y;
return a;
}
// Define float4 addition operator for NVIDIA platform
__device__ inline float4& operator +=(float4& a, const float4& b)
{
......@@ -924,6 +961,13 @@ namespace {
errors.push_back({ERR_FATAL, "[data.byteOffset] must be positive multiple of %lu", sizeof(float)});
// Check GFX options
if (cfg.gfx.blockOrder < 0 || cfg.gfx.blockOrder > 2)
errors.push_back({ERR_FATAL,
"[gfx.blockOrder] must be 0 for sequential, 1 for interleaved, or 2 for random"});
if (cfg.gfx.useMultiStream && cfg.gfx.blockOrder > 0)
errors.push_back({ERR_WARN, "[gfx.blockOrder] will be ignored when running in multi-stream mode"});
int gfxMaxBlockSize = GetIntAttribute(ATR_GFX_MAX_BLOCKSIZE);
if (cfg.gfx.blockSize < 0 || cfg.gfx.blockSize % 64 || cfg.gfx.blockSize > gfxMaxBlockSize)
errors.push_back({ERR_FATAL,
......@@ -939,6 +983,9 @@ namespace {
errors.push_back({ERR_FATAL,
"[gfx.waveOrder] must be non-negative and less than 6"});
if (!(cfg.gfx.wordSize == 1 || cfg.gfx.wordSize == 2 || cfg.gfx.wordSize == 4))
errors.push_back({ERR_FATAL, "[gfx.wordSize] must be either 1, 2 or 4"});
int numGpus = GetNumExecutors(EXE_GPU_GFX);
int numXccs = GetNumExecutorSubIndices({EXE_GPU_GFX, 0});
vector<vector<int>> const& table = cfg.gfx.prefXccTable;
......@@ -1395,12 +1442,122 @@ namespace {
std::string busId;
bool hasActivePort;
int numaNode;
int gidIndex;
std::string gidDescriptor;
bool isRoce;
};
#endif
#ifdef NIC_EXEC_ENABLED
// Function to collect information about IBV devices
//========================================================================================
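// A GID is treated as unconfigured if it is all zeros, or if it is the bare
// link-local prefix fe80:: with a zero interface identifier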
static bool IsConfiguredGid(union ibv_gid const& gid)
{
const struct in6_addr *a = (struct in6_addr *) gid.raw;
int trailer = (a->s6_addr32[1] | a->s6_addr32[2] | a->s6_addr32[3]);
if (((a->s6_addr32[0] | trailer) == 0UL) ||
((a->s6_addr32[0] == htonl(0xfe800000)) && (trailer == 0UL))) {
return false;
}
return true;
}
static bool LinkLocalGid(union ibv_gid const& gid)
{
const struct in6_addr *a = (struct in6_addr *) gid.raw;
if (a->s6_addr32[0] == htonl(0xfe800000) && a->s6_addr32[1] == 0UL) {
return true;
}
return false;
}
static ErrResult GetRoceVersionNumber(struct ibv_context* const& context,
int const& portNum,
int const& gidIndex,
int& version)
{
char const* deviceName = ibv_get_device_name(context->device);
char gidRoceVerStr[16] = {};
char roceTypePath[PATH_MAX] = {};
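// e.g., /sys/class/infiniband/mlx5_0/ports/1/gid_attrs/types/3 (device name hypothetical)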
sprintf(roceTypePath, "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d",
deviceName, portNum, gidIndex);
int fd = open(roceTypePath, O_RDONLY);
if (fd == -1)
return {ERR_FATAL, "Failed while opening RoCE file path (%s)", roceTypePath};
int ret = read(fd, gidRoceVerStr, 15);
close(fd);
if (ret == -1)
return {ERR_FATAL, "Failed while reading RoCE version"};
if (strlen(gidRoceVerStr)) {
if (strncmp(gidRoceVerStr, "IB/RoCE v1", strlen("IB/RoCE v1")) == 0
|| strncmp(gidRoceVerStr, "RoCE v1", strlen("RoCE v1")) == 0) {
version = 1;
}
else if (strncmp(gidRoceVerStr, "RoCE v2", strlen("RoCE v2")) == 0) {
version = 2;
}
}
return ERR_NONE;
}
static bool IsIPv4MappedIPv6(const union ibv_gid &gid)
{
// look for ::ffff:x.x.x.x format
// From Broadcom documentation
// https://techdocs.broadcom.com/us/en/storage-and-ethernet-connectivity/ethernet-nic-controllers/bcm957xxx/adapters/frequently-asked-questions1.html
// "The IPv4 address is really an IPv4 address mapped into the IPv6 address space.
// This can be identified by 80 “0” bits, followed by 16 “1” bits (“FFFF” in hexadecimal)
// followed by the original 32-bit IPv4 address."
return (gid.global.subnet_prefix == 0 &&
gid.raw[8] == 0 &&
gid.raw[9] == 0 &&
gid.raw[10] == 0xff &&
gid.raw[11] == 0xff);
}
static ErrResult GetGidIndex(struct ibv_context* context,
int const& gidTblLen,
int const& portNum,
std::pair<int, std::string>& gidInfo)
{
if(gidInfo.first >= 0) return ERR_NONE; // honor user choice
union ibv_gid gid;
GidPriority highestPriority = GidPriority::UNKNOWN;
int gidIndex = -1;
for (int i = 0; i < gidTblLen; ++i) {
IBV_CALL(ibv_query_gid, context, portNum, i, &gid);
if (!IsConfiguredGid(gid)) continue;
int gidCurrRoceVersion;
if(GetRoceVersionNumber(context, portNum, i, gidCurrRoceVersion).errType != ERR_NONE) continue;
GidPriority currPriority;
if (IsIPv4MappedIPv6(gid)) {
currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_IPV4 : GidPriority::ROCEV1_IPV4;
} else if (!LinkLocalGid(gid)) {
currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_IPV6 : GidPriority::ROCEV1_IPV6;
} else {
currPriority = (gidCurrRoceVersion == 2) ? GidPriority::ROCEV2_LINK_LOCAL : GidPriority::ROCEV1_LINK_LOCAL;
}
if(currPriority > highestPriority) {
highestPriority = currPriority;
gidIndex = i;
}
}
if (highestPriority == GidPriority::UNKNOWN) {
gidInfo.first = -1;
return {ERR_FATAL, "Failed to auto-detect a valid GID index. Try setting it manually through IB_GID_INDEX"};
}
gidInfo.first = gidIndex;
gidInfo.second = GidPriorityStr[highestPriority];
return ERR_NONE;
}
static vector<IbvDevice>& GetIbvDeviceList()
{
static bool isInitialized = false;
......@@ -1425,12 +1582,25 @@ namespace {
if (context) {
struct ibv_device_attr deviceAttr;
if (!ibv_query_device(context, &deviceAttr)) {
int activePort;
ibvDevice.gidIndex = -1;
for (int port = 1; port <= deviceAttr.phys_port_cnt; ++port) {
struct ibv_port_attr portAttr;
if (ibv_query_port(context, port, &portAttr)) continue;
if (portAttr.state == IBV_PORT_ACTIVE)
if (portAttr.state == IBV_PORT_ACTIVE) {
activePort = port;
ibvDevice.hasActivePort = true;
break;
if(portAttr.link_layer == IBV_LINK_LAYER_ETHERNET) {
ibvDevice.isRoce = true;
std::pair<int, std::string> gidInfo (-1, "");
auto res = GetGidIndex(context, portAttr.gid_tbl_len, activePort, gidInfo);
if (res.errType == ERR_NONE) {
ibvDevice.gidIndex = gidInfo.first;
ibvDevice.gidDescriptor = gidInfo.second;
}
}
break;
}
}
}
ibv_close_device(context);
......@@ -1781,164 +1951,6 @@ namespace {
return ERR_NONE;
}
static bool IsConfiguredGid(union ibv_gid* gid)
{
const struct in6_addr *a = (struct in6_addr *)gid->raw;
int trailer = (a->s6_addr32[1] | a->s6_addr32[2] | a->s6_addr32[3]);
if (((a->s6_addr32[0] | trailer) == 0UL) ||
((a->s6_addr32[0] == htonl(0xfe800000)) && (trailer == 0UL))) {
return false;
}
return true;
}
static bool LinkLocalGid(union ibv_gid* gid)
{
const struct in6_addr *a = (struct in6_addr *)gid->raw;
if (a->s6_addr32[0] == htonl(0xfe800000) && a->s6_addr32[1] == 0UL) {
return true;
}
return false;
}
static bool IsValidGid(union ibv_gid* gid)
{
return (IsConfiguredGid(gid) && !LinkLocalGid(gid));
}
static sa_family_t GetGidAddressFamily(union ibv_gid* gid)
{
const struct in6_addr *a = (struct in6_addr *)gid->raw;
bool isIpV4Mapped = ((a->s6_addr32[0] | a->s6_addr32[1]) |
(a->s6_addr32[2] ^ htonl(0x0000ffff))) == 0UL;
bool isIpV4MappedMulticast = (a->s6_addr32[0] == htonl(0xff0e0000) &&
((a->s6_addr32[1] |
(a->s6_addr32[2] ^ htonl(0x0000ffff))) == 0UL));
return (isIpV4Mapped || isIpV4MappedMulticast) ? AF_INET : AF_INET6;
}
static bool MatchGidAddressFamily(sa_family_t const& af,
void* prefix,
int prefixLen,
union ibv_gid* gid)
{
struct in_addr *base = NULL;
struct in6_addr *base6 = NULL;
struct in6_addr *addr6 = NULL;;
if (af == AF_INET) {
base = (struct in_addr *)prefix;
} else {
base6 = (struct in6_addr *)prefix;
}
addr6 = (struct in6_addr *)gid->raw;
#define NETMASK(bits) (htonl(0xffffffff ^ ((1 << (32 - bits)) - 1)))
int i = 0;
while (prefixLen > 0 && i < 4) {
if (af == AF_INET) {
int mask = NETMASK(prefixLen);
if ((base->s_addr & mask) ^ (addr6->s6_addr32[3] & mask))
break;
prefixLen = 0;
break;
} else {
if (prefixLen >= 32) {
if (base6->s6_addr32[i] ^ addr6->s6_addr32[i])
break;
prefixLen -= 32;
++i;
} else {
int mask = NETMASK(prefixLen);
if ((base6->s6_addr32[i] & mask) ^ (addr6->s6_addr32[i] & mask))
break;
prefixLen = 0;
}
}
}
return (prefixLen == 0) ? true : false;
#undef NETMASK
}
static ErrResult GetRoceVersionNumber(struct ibv_context* const& context,
int const& portNum,
int const& gidIndex,
int* version)
{
char const* deviceName = ibv_get_device_name(context->device);
char gidRoceVerStr[16] = {};
char roceTypePath[PATH_MAX] = {};
sprintf(roceTypePath, "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/%d",
deviceName, portNum, gidIndex);
int fd = open(roceTypePath, O_RDONLY);
if (fd == -1)
return {ERR_FATAL, "Failed while opening RoCE file path (%s)", roceTypePath};
int ret = read(fd, gidRoceVerStr, 15);
close(fd);
if (ret == -1)
return {ERR_FATAL, "Failed while reading RoCE version"};
if (strlen(gidRoceVerStr)) {
if (strncmp(gidRoceVerStr, "IB/RoCE v1", strlen("IB/RoCE v1")) == 0
|| strncmp(gidRoceVerStr, "RoCE v1", strlen("RoCE v1")) == 0) {
*version = 1;
}
else if (strncmp(gidRoceVerStr, "RoCE v2", strlen("RoCE v2")) == 0) {
*version = 2;
}
}
return ERR_NONE;
}
static ErrResult GetGidIndex(ConfigOptions const& cfg,
struct ibv_context* context,
int const& gidTblLen,
int& gidIndex)
{
// Use GID index if user specified
if (gidIndex >= 0) return ERR_NONE;
// Try to find the best GID index
int port = cfg.nic.ibPort;
sa_family_t targetAddrFam = (cfg.nic.ipAddressFamily == 6)? AF_INET6 : AF_INET;
int targetRoCEVer = cfg.nic.roceVersion;
// Initially assume gidIndex = 0
int gidIndexCurr = 0;
union ibv_gid gidCurr;
IBV_CALL(ibv_query_gid, context, port, gidIndexCurr, &gidCurr);
sa_family_t gidCurrFam = GetGidAddressFamily(&gidCurr);
bool gidCurrIsValid = IsValidGid(&gidCurr);
int gidCurrRoceVersion;
ERR_CHECK(GetRoceVersionNumber(context, port, gidIndexCurr, &gidCurrRoceVersion));
// Loop over GID table to find the best match
for (int gidIndexTest = 1; gidIndexTest < gidTblLen; ++gidIndexTest) {
union ibv_gid gidTest;
IBV_CALL(ibv_query_gid, context, cfg.nic.ibPort, gidIndexTest, &gidTest);
if (!IsValidGid(&gidTest)) continue;
sa_family_t gidTestFam = GetGidAddressFamily(&gidTest);
bool gidTestMatchSubnet = MatchGidAddressFamily(targetAddrFam, NULL, 0, &gidTest);
int gidTestRoceVersion;
ERR_CHECK(GetRoceVersionNumber(context, port, gidIndexTest, &gidTestRoceVersion));
if (!gidCurrIsValid ||
(gidTestFam == targetAddrFam && gidTestMatchSubnet &&
(gidCurrFam != targetAddrFam || gidTestRoceVersion == targetRoCEVer))) {
// Switch to better match
gidIndexCurr = gidIndexTest;
gidCurrFam = gidTestFam;
gidCurrIsValid = true;
gidCurrRoceVersion = gidTestRoceVersion;
}
}
gidIndex = gidIndexCurr;
return ERR_NONE;
}
static ErrResult PrepareNicTransferResources(ConfigOptions const& cfg,
ExeDevice const& srcExeDevice,
Transfer const& t,
......@@ -2012,8 +2024,12 @@ namespace {
bool isRoCE = (rss.srcPortAttr.link_layer == IBV_LINK_LAYER_ETHERNET);
if (isRoCE) {
// Try to auto-detect the GID index
ERR_CHECK(GetGidIndex(cfg, rss.srcContext, rss.srcPortAttr.gid_tbl_len, srcGidIndex));
ERR_CHECK(GetGidIndex(cfg, rss.dstContext, rss.dstPortAttr.gid_tbl_len, dstGidIndex));
std::pair<int, std::string> srcGidInfo (srcGidIndex, "");
std::pair<int, std::string> dstGidInfo (dstGidIndex, "");
ERR_CHECK(GetGidIndex(rss.srcContext, rss.srcPortAttr.gid_tbl_len, cfg.nic.ibPort, srcGidInfo));
ERR_CHECK(GetGidIndex(rss.dstContext, rss.dstPortAttr.gid_tbl_len, cfg.nic.ibPort, dstGidInfo));
srcGidIndex = srcGidInfo.first;
dstGidIndex = dstGidInfo.first;
IBV_CALL(ibv_query_gid, rss.srcContext, port, srcGidIndex, &rss.srcGid);
IBV_CALL(ibv_query_gid, rss.dstContext, port, dstGidIndex, &rss.dstGid);
}
......@@ -2396,13 +2412,47 @@ namespace {
exeDevice.exeIndex));
#endif
int transferOffset = 0;
for (auto& rss : exeInfo.resources) {
Transfer const& t = transfers[rss.transferIdx];
rss.subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset;
for (auto p : rss.subExecParamCpu) {
if (cfg.gfx.useMultiStream || cfg.gfx.blockOrder == 0) {
// Threadblocks are ordered sequentially one transfer at a time
for (auto& rss : exeInfo.resources) {
Transfer const& t = transfers[rss.transferIdx];
rss.subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset;
for (auto p : rss.subExecParamCpu) {
rss.subExecIdx.push_back(exeInfo.subExecParamCpu.size());
exeInfo.subExecParamCpu.push_back(p);
transferOffset++;
}
}
} else if (cfg.gfx.blockOrder == 1) {
// Interleave threadblocks of different Transfers
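// e.g., Transfer A with 3 blocks and Transfer B with 2 blocks are
// launched in the order A0 B0 A1 B1 A2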
for (int subExecIdx = 0; exeInfo.subExecParamCpu.size() < exeInfo.totalSubExecs; ++subExecIdx) {
for (auto& rss : exeInfo.resources) {
Transfer const& t = transfers[rss.transferIdx];
if (subExecIdx < t.numSubExecs) {
rss.subExecIdx.push_back(exeInfo.subExecParamCpu.size());
exeInfo.subExecParamCpu.push_back(rss.subExecParamCpu[subExecIdx]);
}
}
}
} else if (cfg.gfx.blockOrder == 2) {
// Build randomized threadblock list
std::vector<std::pair<int,int>> indices;
for (int i = 0; i < exeInfo.resources.size(); i++) {
auto const& rss = exeInfo.resources[i];
Transfer const& t = transfers[rss.transferIdx];
for (int j = 0; j < t.numSubExecs; j++)
indices.push_back(std::make_pair(i,j));
}
std::random_device rd;
std::default_random_engine gen(rd());
std::shuffle(indices.begin(), indices.end(), gen);
// Append the threadblocks in the shuffled order
for (auto p : indices) {
auto& rss = exeInfo.resources[p.first];
rss.subExecIdx.push_back(exeInfo.subExecParamCpu.size());
exeInfo.subExecParamCpu.push_back(p);
transferOffset++;
exeInfo.subExecParamCpu.push_back(rss.subExecParamCpu[p.second]);
}
}
......@@ -2595,50 +2645,15 @@ namespace {
int const exeIndex,
TransferResources& rss)
{
auto cpuStart = std::chrono::high_resolution_clock::now();
// Switch to the closest NUMA node to this NIC
if (cfg.nic.useNuma) {
int numaNode = GetIbvDeviceList()[exeIndex].numaNode;
if (numaNode != -1)
numa_run_on_node(numaNode);
}
int subIteration = 0;
do {
// Loop over each of the queue pairs and post the send
ibv_send_wr* badWorkReq;
for (int qpIndex = 0; qpIndex < rss.qpCount; qpIndex++) {
int error = ibv_post_send(rss.srcQueuePairs[qpIndex], &rss.sendWorkRequests[qpIndex], &badWorkReq);
if (error)
return {ERR_FATAL, "Transfer %d: Error when calling ibv_post_send for QP %d Error code %d\n",
rss.transferIdx, qpIndex, error};
}
// Poll the completion queue until all queue pairs are complete
// The order of completion doesn't matter because this completion queue is dedicated to this Transfer
int numComplete = 0;
ibv_wc wc;
while (numComplete < rss.qpCount) {
int nc = ibv_poll_cq(rss.srcCompQueue, 1, &wc);
if (nc > 0) {
numComplete++;
if (wc.status != IBV_WC_SUCCESS) {
return {ERR_FATAL, "Transfer %d: Received unsuccessful work completion", rss.transferIdx};
}
} else if (nc < 0) {
return {ERR_FATAL, "Transfer %d: Received negative work completion", rss.transferIdx};
}
}
} while (++subIteration != cfg.general.numSubIterations);
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0) {
rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration)
rss.perIterMsec.push_back(deltaMsec);
// Loop over each of the queue pairs and post the send
ibv_send_wr* badWorkReq;
for (int qpIndex = 0; qpIndex < rss.qpCount; qpIndex++) {
int error = ibv_post_send(rss.srcQueuePairs[qpIndex], &rss.sendWorkRequests[qpIndex], &badWorkReq);
if (error)
return {ERR_FATAL, "Transfer %d: Error when calling ibv_post_send for QP %d Error code %d\n",
rss.transferIdx, qpIndex, error};
}
return ERR_NONE;
}
......@@ -2649,26 +2664,59 @@ namespace {
int const exeIndex,
ExeInfo& exeInfo)
{
vector<std::future<ErrResult>> asyncTransfers;
auto cpuStart = std::chrono::high_resolution_clock::now();
for (int i = 0; i < exeInfo.resources.size(); i++) {
asyncTransfers.emplace_back(std::async(std::launch::async,
ExecuteNicTransfer,
iteration,
std::cref(cfg),
exeIndex,
std::ref(exeInfo.resources[i])));
// Switch to the closest NUMA node to this NIC
if (cfg.nic.useNuma) {
int numaNode = GetIbvDeviceList()[exeIndex].numaNode;
if (numaNode != -1)
numa_run_on_node(numaNode);
}
for (auto& asyncTransfer : asyncTransfers)
ERR_CHECK(asyncTransfer.get());
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0)
exeInfo.totalDurationMsec += deltaMsec;
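// Each sub-iteration posts all sends up front, then polls every Transfer's
// dedicated completion queue, timing each Transfer from post to last completion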
int subIterations = 0;
do {
auto cpuStart = std::chrono::high_resolution_clock::now();
size_t completedTransfers = 0;
auto transferCount = exeInfo.resources.size();
std::vector<int> receivedQPs(transferCount); // completions received per Transfer
std::vector<std::chrono::high_resolution_clock::time_point> transferTimers(transferCount);
// post the sends
for (auto i = 0; i < transferCount; i++) {
transferTimers[i] = std::chrono::high_resolution_clock::now();
ERR_CHECK(ExecuteNicTransfer(iteration, cfg, exeIndex, exeInfo.resources[i]));
}
// poll for completions
do {
for (auto i = 0; i < transferCount; i++) {
if(receivedQPs[i] < exeInfo.resources[i].qpCount) {
auto& rss = exeInfo.resources[i];
// Poll the completion queue until all queue pairs are complete
// The order of completion doesn't matter because this completion queue is dedicated to this Transfer
ibv_wc wc;
int nc = ibv_poll_cq(rss.srcCompQueue, 1, &wc);
if (nc > 0) {
receivedQPs[i]++;
if (wc.status != IBV_WC_SUCCESS) {
return {ERR_FATAL, "Transfer %d: Received unsuccessful work completion", rss.transferIdx};
}
} else if (nc < 0) {
return {ERR_FATAL, "Transfer %d: Received negative work completion", rss.transferIdx};
}
if(receivedQPs[i] == rss.qpCount) {
auto cpuDelta = std::chrono::high_resolution_clock::now() - transferTimers[i];
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0) {
rss.totalDurationMsec += deltaMsec;
if (cfg.general.recordPerIteration)
rss.perIterMsec.push_back(deltaMsec);
}
completedTransfers++;
}
}
}
} while(completedTransfers < transferCount);
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
if (iteration >= 0)
exeInfo.totalDurationMsec += deltaMsec;
} while(++subIterations < cfg.general.numSubIterations);
return ERR_NONE;
}
#endif
......@@ -2704,13 +2752,16 @@ namespace {
// Helper function for memset
template <typename T> __device__ __forceinline__ T MemsetVal();
template <> __device__ __forceinline__ float MemsetVal(){ return MEMSET_VAL; };
template <> __device__ __forceinline__ float2 MemsetVal(){ return make_float2(MEMSET_VAL,
MEMSET_VAL); };
template <> __device__ __forceinline__ float4 MemsetVal(){ return make_float4(MEMSET_VAL,
MEMSET_VAL,
MEMSET_VAL,
MEMSET_VAL); }
// Kernel for GFX execution
template <int BLOCKSIZE, int UNROLL>
// Kernel for GFX execution
template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL>
__global__ void __launch_bounds__(BLOCKSIZE)
GpuReduceKernel(SubExecParam* params, int waveOrder, int numSubIterations)
{
......@@ -2729,10 +2780,10 @@ namespace {
// Collect data information
int32_t const numSrcs = p.numSrcs;
int32_t const numDsts = p.numDsts;
float4 const* __restrict__ srcFloat4[MAX_SRCS];
float4* __restrict__ dstFloat4[MAX_DSTS];
for (int i = 0; i < numSrcs; i++) srcFloat4[i] = (float4*)p.src[i];
for (int i = 0; i < numDsts; i++) dstFloat4[i] = (float4*)p.dst[i];
PACKED_FLOAT const* __restrict__ srcFloatPacked[MAX_SRCS];
PACKED_FLOAT* __restrict__ dstFloatPacked[MAX_DSTS];
for (int i = 0; i < numSrcs; i++) srcFloatPacked[i] = (PACKED_FLOAT const*)p.src[i];
for (int i = 0; i < numDsts; i++) dstFloatPacked[i] = (PACKED_FLOAT*)p.dst[i];
// Operate on wavefront granularity
int32_t const nTeams = p.teamSize; // Number of threadblocks working together on this subarray
......@@ -2741,7 +2792,7 @@ namespace {
int32_t const waveIdx = threadIdx.x / warpSize; // Index of this wavefront within the threadblock
int32_t const tIdx = threadIdx.x % warpSize; // Thread index within wavefront
size_t const numFloat4 = p.N / 4;
size_t const numPackedFloat = p.N / (sizeof(PACKED_FLOAT)/sizeof(float));
int32_t teamStride, waveStride, unrlStride, teamStride2, waveStride2;
switch (waveOrder) {
......@@ -2755,64 +2806,64 @@ namespace {
int subIterations = 0;
while (1) {
// First loop: Each wavefront in the team works on UNROLL float4s per thread
// First loop: Each wavefront in the team works on UNROLL PACKED_FLOAT per thread
size_t const loop1Stride = nTeams * nWaves * UNROLL * warpSize;
size_t const loop1Limit = numFloat4 / loop1Stride * loop1Stride;
size_t const loop1Limit = numPackedFloat / loop1Stride * loop1Stride;
{
float4 val[UNROLL];
PACKED_FLOAT val[UNROLL];
if (numSrcs == 0) {
#pragma unroll
for (int u = 0; u < UNROLL; u++)
val[u] = MemsetVal<float4>();
val[u] = MemsetVal<PACKED_FLOAT>();
}
for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx; idx < loop1Limit; idx += loop1Stride) {
// Read sources into memory and accumulate in registers
if (numSrcs) {
for (int u = 0; u < UNROLL; u++)
val[u] = srcFloat4[0][idx + u * unrlStride * warpSize];
val[u] = srcFloatPacked[0][idx + u * unrlStride * warpSize];
for (int s = 1; s < numSrcs; s++)
for (int u = 0; u < UNROLL; u++)
val[u] += srcFloat4[s][idx + u * unrlStride * warpSize];
val[u] += srcFloatPacked[s][idx + u * unrlStride * warpSize];
}
// Write accumulation to all outputs
for (int d = 0; d < numDsts; d++) {
#pragma unroll
for (int u = 0; u < UNROLL; u++)
dstFloat4[d][idx + u * unrlStride * warpSize] = val[u];
dstFloatPacked[d][idx + u * unrlStride * warpSize] = val[u];
}
}
}
// Second loop: Deal with remaining float4s
// Second loop: Deal with remaining PACKED_FLOAT
{
if (loop1Limit < numFloat4) {
float4 val;
if (numSrcs == 0) val = MemsetVal<float4>();
if (loop1Limit < numPackedFloat) {
PACKED_FLOAT val;
if (numSrcs == 0) val = MemsetVal<PACKED_FLOAT>();
size_t const loop2Stride = nTeams * nWaves * warpSize;
for (size_t idx = loop1Limit + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx;
idx < numFloat4; idx += loop2Stride) {
idx < numPackedFloat; idx += loop2Stride) {
if (numSrcs) {
val = srcFloat4[0][idx];
val = srcFloatPacked[0][idx];
for (int s = 1; s < numSrcs; s++)
val += srcFloat4[s][idx];
val += srcFloatPacked[s][idx];
}
for (int d = 0; d < numDsts; d++)
dstFloat4[d][idx] = val;
dstFloatPacked[d][idx] = val;
}
}
}
// Third loop; Deal with remaining floats
{
if (numFloat4 * 4 < p.N) {
if (numPackedFloat * (sizeof(PACKED_FLOAT)/sizeof(float)) < p.N) {
float val;
if (numSrcs == 0) val = MemsetVal<float>();
size_t const loop3Stride = nTeams * nWaves * warpSize;
for ( size_t idx = numFloat4 * 4 + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride) {
for (size_t idx = numPackedFloat * (sizeof(PACKED_FLOAT)/sizeof(float)) + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride) {
if (numSrcs) {
val = p.src[0][idx];
for (int s = 1; s < numSrcs; s++)
......@@ -2839,19 +2890,24 @@ namespace {
}
}
#define GPU_KERNEL_UNROLL_DECL(BLOCKSIZE) \
{GpuReduceKernel<BLOCKSIZE, 1>, \
GpuReduceKernel<BLOCKSIZE, 2>, \
GpuReduceKernel<BLOCKSIZE, 3>, \
GpuReduceKernel<BLOCKSIZE, 4>, \
GpuReduceKernel<BLOCKSIZE, 5>, \
GpuReduceKernel<BLOCKSIZE, 6>, \
GpuReduceKernel<BLOCKSIZE, 7>, \
GpuReduceKernel<BLOCKSIZE, 8>}
#define GPU_KERNEL_DWORD_DECL(BLOCKSIZE, UNROLL) \
{GpuReduceKernel<float, BLOCKSIZE, UNROLL>, \
GpuReduceKernel<float2, BLOCKSIZE, UNROLL>, \
GpuReduceKernel<float4, BLOCKSIZE, UNROLL>}
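// Word-size index within each entry: 0 = float (DWORDx1), 1 = float2 (DWORDx2), 2 = float4 (DWORDx4)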
// Table of all GPU Reduction kernel functions (templated blocksize / unroll)
#define GPU_KERNEL_UNROLL_DECL(BLOCKSIZE) \
{GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 1), \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 2), \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 3), \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 4), \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 5), \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 6), \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 7), \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 8)}
// Table of all GPU Reduction kernel functions (templated blocksize / unroll / dword size)
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int);
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL] =
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL][3] =
{
GPU_KERNEL_UNROLL_DECL(64),
GPU_KERNEL_UNROLL_DECL(128),
......@@ -2879,18 +2935,19 @@ namespace {
dim3 const gridSize(xccDim, numSubExecs, 1);
dim3 const blockSize(cfg.gfx.blockSize, 1);
int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 :
cfg.gfx.wordSize == 2 ? 1 :
2;
auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx];
#if defined(__NVCC__)
if (startEvent != NULL)
ERR_CHECK(hipEventRecord(startEvent, stream));
GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1]
<<<gridSize, blockSize, 0, stream>>>
(rss.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations);
gpuKernel<<<gridSize, blockSize, 0, stream>>>(rss.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations);
if (stopEvent != NULL)
ERR_CHECK(hipEventRecord(stopEvent, stream));
#else
hipExtLaunchKernelGGL(GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1],
gridSize, blockSize, 0, stream, startEvent, stopEvent,
hipExtLaunchKernelGGL(gpuKernel, gridSize, blockSize, 0, stream, startEvent, stopEvent,
0, rss.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations);
#endif
......@@ -2954,19 +3011,19 @@ namespace {
dim3 const blockSize(cfg.gfx.blockSize, 1);
hipStream_t stream = exeInfo.streams[0];
int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 :
cfg.gfx.wordSize == 2 ? 1 :
2;
auto gpuKernel = GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx];
#if defined(__NVCC__)
if (cfg.gfx.useHipEvents)
ERR_CHECK(hipEventRecord(exeInfo.startEvents[0], stream));
GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1]
<<<gridSize, blockSize, 0 , stream>>>
(exeInfo.subExecParamGpu, cfg.gfx.waveOrder, cfg.general.numSubIterations);
gpuKernel<<<gridSize, blockSize, 0 , stream>>>(exeInfo.subExecParamGpu, cfg.gfx.waveOrder, cfg.general.numSubIterations);
if (cfg.gfx.useHipEvents)
ERR_CHECK(hipEventRecord(exeInfo.stopEvents[0], stream));
#else
hipExtLaunchKernelGGL(GpuKernelTable[cfg.gfx.blockSize/64 - 1][cfg.gfx.unrollFactor - 1],
gridSize, blockSize, 0, stream,
hipExtLaunchKernelGGL(gpuKernel, gridSize, blockSize, 0, stream,
cfg.gfx.useHipEvents ? exeInfo.startEvents[0] : NULL,
cfg.gfx.useHipEvents ? exeInfo.stopEvents[0] : NULL, 0,
exeInfo.subExecParamGpu, cfg.gfx.waveOrder, cfg.general.numSubIterations);
......