Unverified Commit f33c7fd9 authored by gilbertlee-amd, committed by GitHub

V1.45 candidate (#78)

* v1.45 New GFX kernel
parent 33a5435c
......@@ -3,6 +3,16 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.45
### Additions
* Adding A2A_MODE to a2a preset (0 = copy, 1 = read-only, 2 = write-only)
* Adding GFX_UNROLL to modify GFX kernel's unroll factor
* Adding GFX_WAVE_ORDER to modify order in which wavefronts process data
### Modifications
* Rewrote the GFX reduction kernel to support new wave ordering
## v1.44
### Additions
......
......@@ -158,7 +158,7 @@ int main(int argc, char **argv)
}
ev.DisplayRemoteWriteEnvVars();
int numSubExecs = (argc > 3 ? atoi(argv[3]) : 8);
int numSubExecs = (argc > 3 ? atoi(argv[3]) : 4);
int srcIdx = (argc > 4 ? atoi(argv[4]) : 0);
int minGpus = (argc > 5 ? atoi(argv[5]) : 1);
int maxGpus = (argc > 6 ? atoi(argv[6]) : std::min(ev.numGpuDevices - 1, 3));
......@@ -611,16 +611,26 @@ void ExecuteTransfers(EnvVars const& ev,
transfer->executorBandwidth = exeBandwidthGbs;
totalCUs += transfer->numSubExecs;
char exeSubIndexStr[32] = "";
if (ev.useXccFilter)
{
if (transfer->exeSubIndex == -1)
sprintf(exeSubIndexStr, ".*");
else
sprintf(exeSubIndexStr, ".%d", transfer->exeSubIndex);
}
if (!verbose) continue;
if (!ev.outputToCsv)
{
printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n",
printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d%s:%03d -> %s\n",
transfer->transferIndex,
transfer->transferBandwidth,
transfer->transferTime,
transfer->numBytesActual,
transfer->SrcToStr().c_str(),
ExeTypeName[transfer->exeType], transfer->exeIndex,
exeSubIndexStr,
transfer->numSubExecs,
transfer->DstToStr().c_str());
......@@ -668,10 +678,10 @@ void ExecuteTransfers(EnvVars const& ev,
}
else
{
printf("%d,%d,%lu,%s,%c%02d,%s,%d,%.3f,%.3f,%s,%s\n",
printf("%d,%d,%lu,%s,%c%02d%s,%s,%d,%.3f,%.3f,%s,%s\n",
testNum, transfer->transferIndex, transfer->numBytesActual,
transfer->SrcToStr().c_str(),
MemTypeStr[transfer->exeType], transfer->exeIndex,
MemTypeStr[transfer->exeType], transfer->exeIndex, exeSubIndexStr,
transfer->DstToStr().c_str(),
transfer->numSubExecs,
transfer->transferBandwidth, transfer->transferTime,
......@@ -699,14 +709,24 @@ void ExecuteTransfers(EnvVars const& ev,
transfer->executorBandwidth = transfer->transferBandwidth;
maxGpuTime = std::max(maxGpuTime, transfer->transferTime);
if (!verbose) continue;
char exeSubIndexStr[32] = "";
if (ev.useXccFilter)
{
if (transfer->exeSubIndex == -1)
sprintf(exeSubIndexStr, ".*");
else
sprintf(exeSubIndexStr, ".%d", transfer->exeSubIndex);
}
if (!ev.outputToCsv)
{
printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n",
printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d%s:%03d -> %s\n",
transfer->transferIndex,
transfer->transferBandwidth, transfer->transferTime,
transfer->numBytesActual,
transfer->SrcToStr().c_str(),
ExeTypeName[transfer->exeType], transfer->exeIndex,
ExeTypeName[transfer->exeType], transfer->exeIndex, exeSubIndexStr,
transfer->numSubExecs,
transfer->DstToStr().c_str());
......@@ -753,10 +773,10 @@ void ExecuteTransfers(EnvVars const& ev,
}
else
{
printf("%d,%d,%lu,%s,%s%02d,%s,%d,%.3f,%.3f,%s,%s\n",
printf("%d,%d,%lu,%s,%s%02d%s,%s,%d,%.3f,%.3f,%s,%s\n",
testNum, transfer->transferIndex, transfer->numBytesActual,
transfer->SrcToStr().c_str(),
ExeTypeName[transfer->exeType], transfer->exeIndex,
ExeTypeName[transfer->exeType], transfer->exeIndex, exeSubIndexStr,
transfer->DstToStr().c_str(),
transfer->numSubExecs,
transfer->transferBandwidth, transfer->transferTime,
......@@ -1436,15 +1456,16 @@ void RunTransfer(EnvVars const& ev, int const iteration,
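// The GFX kernel variant is selected from GpuKernelTable, indexed by wavefronts per
// threadblock and by unroll factor: GpuKernelTable[gfxBlockSize/warpSize - 1][gfxUnroll - 1].
// For example, GFX_BLOCK_SIZE=256 with warpSize=64 and GFX_UNROLL=4 selects entry [3][3];
// GFX_WAVE_ORDER is passed to the kernel as a runtime argument rather than compiled in.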
#if defined(__NVCC__)
HIP_CALL(hipEventRecord(startEvent, stream));
GpuKernelTable[ev.gpuKernel]<<<numBlocksToRun, ev.blockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr);
GpuKernelTable[ev.gfxBlockSize/warpSize - 1][ev.gfxUnroll - 1]
<<<numBlocksToRun, ev.gfxBlockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr, ev.waveOrder);
HIP_CALL(hipEventRecord(stopEvent, stream));
#else
hipExtLaunchKernelGGL(GpuKernelTable[ev.gpuKernel],
hipExtLaunchKernelGGL(GpuKernelTable[ev.gfxBlockSize/warpSize - 1][ev.gfxUnroll - 1],
dim3(numXCCs, numBlocksToRun, 1),
dim3(ev.blockSize, 1, 1),
dim3(ev.gfxBlockSize, 1, 1),
ev.sharedMemBytes, stream,
startEvent, stopEvent,
0, transfer->subExecParamGpuPtr);
0, transfer->subExecParamGpuPtr, ev.gfxWaveOrder);
#endif
// Synchronize per iteration, unless in single sync mode, in which case
// synchronize during last warmup / last actual iteration
......@@ -1947,8 +1968,8 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
Transfer transfer;
transfer.numBytes = numBytesPerTransfer;
transfer.numSubExecs = numSubExecs;
transfer.numSrcs = 1;
transfer.numDsts = 1;
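// A2A_MODE: 0 = copy (one src, one dst), 1 = read-only (no dst), 2 = write-only (no src)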
transfer.numSrcs = ev.a2aMode == 2 ? 0 : 1;
transfer.numDsts = ev.a2aMode == 1 ? 0 : 1;
transfer.exeType = EXE_GPU_GFX;
transfer.exeSubIndex = -1;
transfer.srcType.resize(1, ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
......@@ -2065,22 +2086,35 @@ void Transfer::PrepareSubExecParams(EnvVars const& ev)
size_t assigned = 0;
for (int i = 0; i < this->numSubExecs; ++i)
{
SubExecParam& p = this->subExecParam[i];
p.numSrcs = this->numSrcs;
p.numDsts = this->numDsts;
if (ev.gfxSingleTeam && this->exeType == EXE_GPU_GFX)
{
p.N = N;
p.teamSize = this->numSubExecs;
p.teamIdx = i;
for (int iSrc = 0; iSrc < this->numSrcs; ++iSrc) p.src[iSrc] = this->srcMem[iSrc] + initOffset;
for (int iDst = 0; iDst < this->numDsts; ++iDst) p.dst[iDst] = this->dstMem[iDst] + initOffset;
}
else
{
int const subExecLeft = std::max(0, maxSubExecToUse - i);
size_t const leftover = N - assigned;
size_t const roundedN = (leftover + targetMultiple - 1) / targetMultiple;
SubExecParam& p = this->subExecParam[i];
p.N = subExecLeft ? std::min(leftover, ((roundedN / subExecLeft) * targetMultiple)) : 0;
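// Example: with leftover = 1000 elements, targetMultiple = 64 and subExecLeft = 4,
// roundedN = 16 and p.N = min(1000, (16/4)*64) = 256 elements for this subexecutor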
p.numSrcs = this->numSrcs;
p.numDsts = this->numDsts;
for (int iSrc = 0; iSrc < this->numSrcs; ++iSrc)
p.src[iSrc] = this->srcMem[iSrc] + assigned + initOffset;
for (int iDst = 0; iDst < this->numDsts; ++iDst)
p.dst[iDst] = this->dstMem[iDst] + assigned + initOffset;
p.teamSize = 1;
p.teamIdx = 0;
for (int iSrc = 0; iSrc < this->numSrcs; ++iSrc) p.src[iSrc] = this->srcMem[iSrc] + initOffset + assigned;
for (int iDst = 0; iDst < this->numDsts; ++iDst) p.dst[iDst] = this->dstMem[iDst] + initOffset + assigned;
p.preferredXccId = -1;
assigned += p.N;
}
p.preferredXccId = -1;
if (ev.useXccFilter && this->exeType == EXE_GPU_GFX)
{
std::uniform_int_distribution<int> distribution(0, ev.xccIdsPerDevice[this->exeIndex].size() - 1);
......@@ -2109,7 +2143,6 @@ void Transfer::PrepareSubExecParams(EnvVars const& ev)
p.startCycle = 0;
p.stopCycle = 0;
assigned += p.N;
}
this->transferTime = 0.0;
......@@ -2178,7 +2211,7 @@ bool Transfer::PrepareSrc(EnvVars const& ev)
int const deviceIdx = RemappedIndex(this->srcIndex[srcIdx], false);
HIP_CALL(hipSetDevice(deviceIdx));
if (ev.usePrepSrcKernel)
PrepSrcDataKernel<<<32, ev.blockSize>>>(srcPtr, N, srcIdx);
PrepSrcDataKernel<<<32, ev.gfxBlockSize>>>(srcPtr, N, srcIdx);
else
HIP_CALL(hipMemcpy(srcPtr, reference.data(), this->numBytesActual, hipMemcpyDefault));
HIP_CALL(hipDeviceSynchronize());
......@@ -2424,17 +2457,22 @@ void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer
char memType = ev.useFineGrain ? 'F' : 'G';
printf("Bytes to write: %lu from GPU %d using %d CUs [Sweeping %d to %d parallel writes]\n", numBytesPerTransfer, srcIdx, numSubExecs, minGpus, maxGpus);
char sep = (ev.outputToCsv ? ',' : ' ');
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (i == srcIdx) continue;
printf(" GPU %3d ", i);
printf(" GPU %-3d %c", i, sep);
}
printf("\n");
if (!ev.outputToCsv)
{
for (int i = 0; i < ev.numGpuDevices-1; i++)
{
printf("-------------");
}
printf("\n");
}
for (int p = minGpus; p <= maxGpus; p++)
{
......@@ -2469,11 +2507,12 @@ void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (bitmask & (1<<i))
printf(" %8.3f ", transfers[counter++].transferBandwidth);
printf(" %8.3f %c", transfers[counter++].transferBandwidth, sep);
else if (i != srcIdx)
printf(" ");
printf(" %c", sep);
}
printf(" %d %d", p, numSubExecs);
for (auto i = 0; i < transfers.size(); i++)
{
printf(" (N0 G%d %c%d)", srcIdx, MemTypeStr[transfers[i].dstType[0]], transfers[i].dstIndex[0]);
......@@ -2481,9 +2520,7 @@ void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer
printf("\n");
}
}
printf("\n");
}
}
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExecs, int const numCpuSubExecs, bool const isRandom)
......
......@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.44"
#define TB_VERSION "1.45"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
......@@ -75,11 +75,14 @@ public:
// Environment variables
int alwaysValidate; // Validate after each iteration instead of once after all iterations
int blockSize; // Size of each threadblock (must be multiple of 64)
int blockBytes; // Each CU, except the last, gets a multiple of this many bytes to copy
int blockBytes; // Each subexecutor, except the last, gets a multiple of this many bytes to copy
int blockOrder; // How blocks are ordered in single-stream mode (0=Sequential, 1=Interleaved, 2=Random)
int byteOffset; // Byte-offset for memory allocations
int continueOnError; // Continue tests even after mismatch detected
int gfxBlockSize; // Size of each GFX-kernel threadblock (must be multiple of 64)
int gfxSingleTeam; // Have all subExecutors work as a single team across the data array
int gfxUnroll; // GFX-kernel unroll factor
int gfxWaveOrder; // GFX-kernel wavefront ordering
int hideEnv; // Skip printing environment variables
int numCpuDevices; // Number of CPU devices to use (defaults to # NUMA nodes detected)
int numGpuDevices; // Number of GPU devices to use (defaults to # HIP devices detected)
......@@ -123,10 +126,10 @@ public:
// Environment variables only for A2A preset
int a2aDirect; // Only execute on links that are directly connected
int a2aMode; // Perform 0=copy, 1=read-only, 2=write-only
// Developer features
int enableDebug; // Enable debug output
int gpuKernel; // Which GPU kernel to use
// Used to track current configuration mode
ConfigModeEnum configMode;
......@@ -167,19 +170,22 @@ public:
// Different hardware prefers different default GFX unroll factors
// This performance difference is generally only noticeable when executing with fewer CUs
int defaultGpuKernel = 0;
if (archName == "gfx906") defaultGpuKernel = 13;
else if (archName == "gfx90a") defaultGpuKernel = 9;
else if (archName == "gfx940") defaultGpuKernel = 6;
else if (archName == "gfx941") defaultGpuKernel = 6;
else if (archName == "gfx942") defaultGpuKernel = 3;
int defaultGfxUnroll = 4;
if (archName == "gfx906") defaultGfxUnroll = 13;
else if (archName == "gfx90a") defaultGfxUnroll = 9;
else if (archName == "gfx940") defaultGfxUnroll = 6;
else if (archName == "gfx941") defaultGfxUnroll = 6;
else if (archName == "gfx942") defaultGfxUnroll = 4;
alwaysValidate = GetEnvVar("ALWAYS_VALIDATE" , 0);
blockSize = GetEnvVar("BLOCK_SIZE" , 256);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
blockOrder = GetEnvVar("BLOCK_ORDER" , 0);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
continueOnError = GetEnvVar("CONTINUE_ON_ERROR" , 0);
gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256);
gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 0);
gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll);
gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER" , 0);
hideEnv = GetEnvVar("HIDE_ENV" , 0);
numCpuDevices = GetEnvVar("NUM_CPU_DEVICES" , numDetectedCpus);
numGpuDevices = GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus);
......@@ -196,7 +202,6 @@ public:
useXccFilter = GetEnvVar("USE_XCC_FILTER" , 0);
validateDirect = GetEnvVar("VALIDATE_DIRECT" , 0);
enableDebug = GetEnvVar("DEBUG" , 0);
gpuKernel = GetEnvVar("GPU_KERNEL" , defaultGpuKernel);
// P2P Benchmark related
useDmaCopy = GetEnvVar("USE_GPU_DMA" , 0); // Needed for numGpuSubExec
......@@ -221,6 +226,7 @@ public:
// A2A Benchmark related
a2aDirect = GetEnvVar("A2A_DIRECT" , 1);
a2aMode = GetEnvVar("A2A_MODE" , 0);
// Determine random seed
char *sweepSeedStr = getenv("SWEEP_SEED");
......@@ -401,14 +407,14 @@ public:
printf("[ERROR] Number of GPUs to use (%d) cannot exceed number of detected GPUs (%d)\n", numGpuDevices, numDetectedGpus);
exit(1);
}
if (blockSize % 64)
if (gfxBlockSize % 64)
{
printf("[ERROR] BLOCK_SIZE (%d) must be a multiple of 64\n", blockSize);
printf("[ERROR] GFX_BLOCK_SIZE (%d) must be a multiple of 64\n", gfxBlockSize);
exit(1);
}
if (blockSize > MAX_BLOCKSIZE)
if (gfxBlockSize > MAX_BLOCKSIZE)
{
printf("[ERROR] BLOCK_SIZE (%d) must be less than %d\n", blockSize, MAX_BLOCKSIZE);
printf("[ERROR] GFX_BLOCK_SIZE (%d) must not exceed %d\n", gfxBlockSize, MAX_BLOCKSIZE);
exit(1);
}
if (byteOffset % sizeof(float))
......@@ -494,9 +500,22 @@ public:
exit(1);
}
}
if (gpuKernel < 0 || gpuKernel > NUM_GPU_KERNELS)
if (a2aMode < 0 || a2aMode > 2)
{
printf("[ERROR] A2A_MODE must be between 0 and 2\n");
exit(1);
}
if (gfxUnroll < 1 || gfxUnroll > MAX_UNROLL)
{
printf("[ERROR] GFX_UNROLL must be between 1 and %d\n", MAX_UNROLL);
exit(1);
}
if (gfxWaveOrder < 0 || gfxWaveOrder >= 6)
{
printf("[ERROR] GPU kernel must be between 0 and %d\n", NUM_GPU_KERNELS);
printf("[ERROR] GFX_WAVE_ORDER must be between 0 and 5\n");
exit(1);
}
......@@ -533,6 +552,12 @@ public:
exit(1);
}
if (getenv("GPU_KERNEL"))
{
printf("[WARN] GPU_KERNEL has been deprecated and replaced by GFX_UNROLL and GFX_WAVE_ORDER\n");
exit(1);
}
char* enableSdma = getenv("HSA_ENABLE_SDMA");
if (enableSdma && !strcmp(enableSdma, "0"))
{
......@@ -553,6 +578,9 @@ public:
printf(" CONTINUE_ON_ERROR - Continue tests even after mismatch detected\n");
printf(" CU_MASK - CU mask for streams specified in hex digits (0-9,a-f,A-F)\n");
printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
printf(" GFX_UNROLL - Unroll factor for GFX kernel (1 to %d)\n", MAX_UNROLL);
printf(" GFX_SINGLE_TEAM - Have subexecutors work together on the full array instead of on individual disjoint subarrays\n");
printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n");
printf(" HIDE_ENV - Hide environment variable value listing\n");
printf(" NUM_CPU_DEVICES=X - Restrict number of CPUs to X. May not be greater than # detected NUMA nodes\n");
printf(" NUM_GPU_DEVICES=X - Restrict number of GPUs to X. May not be greater than # detected HIP devices\n");
......@@ -589,10 +617,9 @@ public:
else if (!hideEnv)
printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION);
if (hideEnv) return;
PRINT_EV("ALWAYS_VALIDATE", alwaysValidate,
std::string("Validating after ") + (alwaysValidate ? "each iteration" : "all iterations"));
PRINT_EV("BLOCK_SIZE", blockSize,
std::string("Threadblock size of " + std::to_string(blockSize)));
PRINT_EV("BLOCK_BYTES", blockBytes,
std::string("Each CU gets a multiple of " + std::to_string(blockBytes) + " bytes to copy"));
PRINT_EV("BLOCK_ORDER", blockOrder,
......@@ -607,8 +634,20 @@ public:
(cuMask.size() ? GetCuMaskDesc() : "All"));
PRINT_EV("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
(fillPattern.size() ? std::string(getenv("FILL_PATTERN")) : PrepSrcValueString()));
PRINT_EV("GPU_KERNEL", gpuKernel,
std::string("Using GPU kernel ") + std::to_string(gpuKernel) + " [" + std::string(GpuKernelNames[gpuKernel]) + "]");
PRINT_EV("GFX_BLOCK_SIZE", gfxBlockSize,
std::string("Threadblock size of " + std::to_string(gfxBlockSize)));
PRINT_EV("GFX_SINGLE_TEAM", gfxSingleTeam,
(gfxSingleTeam ? std::string("Combining CUs to work across the entire data array") :
std::string("Each CU operates on its own disjoint subarray")));
PRINT_EV("GFX_UNROLL", gfxUnroll,
std::string("Using GFX unroll factor of ") + std::to_string(gfxUnroll));
PRINT_EV("GFX_WAVE_ORDER", gfxWaveOrder,
std::string("Using GFX wave ordering of ") + std::string((gfxWaveOrder == 0 ? "Unroll,Wavefront,CU" :
gfxWaveOrder == 1 ? "Unroll,CU,Wavefront" :
gfxWaveOrder == 2 ? "Wavefront,Unroll,CU" :
gfxWaveOrder == 3 ? "Wavefront,CU,Unroll" :
gfxWaveOrder == 4 ? "CU,Unroll,Wavefront" :
"CU,Wavefront,Unroll")));
PRINT_EV("NUM_CPU_DEVICES", numCpuDevices,
std::string("Using ") + std::to_string(numCpuDevices) + " CPU devices");
PRINT_EV("NUM_GPU_DEVICES", numGpuDevices,
......@@ -722,6 +761,10 @@ public:
printf("[AllToAll Related]\n");
PRINT_EV("A2A_DIRECT", a2aDirect,
std::string(a2aDirect ? "Only using direct links" : "Full all-to-all"));
PRINT_EV("A2A_MODE", a2aMode,
std::string(a2aMode == 0 ? "Perform copy" :
a2aMode == 1 ? "Perform read-only" :
"Perform write-only"));
PRINT_EV("USE_FINE_GRAIN", useFineGrain,
std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
PRINT_EV("USE_REMOTE_READ", useRemoteRead,
......
......@@ -23,12 +23,16 @@ THE SOFTWARE.
#pragma once
#define PackedFloat_t float4
#define WARP_SIZE 64
#define MAX_BLOCKSIZE 512
#define FLOATS_PER_PACK (sizeof(PackedFloat_t) / sizeof(float))
#define MEMSET_CHAR 75
#define MEMSET_VAL 13323083.0f
#define MAX_WAVEGROUPS (MAX_BLOCKSIZE / warpSize)
#define MAX_UNROLL 8
#define NUM_WAVEORDERS 6
// Each subExecutor is provided with subarrays to work on
#define MAX_SRCS 16
#define MAX_DSTS 16
......@@ -42,6 +46,10 @@ struct SubExecParam
float* dst[MAX_DSTS]; // Destination array pointers
uint32_t preferredXccId; // XCC ID to execute on
// Prepared
int teamSize; // Size of the team this subexecutor is part of
int teamIdx; // Index of this subexecutor within the team
// Outputs
long long startCycle; // Start timestamp for in-kernel timing (GPU-GFX executor)
long long stopCycle; // Stop timestamp for in-kernel timing (GPU-GFX executor)
......@@ -130,321 +138,161 @@ template <typename T> __device__ __forceinline__ T MemsetVal();
template <> __device__ __forceinline__ float MemsetVal(){ return MEMSET_VAL; };
template <> __device__ __forceinline__ float4 MemsetVal(){ return make_float4(MEMSET_VAL, MEMSET_VAL, MEMSET_VAL, MEMSET_VAL); }
// GPU reduction kernel - 3 loops: unrolled float4s, remaining float4s, remaining floats
template <int LOOP1_UNROLL>
__global__ void __launch_bounds__(MAX_BLOCKSIZE)
GpuReduceKernel(SubExecParam* params)
template <int BLOCKSIZE, int UNROLL>
__global__ void __launch_bounds__(BLOCKSIZE)
GpuReduceKernel(SubExecParam* params, int waveOrder)
{
int64_t startCycle;
if (threadIdx.x == 0) startCycle = wall_clock64();
SubExecParam& p = params[blockIdx.y];
// Filter by XCC if desired
int xccId;
// (Experimental) Filter by XCC if desired
int32_t xccId;
GetXccId(xccId);
if (p.preferredXccId != -1 && xccId != p.preferredXccId) return;
// Collect data information
int32_t const numSrcs = p.numSrcs;
int32_t const numDsts = p.numDsts;
float4 const* __restrict__ srcFloat4[MAX_SRCS];
float4* __restrict__ dstFloat4[MAX_DSTS];
for (int i = 0; i < numSrcs; i++) srcFloat4[i] = (float4*)p.src[i];
for (int i = 0; i < numDsts; i++) dstFloat4[i] = (float4*)p.dst[i];
// Operate on wavefront granularity
int const numSrcs = p.numSrcs;
int const numDsts = p.numDsts;
int const waveId = threadIdx.x / WARP_SIZE; // Wavefront number
int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront
// 1st loop - each wavefront operates on LOOP1_UNROLL x FLOATS_PER_PACK per thread per iteration
// Determine the number of packed floats processed by the first loop
size_t Nrem = p.N;
size_t const loop1Npack = (Nrem / (FLOATS_PER_PACK * LOOP1_UNROLL * WARP_SIZE)) * (LOOP1_UNROLL * WARP_SIZE);
size_t const loop1Nelem = loop1Npack * FLOATS_PER_PACK;
size_t const loop1Inc = blockDim.x * LOOP1_UNROLL;
size_t loop1Offset = waveId * LOOP1_UNROLL * WARP_SIZE + threadId;
while (loop1Offset < loop1Npack)
{
PackedFloat_t vals[LOOP1_UNROLL] = {};
int32_t const nTeams = p.teamSize; // Number of threadblocks working together on this subarray
int32_t const teamIdx = p.teamIdx; // Index of this threadblock within the team
int32_t const nWaves = BLOCKSIZE / warpSize; // Number of wavefronts within this threadblock
int32_t const waveIdx = threadIdx.x / warpSize; // Index of this wavefront within the threadblock
int32_t const tIdx = threadIdx.x % warpSize; // Thread index within wavefront
if (numSrcs == 0)
{
#pragma unroll
for (int u = 0; u < LOOP1_UNROLL; ++u) vals[u] = MemsetVal<float4>();
}
else
{
for (int i = 0; i < numSrcs; ++i)
{
PackedFloat_t const* __restrict__ packedSrc = (PackedFloat_t const*)(p.src[i]) + loop1Offset;
#pragma unroll
for (int u = 0; u < LOOP1_UNROLL; ++u)
vals[u] += *(packedSrc + u * WARP_SIZE);
}
}
size_t const numFloat4 = p.N / 4;
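// p.N counts floats; numFloat4 is the number of packed float4 elements.
// Any tail of (p.N % 4) floats is handled by the third loop below.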
int32_t const nFlt4PerWave = warpSize * 4;
for (int i = 0; i < numDsts; ++i)
int32_t teamStride, waveStride, unrlStride, teamStride2, waveStride2;
switch (waveOrder)
{
PackedFloat_t* __restrict__ packedDst = (PackedFloat_t*)(p.dst[i]) + loop1Offset;
#pragma unroll
for (int u = 0; u < LOOP1_UNROLL; ++u) *(packedDst + u * WARP_SIZE) = vals[u];
case 0: /* U,W,C */ unrlStride = 1; waveStride = UNROLL; teamStride = UNROLL * nWaves; teamStride2 = nWaves; waveStride2 = 1 ; break;
case 1: /* U,C,W */ unrlStride = 1; teamStride = UNROLL; waveStride = UNROLL * nTeams; teamStride2 = 1; waveStride2 = nTeams; break;
case 2: /* W,U,C */ waveStride = 1; unrlStride = nWaves; teamStride = nWaves * UNROLL; teamStride2 = nWaves; waveStride2 = 1 ; break;
case 3: /* W,C,U */ waveStride = 1; teamStride = nWaves; unrlStride = nWaves * nTeams; teamStride2 = nWaves; waveStride2 = 1 ; break;
case 4: /* C,U,W */ teamStride = 1; unrlStride = nTeams; waveStride = nTeams * UNROLL; teamStride2 = 1; waveStride2 = nTeams; break;
case 5: /* C,W,U */ teamStride = 1; waveStride = nTeams; unrlStride = nTeams * nWaves; teamStride2 = 1; waveStride2 = nTeams; break;
}
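// The three strides interleave work across unroll iterations (U), wavefronts (W) and
// threadblocks/CUs in the team (C); waveOrder picks which dimension advances fastest.
// Example for waveOrder 0 (U,W,C) with UNROLL=4 and nWaves=4: unrlStride=1, waveStride=4,
// teamStride=16, where each stride unit spans warpSize float4s.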
loop1Offset += loop1Inc;
}
Nrem -= loop1Nelem;
if (Nrem > 0)
{
// 2nd loop - Each thread operates on FLOATS_PER_PACK per iteration
// NOTE: Using int32_t due to smaller size requirements
int32_t const loop2Npack = Nrem / FLOATS_PER_PACK;
int32_t const loop2Nelem = loop2Npack * FLOATS_PER_PACK;
int32_t const loop2Inc = blockDim.x;
int32_t loop2Offset = threadIdx.x;
while (loop2Offset < loop2Npack)
// First loop: Each wavefront in the team works on UNROLL float4s per thread
size_t const loop1Stride = nTeams * nWaves * UNROLL * warpSize;
size_t const loop1Limit = numFloat4 / loop1Stride * loop1Stride;
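// Example: numFloat4 = 10000, nTeams = 1, nWaves = 4, UNROLL = 4, warpSize = 64
// gives loop1Stride = 1024 and loop1Limit = 9216; the remaining 784 float4s fall to loop 2.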
{
PackedFloat_t val;
float4 val[UNROLL];
if (numSrcs == 0)
{
val = MemsetVal<float4>();
}
else
{
val = {};
for (int i = 0; i < numSrcs; ++i)
{
PackedFloat_t const* __restrict__ packedSrc = (PackedFloat_t const*)(p.src[i] + loop1Nelem) + loop2Offset;
val += *packedSrc;
}
#pragma unroll
for (int u = 0; u < UNROLL; u++)
val[u] = MemsetVal<float4>();
}
for (int i = 0; i < numDsts; ++i)
for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx; idx < loop1Limit; idx += loop1Stride)
{
PackedFloat_t* __restrict__ packedDst = (PackedFloat_t*)(p.dst[i] + loop1Nelem) + loop2Offset;
*packedDst = val;
}
loop2Offset += loop2Inc;
// Read sources into memory and accumulate in registers
if (numSrcs)
{
for (int u = 0; u < UNROLL; u++)
val[u] = srcFloat4[0][idx + u * unrlStride * warpSize];
for (int s = 1; s < numSrcs; s++)
for (int u = 0; u < UNROLL; u++)
val[u] += srcFloat4[s][idx + u * unrlStride * warpSize];
}
Nrem -= loop2Nelem;
// Deal with leftovers (less than FLOATS_PER_PACK)
if (threadIdx.x < Nrem)
// Write accumulation to all outputs
for (int d = 0; d < numDsts; d++)
{
int offset = loop1Nelem + loop2Nelem + threadIdx.x;
float val = 0;
if (numSrcs == 0)
{
val = MEMSET_VAL;
#pragma unroll
for (int u = 0; u < UNROLL; u++)
dstFloat4[d][idx + u * unrlStride * warpSize] = val[u];
}
else
{
for (int i = 0; i < numSrcs; ++i)
val += p.src[i][offset];
}
for (int i = 0; i < numDsts; ++i)
p.dst[i][offset] = val;
}
}
__syncthreads();
if (threadIdx.x == 0)
// Second loop: Deal with remaining float4s
{
__threadfence_system();
p.stopCycle = wall_clock64();
p.startCycle = startCycle;
p.xccId = xccId;
__trace_hwreg();
}
}
template <typename FLOAT_TYPE, int UNROLL_FACTOR>
__device__ size_t GpuReduceFuncImpl2(SubExecParam const &p, size_t const offset, size_t const N)
{
int constexpr numFloatsPerPack = sizeof(FLOAT_TYPE) / sizeof(float); // Number of floats handled at a time per thread
size_t constexpr loopPackInc = blockDim.x * UNROLL_FACTOR;
size_t constexpr numPacksPerWave = WARP_SIZE * UNROLL_FACTOR;
int const waveId = threadIdx.x / WARP_SIZE; // Wavefront number
int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront
int const numSrcs = p.numSrcs;
int const numDsts = p.numDsts;
size_t const numPacksDone = (numFloatsPerPack == 1 && UNROLL_FACTOR == 1) ? N : (N / (FLOATS_PER_PACK * numPacksPerWave)) * numPacksPerWave;
size_t const numFloatsLeft = N - numPacksDone * numFloatsPerPack;
size_t loopPackOffset = waveId * numPacksPerWave + threadId;
while (loopPackOffset < numPacksDone)
if (loop1Limit < numFloat4)
{
FLOAT_TYPE vals[UNROLL_FACTOR];
float4 val;
if (numSrcs == 0) val = MemsetVal<float4>();
if (numSrcs == 0)
{
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u) vals[u] = MemsetVal<FLOAT_TYPE>();
}
else
size_t const loop2Stride = nTeams * nWaves * warpSize;
for (size_t idx = loop1Limit + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < numFloat4; idx += loop2Stride)
{
FLOAT_TYPE const* __restrict__ src0Ptr = ((FLOAT_TYPE const*)(p.src[0] + offset)) + loopPackOffset;
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
vals[u] = *(src0Ptr + u * WARP_SIZE);
for (int i = 1; i < numSrcs; ++i)
if (numSrcs)
{
FLOAT_TYPE const* __restrict__ srcPtr = ((FLOAT_TYPE const*)(p.src[i] + offset)) + loopPackOffset;
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
vals[u] += *(srcPtr + u * WARP_SIZE);
}
val = srcFloat4[0][idx];
for (int s = 1; s < numSrcs; s++)
val += srcFloat4[s][idx];
}
for (int i = 0; i < numDsts; ++i)
{
FLOAT_TYPE* __restrict__ dstPtr = (FLOAT_TYPE*)(p.dst[i + offset]) + loopPackOffset;
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
*(dstPtr + u * WARP_SIZE) = vals[u];
for (int d = 0; d < numDsts; d++)
dstFloat4[d][idx] = val;
}
loopPackOffset += loopPackInc;
}
return numFloatsLeft;
}
template <typename FLOAT_TYPE, int UNROLL_FACTOR>
__device__ size_t GpuReduceFuncImpl(SubExecParam const &p, size_t const offset, size_t const N)
{
// Each thread in the block works on UNROLL_FACTOR FLOAT_TYPEs during each iteration of the loop
int constexpr numFloatsPerRead = sizeof(FLOAT_TYPE) / sizeof(float);
size_t const numFloatsPerInnerLoop = blockDim.x * numFloatsPerRead;
size_t const numFloatsPerOuterLoop = numFloatsPerInnerLoop * UNROLL_FACTOR;
size_t const numFloatsLeft = (numFloatsPerRead == 1 && UNROLL_FACTOR == 1) ? 0 : N % numFloatsPerOuterLoop;
size_t const numFloatsDone = N - numFloatsLeft;
int const numSrcs = p.numSrcs;
int const numDsts = p.numDsts;
for (size_t idx = threadIdx.x * numFloatsPerRead; idx < numFloatsDone; idx += numFloatsPerOuterLoop)
{
FLOAT_TYPE tmp[UNROLL_FACTOR];
if (numSrcs == 0)
{
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
tmp[u] = MemsetVal<FLOAT_TYPE>();
}
else
{
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
tmp[u] = *((FLOAT_TYPE*)(&p.src[0][offset + idx + u * numFloatsPerInnerLoop]));
for (int i = 1; i < numSrcs; ++i)
// Third loop: Deal with remaining floats
{
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
tmp[u] += *((FLOAT_TYPE*)(&p.src[i][offset + idx + u * numFloatsPerInnerLoop]));
}
}
if (numFloat4 * 4 < p.N)
{
float val;
if (numSrcs == 0) val = MemsetVal<float>();
for (int i = 0; i < numDsts; ++i)
size_t const loop3Stride = nTeams * nWaves * warpSize;
for (size_t idx = numFloat4 * 4 + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride)
{
for (int u = 0; u < UNROLL_FACTOR; ++u)
if (numSrcs)
{
*((FLOAT_TYPE*)(&p.dst[i][offset + idx + u * numFloatsPerInnerLoop])) = tmp[u];
val = p.src[0][idx];
for (int s = 1; s < numSrcs; s++)
val += p.src[s][idx];
}
for (int d = 0; d < numDsts; d++)
p.dst[d][idx] = val;
}
}
return numFloatsLeft;
}
template <typename FLOAT_TYPE>
__device__ size_t GpuReduceFunc(SubExecParam const &p, size_t const offset, size_t const N, int const unroll)
{
switch (unroll)
{
case 1: return GpuReduceFuncImpl<FLOAT_TYPE, 1>(p, offset, N);
case 2: return GpuReduceFuncImpl<FLOAT_TYPE, 2>(p, offset, N);
case 3: return GpuReduceFuncImpl<FLOAT_TYPE, 3>(p, offset, N);
case 4: return GpuReduceFuncImpl<FLOAT_TYPE, 4>(p, offset, N);
case 5: return GpuReduceFuncImpl<FLOAT_TYPE, 5>(p, offset, N);
case 6: return GpuReduceFuncImpl<FLOAT_TYPE, 6>(p, offset, N);
case 7: return GpuReduceFuncImpl<FLOAT_TYPE, 7>(p, offset, N);
case 8: return GpuReduceFuncImpl<FLOAT_TYPE, 8>(p, offset, N);
case 9: return GpuReduceFuncImpl<FLOAT_TYPE, 9>(p, offset, N);
case 10: return GpuReduceFuncImpl<FLOAT_TYPE, 10>(p, offset, N);
case 11: return GpuReduceFuncImpl<FLOAT_TYPE, 11>(p, offset, N);
case 12: return GpuReduceFuncImpl<FLOAT_TYPE, 12>(p, offset, N);
case 13: return GpuReduceFuncImpl<FLOAT_TYPE, 13>(p, offset, N);
case 14: return GpuReduceFuncImpl<FLOAT_TYPE, 14>(p, offset, N);
case 15: return GpuReduceFuncImpl<FLOAT_TYPE, 15>(p, offset, N);
case 16: return GpuReduceFuncImpl<FLOAT_TYPE, 16>(p, offset, N);
default: return GpuReduceFuncImpl<FLOAT_TYPE, 1>(p, offset, N);
}
}
// GPU copy kernel
__global__ void __launch_bounds__(MAX_BLOCKSIZE)
GpuReduceKernel2(SubExecParam* params)
{
int64_t startCycle = wall_clock64();
SubExecParam& p = params[blockIdx.y];
size_t numFloatsLeft = GpuReduceFunc<float4>(p, 0, p.N, 8);
if (numFloatsLeft)
numFloatsLeft = GpuReduceFunc<float4>(p, p.N - numFloatsLeft, numFloatsLeft, 1);
if (numFloatsLeft)
GpuReduceFunc<float>(p, p.N - numFloatsLeft, numFloatsLeft, 1);
__threadfence_system();
// Wait for all threads to finish
__syncthreads();
if (threadIdx.x == 0)
{
p.startCycle = startCycle;
__threadfence_system();
p.stopCycle = wall_clock64();
p.startCycle = startCycle;
p.xccId = xccId;
__trace_hwreg();
}
}
#define NUM_GPU_KERNELS 18
typedef void (*GpuKernelFuncPtr)(SubExecParam*);
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int);
GpuKernelFuncPtr GpuKernelTable[NUM_GPU_KERNELS] =
{
GpuReduceKernel<8>,
GpuReduceKernel<1>,
GpuReduceKernel<2>,
GpuReduceKernel<3>,
GpuReduceKernel<4>,
GpuReduceKernel<5>,
GpuReduceKernel<6>,
GpuReduceKernel<7>,
GpuReduceKernel<8>,
GpuReduceKernel<9>,
GpuReduceKernel<10>,
GpuReduceKernel<11>,
GpuReduceKernel<12>,
GpuReduceKernel<13>,
GpuReduceKernel<14>,
GpuReduceKernel<15>,
GpuReduceKernel<16>,
GpuReduceKernel2
};
#define GPU_KERNEL_UNROLL_DECL(BLOCKSIZE) \
{GpuReduceKernel<BLOCKSIZE, 1>, \
GpuReduceKernel<BLOCKSIZE, 2>, \
GpuReduceKernel<BLOCKSIZE, 3>, \
GpuReduceKernel<BLOCKSIZE, 4>, \
GpuReduceKernel<BLOCKSIZE, 5>, \
GpuReduceKernel<BLOCKSIZE, 6>, \
GpuReduceKernel<BLOCKSIZE, 7>, \
GpuReduceKernel<BLOCKSIZE, 8>}
std::string GpuKernelNames[NUM_GPU_KERNELS] =
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL] =
{
"Default - 8xUnroll",
"Unroll x1",
"Unroll x2",
"Unroll x3",
"Unroll x4",
"Unroll x5",
"Unroll x6",
"Unroll x7",
"Unroll x8",
"Unroll x9",
"Unroll x10",
"Unroll x11",
"Unroll x12",
"Unroll x13",
"Unroll x14",
"Unroll x15",
"Unroll x16",
"8xUnrollB",
GPU_KERNEL_UNROLL_DECL(64),
GPU_KERNEL_UNROLL_DECL(128),
GPU_KERNEL_UNROLL_DECL(192),
GPU_KERNEL_UNROLL_DECL(256),
GPU_KERNEL_UNROLL_DECL(320),
GPU_KERNEL_UNROLL_DECL(384),
GPU_KERNEL_UNROLL_DECL(448),
GPU_KERNEL_UNROLL_DECL(512)
};
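// GpuKernelTable[w][u] holds the kernel compiled for (w+1) wavefronts per threadblock
// (i.e. a block size of (w+1)*64 threads) and an unroll factor of u+1; the launch code
// indexes it as GpuKernelTable[gfxBlockSize/warpSize - 1][gfxUnroll - 1].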