Unverified Commit cc0e9cb4 authored by gilbertlee-amd, committed by GitHub

TransferBench v1.11 (#9)

* Adding MIMO support, DMA executor, Null memory type
parent 3b47b874
# Changelog for TransferBench
## v1.11
### Added
- New multi-input / multi-output (MIMO) support. Transfers can now reduce (element-wise sum) multiple input memory arrays
  and write the sums to multiple outputs
- New GPU-DMA executor 'D' (uses hipMemcpy for SDMA copies). Previously this was enabled via USE_HIP_CALL, which applied
  to all GPU executors globally; the new executor allows GPU-GFX kernels to run in parallel with GPU-DMA copies.
- GPU-DMA executor can only be used for single-input/single-output Transfers
- GPU-DMA executor can only be associated with one SubExecutor
- Added new "Null" memory type 'N', which represents empty memory. This allows for read-only or write-only Transfers
- Added new GPU_KERNEL environment variable that allows for switching between various GPU-GFX reduction kernels
### Optimized
- Slightly improved GPU-GFX kernel performance based on hardware architecture when running with fewer CUs
### Changed
- Updated the example.cfg file to cover the new features
- Updated output to support MIMO
- Renamed CUs / CPU threads to SubExecutors for consistency
- Sweep Preset:
- Default sweep preset executors now include DMA
- P2P Benchmarks:
- Now only works via "p2p". Removed "p2p_rr", "g2g" and "g2g_rr".
- Setting NUM_CPU_DEVICES=0 can be used to only benchmark GPU devices (like "g2g")
- New environment variable USE_REMOTE_READ replaces "_rr" presets
- New environment variable USE_GPU_DMA=1 replaces USE_HIP_CALL=1 for benchmarking with GPU-DMA Executor
- Number of GPU SubExecutors for benchmark can be specified via NUM_GPU_SE
- Defaults to all CUs for GPU-GFX, 1 for GPU-DMA
- Number of CPU SubExecutors for benchmark can be specified via NUM_CPU_SE
- Pseudo-random input pattern has been slightly adjusted so that each input array within the same Transfer gets a different pattern
### Removed
- USE_HIP_CALL has been removed. Use GPU-DMA executor 'D' or set USE_GPU_DMA=1 for P2P benchmark presets
- Currently, a warning is issued and the program terminates if USE_HIP_CALL is set to 1
- Removed NUM_CPU_PER_TRANSFER - The number of CPU SubExecutors will be whatever is specified for the Transfer
- Removed USE_MEMSET environment variable. This can now be done via a Transfer using the null memory type
## v1.10
### Fixed
- Fix incorrect bandwidth calculation when using single stream mode and per-Transfer data sizes
......
/*
Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......@@ -22,66 +22,145 @@ THE SOFTWARE.
#pragma once
#define PackedFloat_t float4
#define WARP_SIZE 64
#define BLOCKSIZE 256
#define FLOATS_PER_PACK (sizeof(PackedFloat_t) / sizeof(float))
#define MEMSET_CHAR 75
#define MEMSET_VAL 13323083.0f
// GPU copy kernel
__global__ void __launch_bounds__(BLOCKSIZE)
GpuCopyKernel(BlockParam* blockParams)
// Each subExecutor is provided with subarrays to work on
#define MAX_SRCS 16
#define MAX_DSTS 16
struct SubExecParam
{
#define PackedFloat_t float4
#define FLOATS_PER_PACK (sizeof(PackedFloat_t) / sizeof(float))
size_t N; // Number of floats this subExecutor works on
int numSrcs; // Number of source arrays
int numDsts; // Number of destination arrays
float* src[MAX_SRCS]; // Source array pointers
float* dst[MAX_DSTS]; // Destination array pointers
long long startCycle; // Start timestamp for in-kernel timing (GPU-GFX executor)
long long stopCycle; // Stop timestamp for in-kernel timing (GPU-GFX executor)
};
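// ---------------------------------------------------------------------------
// Illustrative sketch (not part of TransferBench): how a host-side caller
// might fill one SubExecParam describing a single-source, single-destination
// copy of 'count' floats starting at element 'begin'.  The helper name and
// arguments are hypothetical; the real per-SubExecutor partitioning is done
// elsewhere in TransferBench.
// ---------------------------------------------------------------------------
static inline SubExecParam MakeSingleCopyParamSketch(float* src, float* dst,
                                                     size_t begin, size_t count)
{
  SubExecParam p;
  p.N          = count;        // Number of floats this subExecutor handles
  p.numSrcs    = 1;            // Single input array
  p.numDsts    = 1;            // Single output array
  p.src[0]     = src + begin;  // Subarray this subExecutor reads from
  p.dst[0]     = dst + begin;  // Subarray this subExecutor writes to
  p.startCycle = 0;            // Timestamps are filled in by the GPU-GFX kernel
  p.stopCycle  = 0;
  return p;
}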
// Collect the arguments for this threadblock
int Nrem = blockParams[blockIdx.x].N;
float const* src = blockParams[blockIdx.x].src;
float* dst = blockParams[blockIdx.x].dst;
if (threadIdx.x == 0) blockParams[blockIdx.x].startCycle = __builtin_amdgcn_s_memrealtime();
void CpuReduceKernel(SubExecParam const& p)
{
int const& numSrcs = p.numSrcs;
int const& numDsts = p.numDsts;
if (numSrcs == 0)
{
for (int i = 0; i < numDsts; ++i)
memset((float* __restrict__)p.dst[i], MEMSET_CHAR, p.N * sizeof(float));
}
else if (numSrcs == 1)
{
float const* __restrict__ src = p.src[0];
for (int i = 0; i < numDsts; ++i)
{
memcpy((float* __restrict__)p.dst[i], src, p.N * sizeof(float));
}
}
else
{
for (int j = 0; j < p.N; j++)
{
float sum = p.src[0][j];
for (int i = 1; i < numSrcs; i++) sum += p.src[i][j];
for (int i = 0; i < numDsts; i++) p.dst[i][j] = sum;
}
}
}
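// ---------------------------------------------------------------------------
// Illustrative sketch (not TransferBench's actual dispatch code): a CPU
// executor is assumed to run CpuReduceKernel once per SubExecutor, one CPU
// thread per SubExecParam.  Assumes <thread> and <vector> are available.
// ---------------------------------------------------------------------------
static inline void RunCpuSubExecutorsSketch(std::vector<SubExecParam> const& params)
{
  std::vector<std::thread> threads;
  threads.reserve(params.size());
  for (size_t i = 0; i < params.size(); ++i)
    threads.emplace_back([&params, i] { CpuReduceKernel(params[i]); }); // one thread per SubExecutor
  for (std::thread& t : threads)
    t.join();                                                           // wait for all SubExecutors
}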
// Helper function for memset
template <typename T> __device__ __forceinline__ T MemsetVal();
template <> __device__ __forceinline__ float MemsetVal(){ return MEMSET_VAL; };
template <> __device__ __forceinline__ float4 MemsetVal(){ return make_float4(MEMSET_VAL, MEMSET_VAL, MEMSET_VAL, MEMSET_VAL); }
// GPU reduce kernel 0: 3 loops: unrolled float4s, single float4s, single floats
template <int LOOP1_UNROLL>
__global__ void __launch_bounds__(BLOCKSIZE)
GpuReduceKernel(SubExecParam* params)
{
int64_t startCycle = __builtin_amdgcn_s_memrealtime();
// Operate on wavefront granularity
int numWaves = BLOCKSIZE / WARP_SIZE; // Number of wavefronts per threadblock
int waveId = threadIdx.x / WARP_SIZE; // Wavefront number
int threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront
SubExecParam& p = params[blockIdx.x];
int const numSrcs = p.numSrcs;
int const numDsts = p.numDsts;
int const numWaves = BLOCKSIZE / WARP_SIZE; // Number of wavefronts per threadblock
int const waveId = threadIdx.x / WARP_SIZE; // Wavefront number
int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront
#define LOOP1_UNROLL 8
// 1st loop - each wavefront operates on LOOP1_UNROLL x FLOATS_PER_PACK per thread per iteration
// Determine the number of packed floats processed by the first loop
int const loop1Npack = (Nrem / (FLOATS_PER_PACK * LOOP1_UNROLL * WARP_SIZE)) * (LOOP1_UNROLL * WARP_SIZE);
int const loop1Nelem = loop1Npack * FLOATS_PER_PACK;
int const loop1Inc = BLOCKSIZE * LOOP1_UNROLL;
int loop1Offset = waveId * LOOP1_UNROLL * WARP_SIZE + threadId;
size_t Nrem = p.N;
size_t const loop1Npack = (Nrem / (FLOATS_PER_PACK * LOOP1_UNROLL * WARP_SIZE)) * (LOOP1_UNROLL * WARP_SIZE);
size_t const loop1Nelem = loop1Npack * FLOATS_PER_PACK;
size_t const loop1Inc = BLOCKSIZE * LOOP1_UNROLL;
size_t loop1Offset = waveId * LOOP1_UNROLL * WARP_SIZE + threadId;
PackedFloat_t const* packedSrc = (PackedFloat_t const*)(src) + loop1Offset;
PackedFloat_t* packedDst = (PackedFloat_t *)(dst) + loop1Offset;
while (loop1Offset < loop1Npack)
{
PackedFloat_t vals[LOOP1_UNROLL];
#pragma unroll
for (int u = 0; u < LOOP1_UNROLL; ++u)
vals[u] = *(packedSrc + u * WARP_SIZE);
PackedFloat_t vals[LOOP1_UNROLL] = {};
if (numSrcs == 0)
{
#pragma unroll
for (int u = 0; u < LOOP1_UNROLL; ++u) vals[u] = MemsetVal<float4>();
}
else
{
for (int i = 0; i < numSrcs; ++i)
{
PackedFloat_t const* __restrict__ packedSrc = (PackedFloat_t const*)(p.src[i]) + loop1Offset;
#pragma unroll
for (int u = 0; u < LOOP1_UNROLL; ++u)
*(packedDst + u * WARP_SIZE) = vals[u];
vals[u] += *(packedSrc + u * WARP_SIZE);
}
}
packedSrc += loop1Inc;
packedDst += loop1Inc;
for (int i = 0; i < numDsts; ++i)
{
PackedFloat_t* __restrict__ packedDst = (PackedFloat_t*)(p.dst[i]) + loop1Offset;
#pragma unroll
for (int u = 0; u < LOOP1_UNROLL; ++u) *(packedDst + u * WARP_SIZE) = vals[u];
}
loop1Offset += loop1Inc;
}
Nrem -= loop1Nelem;
if (Nrem > 0)
{
// 2nd loop - Each thread operates on FLOATS_PER_PACK per iteration
int const loop2Npack = Nrem / FLOATS_PER_PACK;
int const loop2Nelem = loop2Npack * FLOATS_PER_PACK;
int const loop2Inc = BLOCKSIZE;
int loop2Offset = threadIdx.x;
// NOTE: int32_t suffices here because the remaining element count is small
int32_t const loop2Npack = Nrem / FLOATS_PER_PACK;
int32_t const loop2Nelem = loop2Npack * FLOATS_PER_PACK;
int32_t const loop2Inc = BLOCKSIZE;
int32_t loop2Offset = threadIdx.x;
packedSrc = (PackedFloat_t const*)(src + loop1Nelem);
packedDst = (PackedFloat_t *)(dst + loop1Nelem);
while (loop2Offset < loop2Npack)
{
packedDst[loop2Offset] = packedSrc[loop2Offset];
PackedFloat_t val;
if (numSrcs == 0)
{
val = MemsetVal<float4>();
}
else
{
val = {};
for (int i = 0; i < numSrcs; ++i)
{
PackedFloat_t const* __restrict__ packedSrc = (PackedFloat_t const*)(p.src[i] + loop1Nelem) + loop2Offset;
val += *packedSrc;
}
}
for (int i = 0; i < numDsts; ++i)
{
PackedFloat_t* __restrict__ packedDst = (PackedFloat_t*)(p.dst[i] + loop1Nelem) + loop2Offset;
*packedDst = val;
}
loop2Offset += loop2Inc;
}
Nrem -= loop2Nelem;
......@@ -90,40 +169,221 @@ GpuCopyKernel(BlockParam* blockParams)
if (threadIdx.x < Nrem)
{
int offset = loop1Nelem + loop2Nelem + threadIdx.x;
dst[offset] = src[offset];
float val = 0;
if (numSrcs == 0)
{
val = MEMSET_VAL;
}
else
{
for (int i = 0; i < numSrcs; ++i)
val += ((float const* __restrict__)p.src[i])[offset];
}
for (int i = 0; i < numDsts; ++i)
((float* __restrict__)p.dst[i])[offset] = val;
}
}
__threadfence_system();
__syncthreads();
if (threadIdx.x == 0)
blockParams[blockIdx.x].stopCycle = __builtin_amdgcn_s_memrealtime();
{
p.startCycle = startCycle;
p.stopCycle = __builtin_amdgcn_s_memrealtime();
}
}
#define MEMSET_UNROLL 8
__global__ void __launch_bounds__(BLOCKSIZE)
GpuMemsetKernel(BlockParam* blockParams)
template <typename FLOAT_TYPE, int UNROLL_FACTOR>
__device__ size_t GpuReduceFuncImpl2(SubExecParam const &p, size_t const offset, size_t const N)
{
// Collect the arguments for this block
int N = blockParams[blockIdx.x].N;
float* __restrict__ dst = (float*)blockParams[blockIdx.x].dst;
int constexpr numFloatsPerPack = sizeof(FLOAT_TYPE) / sizeof(float); // Number of floats handled at a time per thread
int constexpr numWaves = BLOCKSIZE / WARP_SIZE; // Number of wavefronts per threadblock
size_t constexpr loopPackInc = BLOCKSIZE * UNROLL_FACTOR;
size_t constexpr numPacksPerWave = WARP_SIZE * UNROLL_FACTOR;
int const waveId = threadIdx.x / WARP_SIZE; // Wavefront number
int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront
int const numSrcs = p.numSrcs;
int const numDsts = p.numDsts;
size_t const numPacksDone = (numFloatsPerPack == 1 && UNROLL_FACTOR == 1) ? N : (N / (FLOATS_PER_PACK * numPacksPerWave)) * numPacksPerWave;
size_t const numFloatsLeft = N - numPacksDone * numFloatsPerPack;
size_t loopPackOffset = waveId * numPacksPerWave + threadId;
while (loopPackOffset < numPacksDone)
{
FLOAT_TYPE vals[UNROLL_FACTOR];
if (numSrcs == 0)
{
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u) vals[u] = MemsetVal<FLOAT_TYPE>();
}
else
{
FLOAT_TYPE const* __restrict__ src0Ptr = ((FLOAT_TYPE const*)(p.src[0] + offset)) + loopPackOffset;
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
vals[u] = *(src0Ptr + u * WARP_SIZE);
// Use non-zero value
#pragma unroll MEMSET_UNROLL
for (int tid = threadIdx.x; tid < N; tid += BLOCKSIZE)
for (int i = 1; i < numSrcs; ++i)
{
dst[tid] = 1234.0;
FLOAT_TYPE const* __restrict__ srcPtr = ((FLOAT_TYPE const*)(p.src[i] + offset)) + loopPackOffset;
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
vals[u] += *(srcPtr + u * WARP_SIZE);
}
}
for (int i = 0; i < numDsts; ++i)
{
FLOAT_TYPE* __restrict__ dstPtr = (FLOAT_TYPE*)(p.dst[i] + offset) + loopPackOffset;
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
*(dstPtr + u * WARP_SIZE) = vals[u];
}
loopPackOffset += loopPackInc;
}
return numFloatsLeft;
}
// CPU copy kernel
void CpuCopyKernel(BlockParam const& blockParams)
template <typename FLOAT_TYPE, int UNROLL_FACTOR>
__device__ size_t GpuReduceFuncImpl(SubExecParam const &p, size_t const offset, size_t const N)
{
memcpy(blockParams.dst, blockParams.src, blockParams.N * sizeof(float));
// Each thread in the block works on UNROLL_FACTOR FLOAT_TYPEs during each iteration of the loop
int constexpr numFloatsPerRead = sizeof(FLOAT_TYPE) / sizeof(float);
size_t constexpr numFloatsPerInnerLoop = BLOCKSIZE * numFloatsPerRead;
size_t constexpr numFloatsPerOuterLoop = numFloatsPerInnerLoop * UNROLL_FACTOR;
size_t const numFloatsLeft = (numFloatsPerRead == 1 && UNROLL_FACTOR == 1) ? 0 : N % numFloatsPerOuterLoop;
size_t const numFloatsDone = N - numFloatsLeft;
int const numSrcs = p.numSrcs;
int const numDsts = p.numDsts;
for (size_t idx = threadIdx.x * numFloatsPerRead; idx < numFloatsDone; idx += numFloatsPerOuterLoop)
{
FLOAT_TYPE tmp[UNROLL_FACTOR];
if (numSrcs == 0)
{
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
tmp[u] = MemsetVal<FLOAT_TYPE>();
}
else
{
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
tmp[u] = *((FLOAT_TYPE*)(&p.src[0][offset + idx + u * numFloatsPerInnerLoop]));
for (int i = 1; i < numSrcs; ++i)
{
#pragma unroll UNROLL_FACTOR
for (int u = 0; u < UNROLL_FACTOR; ++u)
tmp[u] += *((FLOAT_TYPE*)(&p.src[i][offset + idx + u * numFloatsPerInnerLoop]));
}
}
for (int i = 0; i < numDsts; ++i)
{
for (int u = 0; u < UNROLL_FACTOR; ++u)
{
*((FLOAT_TYPE*)(&p.dst[i][offset + idx + u * numFloatsPerInnerLoop])) = tmp[u];
}
}
}
return numFloatsLeft;
}
template <typename FLOAT_TYPE>
__device__ size_t GpuReduceFunc(SubExecParam const &p, size_t const offset, size_t const N, int const unroll)
{
switch (unroll)
{
case 1: return GpuReduceFuncImpl<FLOAT_TYPE, 1>(p, offset, N);
case 2: return GpuReduceFuncImpl<FLOAT_TYPE, 2>(p, offset, N);
case 3: return GpuReduceFuncImpl<FLOAT_TYPE, 3>(p, offset, N);
case 4: return GpuReduceFuncImpl<FLOAT_TYPE, 4>(p, offset, N);
case 5: return GpuReduceFuncImpl<FLOAT_TYPE, 5>(p, offset, N);
case 6: return GpuReduceFuncImpl<FLOAT_TYPE, 6>(p, offset, N);
case 7: return GpuReduceFuncImpl<FLOAT_TYPE, 7>(p, offset, N);
case 8: return GpuReduceFuncImpl<FLOAT_TYPE, 8>(p, offset, N);
case 9: return GpuReduceFuncImpl<FLOAT_TYPE, 9>(p, offset, N);
case 10: return GpuReduceFuncImpl<FLOAT_TYPE, 10>(p, offset, N);
case 11: return GpuReduceFuncImpl<FLOAT_TYPE, 11>(p, offset, N);
case 12: return GpuReduceFuncImpl<FLOAT_TYPE, 12>(p, offset, N);
case 13: return GpuReduceFuncImpl<FLOAT_TYPE, 13>(p, offset, N);
case 14: return GpuReduceFuncImpl<FLOAT_TYPE, 14>(p, offset, N);
case 15: return GpuReduceFuncImpl<FLOAT_TYPE, 15>(p, offset, N);
case 16: return GpuReduceFuncImpl<FLOAT_TYPE, 16>(p, offset, N);
default: return GpuReduceFuncImpl<FLOAT_TYPE, 1>(p, offset, N);
}
}
// CPU memset kernel
void CpuMemsetKernel(BlockParam const& blockParams)
// GPU copy kernel
__global__ void __launch_bounds__(BLOCKSIZE)
GpuReduceKernel2(SubExecParam* params)
{
for (int i = 0; i < blockParams.N; i++)
blockParams.dst[i] = 1234.0;
int64_t startCycle = __builtin_amdgcn_s_memrealtime();
SubExecParam& p = params[blockIdx.x];
size_t numFloatsLeft = GpuReduceFunc<float4>(p, 0, p.N, 8);
if (numFloatsLeft)
numFloatsLeft = GpuReduceFunc<float4>(p, p.N - numFloatsLeft, numFloatsLeft, 1);
if (numFloatsLeft)
GpuReduceFunc<float>(p, p.N - numFloatsLeft, numFloatsLeft, 1);
__threadfence_system();
if (threadIdx.x == 0)
{
p.startCycle = startCycle;
p.stopCycle = __builtin_amdgcn_s_memrealtime();
}
}
#define NUM_GPU_KERNELS 18
typedef void (*GpuKernelFuncPtr)(SubExecParam*);
GpuKernelFuncPtr GpuKernelTable[NUM_GPU_KERNELS] =
{
GpuReduceKernel<8>,
GpuReduceKernel<1>,
GpuReduceKernel<2>,
GpuReduceKernel<3>,
GpuReduceKernel<4>,
GpuReduceKernel<5>,
GpuReduceKernel<6>,
GpuReduceKernel<7>,
GpuReduceKernel<8>,
GpuReduceKernel<9>,
GpuReduceKernel<10>,
GpuReduceKernel<11>,
GpuReduceKernel<12>,
GpuReduceKernel<13>,
GpuReduceKernel<14>,
GpuReduceKernel<15>,
GpuReduceKernel<16>,
GpuReduceKernel2
};
std::string GpuKernelNames[NUM_GPU_KERNELS] =
{
"Default - 8xUnroll",
"Unroll x1",
"Unroll x2",
"Unroll x3",
"Unroll x4",
"Unroll x5",
"Unroll x6",
"Unroll x7",
"Unroll x8",
"Unroll x9",
"Unroll x10",
"Unroll x11",
"Unroll x12",
"Unroll x13",
"Unroll x14",
"Unroll x15",
"Unroll x16",
"8xUnrollB",
};
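// ---------------------------------------------------------------------------
// Illustrative sketch (not TransferBench's actual launch path): the GPU_KERNEL
// environment variable is expected to select an index into GpuKernelTable.
// One possible way to launch the selected GPU-GFX kernel, with one threadblock
// per SubExecutor, is shown below.  'gpuKernelIdx', 'numSubExecs', 'paramsGpu'
// and 'stream' are assumed to be supplied by the caller.
// ---------------------------------------------------------------------------
static inline void LaunchSelectedGpuKernelSketch(int gpuKernelIdx, int numSubExecs,
                                                 SubExecParam* paramsGpu,
                                                 hipStream_t stream)
{
  // Fall back to the default kernel (index 0) for out-of-range selections
  if (gpuKernelIdx < 0 || gpuKernelIdx >= NUM_GPU_KERNELS) gpuKernelIdx = 0;

  // One threadblock of BLOCKSIZE threads per SubExecutor (CU)
  hipLaunchKernelGGL(GpuKernelTable[gpuKernelIdx],
                     dim3(numSubExecs), dim3(BLOCKSIZE), 0, stream,
                     paramsGpu);
}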
Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......
# Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
ROCM_PATH ?= /opt/rocm
HIPCC=$(ROCM_PATH)/bin/hipcc
EXE=TransferBench
CXXFLAGS = -O3 -I. -lnuma -L$(ROCM_PATH)/hsa/lib -lhsa-runtime64
CXXFLAGS = -O3 -I. -lnuma -L$(ROCM_PATH)/hsa/lib -lhsa-runtime64 -ferror-limit=5
all: $(EXE)
......
/*
Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......@@ -35,20 +35,20 @@ THE SOFTWARE.
#include <hip/hip_ext.h>
#include <hsa/hsa_ext_amd.h>
#include "EnvVars.hpp"
// Helper macro for catching HIP errors
#define HIP_CALL(cmd) \
do { \
hipError_t error = (cmd); \
if (error != hipSuccess) \
{ \
std::cerr << "Encountered HIP error (" << hipGetErrorString(error) << ") at line " \
<< __LINE__ << " in file " << __FILE__ << "\n"; \
std::cerr << "Encountered HIP error (" << hipGetErrorString(error) \
<< ") at line " << __LINE__ << " in file " << __FILE__ << "\n"; \
exit(-1); \
} \
} while (0)
#include "EnvVars.hpp"
// Simple configuration parameters
size_t const DEFAULT_BYTES_PER_TRANSFER = (1<<26); // Amount of data transferred per Transfer
......@@ -59,92 +59,92 @@ typedef enum
MEM_GPU = 1, // Coarse-grained global GPU memory
MEM_CPU_FINE = 2, // Fine-grained pinned CPU memory
MEM_GPU_FINE = 3, // Fine-grained global GPU memory
MEM_CPU_UNPINNED = 4 // Unpinned CPU memory
MEM_CPU_UNPINNED = 4, // Unpinned CPU memory
MEM_NULL = 5, // NULL memory - used for empty
} MemType;
bool IsGpuType(MemType m)
{
return (m == MEM_GPU || m == MEM_GPU_FINE);
}
bool IsCpuType(MemType m)
typedef enum
{
return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED);
}
EXE_CPU = 0, // CPU executor (subExecutor = CPU thread)
EXE_GPU_GFX = 1, // GPU kernel-based executor (subExecutor = threadblock/CU)
EXE_GPU_DMA = 2, // GPU SDMA-based executor (subExecutor = streams)
} ExeType;
bool IsGpuType(MemType m) { return (m == MEM_GPU || m == MEM_GPU_FINE); }
bool IsCpuType(MemType m) { return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED); };
bool IsGpuType(ExeType e) { return (e == EXE_GPU_GFX || e == EXE_GPU_DMA); };
bool IsCpuType(ExeType e) { return (e == EXE_CPU); };
char const MemTypeStr[6] = "CGBFU";
char const MemTypeStr[7] = "CGBFUN";
char const ExeTypeStr[4] = "CGD";
char const ExeTypeName[3][4] = {"CPU", "GPU", "DMA"};
MemType inline CharToMemType(char const c)
{
switch (c)
{
case 'C': return MEM_CPU;
case 'G': return MEM_GPU;
case 'B': return MEM_CPU_FINE;
case 'F': return MEM_GPU_FINE;
case 'U': return MEM_CPU_UNPINNED;
default:
printf("[ERROR] Unexpected mem type (%c)\n", c);
char const* val = strchr(MemTypeStr, toupper(c));
if (val) return (MemType)(val - MemTypeStr);
printf("[ERROR] Unexpected memory type (%c)\n", c);
exit(1);
}
}
typedef enum
{
MODE_FILL = 0, // Fill data with pattern
MODE_CHECK = 1 // Check data against pattern
} ModeType;
// Each threadblock copies N floats from src to dst
struct BlockParam
ExeType inline CharToExeType(char const c)
{
int N;
float* src;
float* dst;
long long startCycle;
long long stopCycle;
};
char const* val = strchr(ExeTypeStr, toupper(c));
if (val) return (ExeType)(val - ExeTypeStr);
printf("[ERROR] Unexpected executor type (%c)\n", c);
exit(1);
}
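// ---------------------------------------------------------------------------
// Illustrative sketch (not the actual ParseMemType implementation): a memory
// token such as "G0" or "C1G0" is a sequence of (type character, device index)
// pairs, so it can be decoded by pairing CharToMemType with the digits that
// follow each character.  Assumes <cctype>, <string> and <vector> are available.
// ---------------------------------------------------------------------------
inline void ParseMemTokenSketch(std::string const& token,
                                std::vector<MemType>& memTypes,
                                std::vector<int>& memIndices)
{
  size_t i = 0;
  while (i < token.size())
  {
    MemType const type = CharToMemType(token[i++]);   // e.g. 'G' -> MEM_GPU
    int index = 0;
    while (i < token.size() && isdigit(static_cast<unsigned char>(token[i])))
      index = index * 10 + (token[i++] - '0');        // e.g. "12" -> device 12
    memTypes.push_back(type);
    memIndices.push_back(index);
  }
}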
// Each Transfer is a uni-direction operation from a src memory to dst memory
// Each Transfer performs reads from source memory location(s), sums them (if multiple sources are specified)
// then writes the summation to each of the specified destination memory location(s)
struct Transfer
{
int transferIndex; // Transfer identifier
// Transfer config
MemType exeMemType; // Transfer executor type (CPU or GPU)
int transferIndex; // Transfer identifier (within a Test)
ExeType exeType; // Transfer executor type
int exeIndex; // Executor index (NUMA node for CPU / device ID for GPU)
MemType srcMemType; // Source memory type
int srcIndex; // Source device index
MemType dstMemType; // Destination memory type
int dstIndex; // Destination device index
int numBlocksToUse; // Number of threadblocks to use for this Transfer
size_t numBytes; // Number of bytes to Transfer
size_t numBytesToCopy; // Number of bytes to copy
// Memory
float* srcMem; // Source memory
float* dstMem; // Destination memory
// How memory is split across threadblocks / CPU cores
std::vector<BlockParam> blockParam;
BlockParam* blockParamGpuPtr;
int numSubExecs; // Number of subExecutors to use for this Transfer
size_t numBytes; // # of bytes requested to Transfer (may be 0 to fallback to default)
size_t numBytesActual; // Actual number of bytes to copy
double transferTime; // Time taken in milliseconds
// Results
double transferTime;
int numSrcs; // Number of sources
std::vector<MemType> srcType; // Source memory types
std::vector<int> srcIndex; // Source device indices
std::vector<float*> srcMem; // Source memory
// Prepares src memory and how to divide N elements across threadblocks/threads
void PrepareBlockParams(EnvVars const& ev, size_t const N);
};
int numDsts; // Number of destinations
std::vector<MemType> dstType; // Destination memory types
std::vector<int> dstIndex; // Destination device indices
std::vector<float*> dstMem; // Destination memory
std::vector<SubExecParam> subExecParam; // Defines subarrays assigned to each threadblock
SubExecParam* subExecParamGpuPtr; // Pointer to GPU copy of subExecParam
typedef std::pair<MemType, int> Executor;
// Prepares src/dst subarray pointers for each SubExecutor
void PrepareSubExecParams(EnvVars const& ev);
// Prepare source arrays with input data
void PrepareSrc(EnvVars const& ev);
// Validate that destination data contains expected results
void ValidateDst(EnvVars const& ev);
// Prepare reference buffers
void PrepareReference(EnvVars const& ev, std::vector<float>& buffer, int bufferIdx);
// String representation functions
std::string SrcToStr() const;
std::string DstToStr() const;
};
struct ExecutorInfo
{
std::vector<Transfer*> transfers; // Transfers to execute
size_t totalBytes; // Total bytes this executor transfers
int totalSubExecs; // Total number of subExecutors to use
// For GPU-Executors
int totalBlocks; // Total number of CUs/CPU threads to use
BlockParam* blockParamGpu; // Copy of block parameters in GPU device memory
SubExecParam* subExecParamGpu; // GPU copy of subExecutor parameters
std::vector<hipStream_t> streams;
std::vector<hipEvent_t> startEvents;
std::vector<hipEvent_t> stopEvents;
......@@ -153,6 +153,7 @@ struct ExecutorInfo
double totalTime;
};
typedef std::pair<ExeType, int> Executor;
typedef std::map<Executor, ExecutorInfo> TransferMap;
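// ---------------------------------------------------------------------------
// Illustrative sketch (not TransferBench's actual setup code): Transfers that
// share the same (executor type, executor index) pair are assumed to be
// grouped into a single ExecutorInfo so they can be launched together by that
// executor.  std::map::operator[] value-initializes a new ExecutorInfo, so the
// aggregate counters start at zero.
// ---------------------------------------------------------------------------
inline void GroupTransfersSketch(std::vector<Transfer>& transfers, TransferMap& transferMap)
{
  for (Transfer& t : transfers)
  {
    Executor const exe(t.exeType, t.exeIndex);   // e.g. (EXE_GPU_GFX, 0)
    ExecutorInfo& info = transferMap[exe];       // created on first use
    info.transfers.push_back(&t);
    info.totalBytes    += t.numBytesActual;      // aggregate bytes for this executor
    info.totalSubExecs += t.numSubExecs;         // aggregate SubExecutors for this executor
  }
}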
// Display usage instructions
......@@ -166,7 +167,9 @@ void PopulateTestSizes(size_t const numBytesPerTransfer, int const samplingFacto
std::vector<size_t>& valuesofN);
void ParseMemType(std::string const& token, int const numCpus, int const numGpus,
MemType* memType, int* memIndex);
std::vector<MemType>& memType, std::vector<int>& memIndex);
void ParseExeType(std::string const& token, int const numCpus, int const numGpus,
ExeType& exeType, int& exeIndex);
void ParseTransfers(char* line, int numCpus, int numGpus,
std::vector<Transfer>& transfers);
......@@ -178,26 +181,19 @@ void EnablePeerAccess(int const deviceId, int const peerDeviceId);
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr);
void DeallocateMemory(MemType memType, void* memPtr, size_t const size = 0);
void CheckPages(char* byteArray, size_t numBytes, int targetId);
void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr);
void RunTransfer(EnvVars const& ev, int const iteration, ExecutorInfo& exeInfo, int const transferIdx);
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu);
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numBlocksToUse, bool const isRandom);
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N);
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExec, int const numCpuSubExec, bool const isRandom);
// Return the maximum bandwidth measured for given (src/dst) pair
double GetPeakBandwidth(EnvVars const& ev,
size_t const N,
double GetPeakBandwidth(EnvVars const& ev, size_t const N,
int const isBidirectional,
int const readMode,
int const numBlocksToUse,
MemType const srcMemType,
int const srcIndex,
MemType const dstMemType,
int const dstIndex);
MemType const srcType, int const srcIndex,
MemType const dstType, int const dstIndex);
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
std::string GetDesc(MemType srcMemType, int srcIndex,
MemType dstMemType, int dstIndex);
std::string GetTransferDesc(Transfer const& transfer);
int RemappedIndex(int const origIdx, MemType const memType);
int RemappedIndex(int const origIdx, bool const isCpuType);
int GetWallClockRate(int deviceId);
void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers);
std::string PtrVectorToStr(std::vector<float*> const& strVector, int const initOffset);
# ConfigFile Format:
# ==================
# A Transfer is defined as a uni-directional copy from src memory location to dst memory location
# executed by either CPU or GPU
# A Transfer is defined as a single operation where an Executor reads and adds together
# values from Source (SRC) memory locations, then writes the sum to destination (DST) memory locations.
# This reduces to a simple copy operation when dealing with a single SRC/DST.
#
# SRC 0 DST 0
# SRC 1 -> Executor -> DST 1
# SRC X DST Y
# Three Executors are supported by TransferBench
# Executor: SubExecutor:
# 1) CPU CPU thread
# 2) GPU GPU threadblock/Compute Unit (CU)
# 3) DMA N/A (may only be used for copies with a single SRC/DST)
# Each line in the configuration file defines a set of Transfers (a Test) to run in parallel
# There are two ways to specify a Test:
# 1) Basic
# The basic specification assumes the same number of threadblocks/CUs used per GPU-executed Transfer
# The basic specification assumes the same number of SubExecutors (SE) used per Transfer
# A positive number of Transfers is specified followed by that number of triplets describing each Transfer
# #Transfers #CUs (srcMem1->Executor1->dstMem1) ... (srcMemL->ExecutorL->dstMemL)
# #Transfers #SEs (srcMem1->Executor1->dstMem1) ... (srcMemL->ExecutorL->dstMemL)
# 2) Advanced
# A negative number of Transfers is specified, followed by quintuplets describing each Transfer
# A non-zero byte count overrides the command-line specified size for that Transfer
# -#Transfers (srcMem1->Executor1->dstMem1 #CUs1 Bytes1) ... (srcMemL->ExecutorL->dstMemL #CUsL BytesL)
# -#Transfers (srcMem1->Executor1->dstMem1 #SEs1 Bytes1) ... (srcMemL->ExecutorL->dstMemL #SEsL BytesL)
# Argument Details:
# #Transfers: Number of Transfers to be run in parallel
# #CUs : Number of threadblocks/CUs to use for a GPU-executed Transfer
# srcMemL : Source memory location (Where the data is to be read from). Ignored in memset mode
# #SEs : Number of SubExecutors to use (CPU threads / GPU threadblocks)
# srcMemL : Source memory locations (Where the data is to be read from)
# Executor : Executor is specified by a character indicating type, followed by device index (0-indexed)
# - C: CPU-executed (Indexed from 0 to # NUMA nodes - 1)
# - G: GPU-executed (Indexed from 0 to # GPUs - 1)
# dstMemL : Destination memory location (Where the data is to be written to)
# - D: DMA-executor (Indexed from 0 to # GPUs - 1)
# dstMemL : Destination memory locations (Where the data is to be written to)
# bytesL : Number of bytes to copy (0 means use command-line specified size)
# Must be a multiple of 4 and may be suffixed with ('K','M', or 'G')
#
# Memory locations are specified by a character indicating memory type,
# followed by device index (0-indexed)
# Memory locations are specified by one or more (memory type character + device index) pairs
# Each pair is a character indicating the memory type followed by a device index (0-indexed)
# Supported memory locations are:
# - C: Pinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])
# - U: Unpinned host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])
# - B: Fine-grain host memory (on NUMA node, indexed from 0 to [# NUMA nodes-1])
# - G: Global device memory (on GPU device indexed from 0 to [# GPUs - 1])
# - F: Fine-grain device memory (on GPU device indexed from 0 to [# GPUs - 1])
# - N: Null memory (index ignored)
# Examples:
# 1 4 (G0->G0->G1) Uses 4 CUs on GPU0 to copy from GPU0 to GPU1
# 1 4 (C1->G2->G0) Uses 4 CUs on GPU2 to copy from CPU1 to GPU0
# 2 4 G0->G0->G1 G1->G1->G0 Copies from GPU0 to GPU1, and GPU1 to GPU0, each with 4 CUs
# -2 (G0 G0 G1 4 1M) (G1 G1 G0 2 2M) Copies 1MB from GPU0 to GPU1 with 4 CUs, and 2MB from GPU1 to GPU0 with 2 CUs
# 2 4 G0->G0->G1 G1->G1->G0 Copies from GPU0 to GPU1, and GPU1 to GPU0, each with 4 SEs
# -2 (G0 G0 G1 4 1M) (G1 G1 G0 2 2M) Copies 1MB from GPU0 to GPU1 with 4 SEs, and 2MB from GPU1 to GPU0 with 2 SEs
# Round brackets and arrows '->' may be included for clarity, but they are ignored and unnecessary
# Lines starting with # will be ignored. Lines starting with ## will be echoed to output
# Single GPU-executed Transfer between GPUs 0 and 1 using 4 CUs
## Single GPU-executed Transfer between GPUs 0 and 1 using 4 CUs
1 4 (G0->G0->G1)
# Copies 1MB from GPU0 to GPU1 with 4 CUs, and 2MB from GPU1 to GPU0 with 8 CUs
## Single DMA executed Transfer between GPUs 0 and 1
1 1 (G0->D0->G1)
## Copy 1MB from GPU0 to GPU1 with 4 CUs, and 2MB from GPU1 to GPU0 with 8 CUs
-2 (G0->G0->G1 4 1M) (G1->G1->G0 8 2M)
## "Memset" by GPU 0 to GPU 0 memory
1 32 (N0->G0->G0)
## "Read-only" by CPU 0
1 4 (C0->C0->N0)
## Broadcast from GPU 0 to GPU 0 and GPU 1
1 16 (G0->G0->G0G1)