Unverified Commit 3f8d00df authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

v1.65 Adding warp-subexecutor support via GFX_SE_TYPE (#212)


Co-authored-by: Weile <weile.wei@amd.com>
parent 42b7d85b
...@@ -3,6 +3,12 @@ ...@@ -3,6 +3,12 @@
Documentation for TransferBench is available at Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench). [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.65.00
### Added
- Added warp-level dispatch support via GFX_SE_TYPE environment variable
- GFX_SE_TYPE=0 (default): Threadblock-level dispatch, each subexecutor is a threadblock
- GFX_SE_TYPE=1: Warp-level dispatch, each subexecutor is a single warp
## v1.64.00 ## v1.64.00
### Added ### Added
- Added BLOCKSIZES to a2asweep preset to allow also sweeping over threadblock sizes - Added BLOCKSIZES to a2asweep preset to allow also sweeping over threadblock sizes
......
...@@ -9,7 +9,7 @@ if (NOT CMAKE_TOOLCHAIN_FILE) ...@@ -9,7 +9,7 @@ if (NOT CMAKE_TOOLCHAIN_FILE)
message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}") message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}")
endif() endif()
set(VERSION_STRING "1.64.00") set(VERSION_STRING "1.65.00")
project(TransferBench VERSION ${VERSION_STRING} LANGUAGES CXX) project(TransferBench VERSION ${VERSION_STRING} LANGUAGES CXX)
## Load CMake modules ## Load CMake modules
......
...@@ -89,6 +89,7 @@ public: ...@@ -89,6 +89,7 @@ public:
int gfxBlockSize; // Size of each threadblock (must be multiple of 64) int gfxBlockSize; // Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask; // Bit-vector representing the CU mask vector<uint32_t> cuMask; // Bit-vector representing the CU mask
vector<vector<int>> prefXccTable; // Specifies XCC to use for given exe->dst pair vector<vector<int>> prefXccTable; // Specifies XCC to use for given exe->dst pair
int gfxSeType; // GFX subexecutor type (0=threadblock, 1=warp)
int gfxTemporal; // Non-temporal load/store mode (0=none, 1=load, 2=store, 3=both) int gfxTemporal; // Non-temporal load/store mode (0=none, 1=load, 2=store, 3=both)
int gfxUnroll; // GFX-kernel unroll factor int gfxUnroll; // GFX-kernel unroll factor
int useHipEvents; // Use HIP events for timing GFX/DMA Executor int useHipEvents; // Use HIP events for timing GFX/DMA Executor
...@@ -141,6 +142,7 @@ public: ...@@ -141,6 +142,7 @@ public:
fillCompress = GetEnvVarArray("FILL_COMPRESS" , {}); fillCompress = GetEnvVarArray("FILL_COMPRESS" , {});
gfxBlockOrder = GetEnvVar("GFX_BLOCK_ORDER" , 0); gfxBlockOrder = GetEnvVar("GFX_BLOCK_ORDER" , 0);
gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256); gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256);
gfxSeType = GetEnvVar("GFX_SE_TYPE" , 0);
gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 1); gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 1);
gfxTemporal = GetEnvVar("GFX_TEMPORAL" , 0); gfxTemporal = GetEnvVar("GFX_TEMPORAL" , 0);
gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll); gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll);
...@@ -320,6 +322,7 @@ public: ...@@ -320,6 +322,7 @@ public:
printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n"); printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n");
printf(" GFX_BLOCK_ORDER - How blocks for transfers are ordered. 0=sequential, 1=interleaved\n"); printf(" GFX_BLOCK_ORDER - How blocks for transfers are ordered. 0=sequential, 1=interleaved\n");
printf(" GFX_BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n"); printf(" GFX_BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n");
printf(" GFX_SE_TYPE - SubExecutor granularity type (0=threadblock, 1=warp)\n");
printf(" GFX_TEMPORAL - Use of non-temporal loads or stores (0=none 1=loads 2=stores 3=both)\n"); printf(" GFX_TEMPORAL - Use of non-temporal loads or stores (0=none 1=loads 2=stores 3=both)\n");
printf(" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d\n", TransferBench::GetIntAttribute(ATR_GFX_MAX_UNROLL)); printf(" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d\n", TransferBench::GetIntAttribute(ATR_GFX_MAX_UNROLL));
printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays\n"); printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays\n");
...@@ -411,6 +414,8 @@ public: ...@@ -411,6 +414,8 @@ public:
"Thread block ordering: %s", gfxBlockOrder == 0 ? "Sequential" : "Interleaved"); "Thread block ordering: %s", gfxBlockOrder == 0 ? "Sequential" : "Interleaved");
Print("GFX_BLOCK_SIZE", gfxBlockSize, Print("GFX_BLOCK_SIZE", gfxBlockSize,
"Threadblock size of %d", gfxBlockSize); "Threadblock size of %d", gfxBlockSize);
Print("GFX_SE_TYPE", gfxSeType,
"SubExecutor granularity: %s", gfxSeType == 0 ? "Threadblock" : "Warp");
Print("GFX_SINGLE_TEAM", gfxSingleTeam, Print("GFX_SINGLE_TEAM", gfxSingleTeam,
"%s", (gfxSingleTeam ? "Combining CUs to work across entire data array" : "%s", (gfxSingleTeam ? "Combining CUs to work across entire data array" :
"Each CUs operates on its own disjoint subarray")); "Each CUs operates on its own disjoint subarray"));
...@@ -619,6 +624,7 @@ public: ...@@ -619,6 +624,7 @@ public:
cfg.gfx.blockSize = gfxBlockSize; cfg.gfx.blockSize = gfxBlockSize;
cfg.gfx.cuMask = cuMask; cfg.gfx.cuMask = cuMask;
cfg.gfx.prefXccTable = prefXccTable; cfg.gfx.prefXccTable = prefXccTable;
cfg.gfx.seType = gfxSeType;
cfg.gfx.unrollFactor = gfxUnroll; cfg.gfx.unrollFactor = gfxUnroll;
cfg.gfx.temporalMode = gfxTemporal; cfg.gfx.temporalMode = gfxTemporal;
cfg.gfx.useHipEvents = useHipEvents; cfg.gfx.useHipEvents = useHipEvents;
......
...@@ -66,7 +66,7 @@ namespace TransferBench ...@@ -66,7 +66,7 @@ namespace TransferBench
using std::set; using std::set;
using std::vector; using std::vector;
constexpr char VERSION[] = "1.64"; constexpr char VERSION[] = "1.65";
/** /**
* Enumeration of supported Executor types * Enumeration of supported Executor types
...@@ -180,6 +180,7 @@ namespace TransferBench ...@@ -180,6 +180,7 @@ namespace TransferBench
int blockSize = 256; ///< Size of each threadblock (must be multiple of 64) int blockSize = 256; ///< Size of each threadblock (must be multiple of 64)
vector<uint32_t> cuMask = {}; ///< Bit-vector representing the CU mask vector<uint32_t> cuMask = {}; ///< Bit-vector representing the CU mask
vector<vector<int>> prefXccTable = {}; ///< 2D table with preferred XCD to use for a specific [src][dst] GPU device vector<vector<int>> prefXccTable = {}; ///< 2D table with preferred XCD to use for a specific [src][dst] GPU device
int seType = 0; ///< SubExecutor granularity type (0=threadblock, 1=warp)
int temporalMode = 0; ///< Non-temporal load/store mode 0=none, 1=load, 2=store, 3=both int temporalMode = 0; ///< Non-temporal load/store mode 0=none, 1=load, 2=store, 3=both
int unrollFactor = 4; ///< GFX-kernel unroll factor int unrollFactor = 4; ///< GFX-kernel unroll factor
int useHipEvents = 1; ///< Use HIP events for timing GFX Executor int useHipEvents = 1; ///< Use HIP events for timing GFX Executor
...@@ -459,6 +460,7 @@ namespace TransferBench ...@@ -459,6 +460,7 @@ namespace TransferBench
// Enumerations // Enumerations
#define hipDeviceAttributeClockRate cudaDevAttrClockRate #define hipDeviceAttributeClockRate cudaDevAttrClockRate
#define hipDeviceAttributeMultiprocessorCount cudaDevAttrMultiProcessorCount #define hipDeviceAttributeMultiprocessorCount cudaDevAttrMultiProcessorCount
#define hipDeviceAttributeWarpSize cudaDevAttrWarpSize
#define hipErrorPeerAccessAlreadyEnabled cudaErrorPeerAccessAlreadyEnabled #define hipErrorPeerAccessAlreadyEnabled cudaErrorPeerAccessAlreadyEnabled
#define hipFuncCachePreferShared cudaFuncCachePreferShared #define hipFuncCachePreferShared cudaFuncCachePreferShared
#define hipMemcpyDefault cudaMemcpyDefault #define hipMemcpyDefault cudaMemcpyDefault
...@@ -608,6 +610,38 @@ namespace { ...@@ -608,6 +610,38 @@ namespace {
int constexpr MEMSET_CHAR = 75; // Value to memset (char) int constexpr MEMSET_CHAR = 75; // Value to memset (char)
float constexpr MEMSET_VAL = 13323083.0f; // Value to memset (double) float constexpr MEMSET_VAL = 13323083.0f; // Value to memset (double)
// Returns the wavefront (warp) size of the given GPU device.
//
// Queries the HIP/CUDA runtime for the device's warp size. If the query fails
// (or reports a non-positive size), appends a warning to 'errors' (when
// provided) and falls back to the compile-time default: 32 on NVIDIA,
// 64 on AMD.
//
// errors   - optional list to append a warning to on query failure
// deviceId - device to query (defaults to device 0 for backward compatibility)
int GetWarpSize(std::vector<ErrResult>* errors = nullptr, int deviceId = 0) {
  int warpSize = 0;
  hipError_t err = hipDeviceGetAttribute(&warpSize, hipDeviceAttributeWarpSize, deviceId);
  // Guard warpSize > 0 as well: a zero warp size would cause a division by
  // zero in callers that compute warps-per-block
  if (err == hipSuccess && warpSize > 0) {
    return warpSize;
  }

  // Query failed, report error and fall back to compile-time default
  if (errors) {
    errors->push_back({ERR_WARN,
                       "Failed to query device warp size (hipDeviceGetAttribute error: %d). "
                       "Falling back to compile-time default", err});
  }
#if defined(__NVCC__)
  return 32;
#else
  return 64;
#endif
}
// Calculate grid Y dimension based on SE_TYPE (subexecutor granularity).
//
// seType      - 0: threadblock-level dispatch (one threadblock per subexecutor)
//               1: warp-level dispatch (warps packed into threadblocks)
// blockSize   - # of threads per threadblock
// numSubExecs - total number of subexecutors requested
//
// Returns the number of threadblocks (grid Y dimension) needed to host
// 'numSubExecs' subexecutors.
int CalculateGridY(int seType, int blockSize, int numSubExecs) {
  // Warp-level: each subexecutor is a warp, pack warps into threadblocks
  if (seType == 1) {
    int warpsPerBlock = blockSize / GetWarpSize();
    // Guard against blockSize smaller than the warp size, which would
    // otherwise cause a division by zero below
    if (warpsPerBlock < 1) warpsPerBlock = 1;
    // Round up so a partially-filled final threadblock is still launched
    return (numSubExecs + warpsPerBlock - 1) / warpsPerBlock;
  }
  // Default: Threadblock-level, each subexecutor is a threadblock
  return numSubExecs;
}
// Parsing-related functions // Parsing-related functions
//======================================================================================== //========================================================================================
...@@ -1303,11 +1337,17 @@ namespace { ...@@ -1303,11 +1337,17 @@ namespace {
{ {
// Check total number of subexecutors requested // Check total number of subexecutors requested
int numGpuSubExec = GetNumSubExecutors(exeDevice); int numGpuSubExec = GetNumSubExecutors(exeDevice);
// For warp-level dispatch, multiply by warps per threadblock
if (cfg.gfx.seType == 1) {
int warpsPerBlock = cfg.gfx.blockSize / GetWarpSize(&errors);
numGpuSubExec *= warpsPerBlock;
}
if (totalSubExecs[exeDevice] > numGpuSubExec) if (totalSubExecs[exeDevice] > numGpuSubExec)
errors.push_back({ERR_WARN, errors.push_back({ERR_WARN,
"GPU %d requests %d total CUs however only %d available. " "GPU %d requests %d total %s however only %d available. "
"Serialization will occur", "Serialization will occur",
exeDevice.exeIndex, totalSubExecs[exeDevice], numGpuSubExec}); exeDevice.exeIndex, totalSubExecs[exeDevice],
cfg.gfx.seType == 0 ? "CUs" : "warps", numGpuSubExec});
// Check that if executor subindices are used, all Transfers specify executor subindices // Check that if executor subindices are used, all Transfers specify executor subindices
if (useSubIndexCount[exeDevice] > 0 && useSubIndexCount[exeDevice] != transferCount[exeDevice]) { if (useSubIndexCount[exeDevice] > 0 && useSubIndexCount[exeDevice] != transferCount[exeDevice]) {
errors.push_back({ERR_FATAL, errors.push_back({ERR_FATAL,
...@@ -2977,12 +3017,29 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -2977,12 +3017,29 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Kernel for GFX execution // Kernel for GFX execution
template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE> template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE>
__global__ void __launch_bounds__(BLOCKSIZE) __global__ void __launch_bounds__(BLOCKSIZE)
GpuReduceKernel(SubExecParam* params, int waveOrder, int numSubIterations) GpuReduceKernel(SubExecParam* params, int seType, int waveOrder, int numSubIterations)
{ {
int64_t startCycle; int64_t startCycle;
if (threadIdx.x == 0) startCycle = GetTimestamp(); // For warp-level, each warp's first thread records timing; for threadblock-level, only first thread of block
bool shouldRecordTiming = (seType == 1) ? (threadIdx.x % warpSize == 0) : (threadIdx.x == 0);
if (shouldRecordTiming) startCycle = GetTimestamp();
// seType: 0=threadblock, 1=warp
int subExecIdx;
if (seType == 0) {
// Threadblock-level: each threadblock is a subexecutor
subExecIdx = blockIdx.y;
} else {
// Warp-level: each warp is a subexecutor
int warpIdx = threadIdx.x / warpSize;
int warpsPerBlock = BLOCKSIZE / warpSize;
subExecIdx = blockIdx.y * warpsPerBlock + warpIdx;
}
SubExecParam& p = params[subExecIdx];
SubExecParam& p = params[blockIdx.y]; // For warp-level dispatch, inactive warps should return early
if (seType == 1 && p.N == 0) return;
// Filter by XCC // Filter by XCC
#if !defined(__NVCC__) #if !defined(__NVCC__)
...@@ -3002,8 +3059,16 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3002,8 +3059,16 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Operate on wavefront granularity // Operate on wavefront granularity
int32_t const nTeams = p.teamSize; // Number of threadblocks working together on this subarray int32_t const nTeams = p.teamSize; // Number of threadblocks working together on this subarray
int32_t const teamIdx = p.teamIdx; // Index of this threadblock within the team int32_t const teamIdx = p.teamIdx; // Index of this threadblock within the team
int32_t const nWaves = BLOCKSIZE / warpSize; // Number of wavefronts within this threadblock int32_t nWaves, waveIdx;
int32_t const waveIdx = threadIdx.x / warpSize; // Index of this wavefront within the threadblock if (seType == 0) {
// Threadblock-level: all wavefronts in block work together
nWaves = BLOCKSIZE / warpSize; // Number of wavefronts within this threadblock
waveIdx = threadIdx.x / warpSize; // Index of this wavefront within the threadblock
} else {
// Warp-level: each warp works independently
nWaves = 1;
waveIdx = 0;
}
int32_t const tIdx = threadIdx.x % warpSize; // Thread index within wavefront int32_t const tIdx = threadIdx.x % warpSize; // Thread index within wavefront
size_t const numPackedFloat = p.N / (sizeof(PACKED_FLOAT)/sizeof(float)); size_t const numPackedFloat = p.N / (sizeof(PACKED_FLOAT)/sizeof(float));
...@@ -3106,8 +3171,15 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3106,8 +3171,15 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
} }
// Wait for all threads to finish // Wait for all threads to finish
if (seType == 1) {
// For warp-level, sync within warp only
__syncwarp();
} else {
// For threadblock-level, sync all threads
__syncthreads(); __syncthreads();
if (threadIdx.x == 0) { }
if (shouldRecordTiming) {
__threadfence_system(); __threadfence_system();
p.stopCycle = GetTimestamp(); p.stopCycle = GetTimestamp();
p.startCycle = startCycle; p.startCycle = startCycle;
...@@ -3137,8 +3209,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3137,8 +3209,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 7), \ GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 7), \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 8)} GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 8)}
// Table of all GPU Reduction kernel functions (templated blocksize / unroll / dword size) // Table of all GPU Reduction kernel functions (templated blocksize / unroll / dword size / temporal)
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int); typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int, int);
GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL][3][4] = GpuKernelFuncPtr GpuKernelTable[MAX_WAVEGROUPS][MAX_UNROLL][3][4] =
{ {
GPU_KERNEL_UNROLL_DECL(64), GPU_KERNEL_UNROLL_DECL(64),
...@@ -3161,6 +3233,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3161,6 +3233,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
#undef GPU_KERNEL_UNROLL_DECL #undef GPU_KERNEL_UNROLL_DECL
#undef GPU_KERNEL_DWORD_DECL #undef GPU_KERNEL_DWORD_DECL
#undef GPU_KERNEL_TEMPORAL_DECL #undef GPU_KERNEL_TEMPORAL_DECL
#undef GPU_KERNEL_SE_TYPE_DECL
// Execute a single GPU Transfer (when using 1 stream per Transfer) // Execute a single GPU Transfer (when using 1 stream per Transfer)
static ErrResult ExecuteGpuTransfer(int const iteration, static ErrResult ExecuteGpuTransfer(int const iteration,
...@@ -3174,7 +3247,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3174,7 +3247,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
auto cpuStart = std::chrono::high_resolution_clock::now(); auto cpuStart = std::chrono::high_resolution_clock::now();
int numSubExecs = rss.subExecParamCpu.size(); int numSubExecs = rss.subExecParamCpu.size();
dim3 const gridSize(xccDim, numSubExecs, 1); int gridY = CalculateGridY(cfg.gfx.seType, cfg.gfx.blockSize, numSubExecs);
dim3 const gridSize(xccDim, gridY, 1);
dim3 const blockSize(cfg.gfx.blockSize, 1); dim3 const blockSize(cfg.gfx.blockSize, 1);
int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 : int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 :
...@@ -3185,12 +3259,12 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3185,12 +3259,12 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
#if defined(__NVCC__) #if defined(__NVCC__)
if (startEvent != NULL) if (startEvent != NULL)
ERR_CHECK(hipEventRecord(startEvent, stream)); ERR_CHECK(hipEventRecord(startEvent, stream));
gpuKernel<<<gridSize, blockSize, 0, stream>>>(rss.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations); gpuKernel<<<gridSize, blockSize, 0, stream>>>(rss.subExecParamGpuPtr, cfg.gfx.seType, cfg.gfx.waveOrder, cfg.general.numSubIterations);
if (stopEvent != NULL) if (stopEvent != NULL)
ERR_CHECK(hipEventRecord(stopEvent, stream)); ERR_CHECK(hipEventRecord(stopEvent, stream));
#else #else
hipExtLaunchKernelGGL(gpuKernel, gridSize, blockSize, 0, stream, startEvent, stopEvent, hipExtLaunchKernelGGL(gpuKernel, gridSize, blockSize, 0, stream, startEvent, stopEvent,
0, rss.subExecParamGpuPtr, cfg.gfx.waveOrder, cfg.general.numSubIterations); 0, rss.subExecParamGpuPtr, cfg.gfx.seType, cfg.gfx.waveOrder, cfg.general.numSubIterations);
#endif #endif
ERR_CHECK(hipStreamSynchronize(stream)); ERR_CHECK(hipStreamSynchronize(stream));
...@@ -3249,7 +3323,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3249,7 +3323,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
} else { } else {
// Combine all the Transfers into a single kernel launch // Combine all the Transfers into a single kernel launch
int numSubExecs = exeInfo.totalSubExecs; int numSubExecs = exeInfo.totalSubExecs;
dim3 const gridSize(xccDim, numSubExecs, 1); int gridY = CalculateGridY(cfg.gfx.seType, cfg.gfx.blockSize, numSubExecs);
dim3 const gridSize(xccDim, gridY, 1);
dim3 const blockSize(cfg.gfx.blockSize, 1); dim3 const blockSize(cfg.gfx.blockSize, 1);
hipStream_t stream = exeInfo.streams[0]; hipStream_t stream = exeInfo.streams[0];
...@@ -3261,14 +3336,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3261,14 +3336,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
#if defined(__NVCC__) #if defined(__NVCC__)
if (cfg.gfx.useHipEvents) if (cfg.gfx.useHipEvents)
ERR_CHECK(hipEventRecord(exeInfo.startEvents[0], stream)); ERR_CHECK(hipEventRecord(exeInfo.startEvents[0], stream));
gpuKernel<<<gridSize, blockSize, 0 , stream>>>(exeInfo.subExecParamGpu, cfg.gfx.waveOrder, cfg.general.numSubIterations); gpuKernel<<<gridSize, blockSize, 0 , stream>>>(exeInfo.subExecParamGpu, cfg.gfx.seType, cfg.gfx.waveOrder, cfg.general.numSubIterations);
if (cfg.gfx.useHipEvents) if (cfg.gfx.useHipEvents)
ERR_CHECK(hipEventRecord(exeInfo.stopEvents[0], stream)); ERR_CHECK(hipEventRecord(exeInfo.stopEvents[0], stream));
#else #else
hipExtLaunchKernelGGL(gpuKernel, gridSize, blockSize, 0, stream, hipExtLaunchKernelGGL(gpuKernel, gridSize, blockSize, 0, stream,
cfg.gfx.useHipEvents ? exeInfo.startEvents[0] : NULL, cfg.gfx.useHipEvents ? exeInfo.startEvents[0] : NULL,
cfg.gfx.useHipEvents ? exeInfo.stopEvents[0] : NULL, 0, cfg.gfx.useHipEvents ? exeInfo.stopEvents[0] : NULL, 0,
exeInfo.subExecParamGpu, cfg.gfx.waveOrder, cfg.general.numSubIterations); exeInfo.subExecParamGpu, cfg.gfx.seType, cfg.gfx.waveOrder, cfg.general.numSubIterations);
#endif #endif
ERR_CHECK(hipStreamSynchronize(stream)); ERR_CHECK(hipStreamSynchronize(stream));
} }
...@@ -4026,6 +4101,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -4026,6 +4101,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
#undef hipDeviceAttributeClockRate #undef hipDeviceAttributeClockRate
#undef hipDeviceAttributeMaxSharedMemoryPerMultiprocessor #undef hipDeviceAttributeMaxSharedMemoryPerMultiprocessor
#undef hipDeviceAttributeMultiprocessorCount #undef hipDeviceAttributeMultiprocessorCount
#undef hipDeviceAttributeWarpSize
#undef hipErrorPeerAccessAlreadyEnabled #undef hipErrorPeerAccessAlreadyEnabled
#undef hipFuncCachePreferShared #undef hipFuncCachePreferShared
#undef hipMemcpyDefault #undef hipMemcpyDefault
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment