Unverified Commit 44140eeb authored by Weile, committed by GitHub

Add NUM_SRCS/NUM_DSTS template parameters to GpuReduceKernel (#209)

- Instantiates optimized kernels for common Transfer types:
  - Copy (1 src → 1 dst): Optimized single-source data copy
  - Read-only (1 src → 0 dst): Optimized memory read validation
  - Write-only (0 src → 1 dst): Optimized memory write/initialization
- The compiler eliminates the dead loops in these specialized cases, improving performance by up to 7% for all-to-all workloads on MI3xx machines (see the sketch below)
  - Update CHANGELOG
parent a73bf6de
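
The core idea, as a minimal sketch (illustrative, not TransferBench code; `ReduceSrcs` and its parameters are hypothetical): when the loop bound is a non-negative template constant, the compiler knows it at instantiation time and can unroll or delete the loop outright, while a sentinel of -1 falls back to the runtime argument.

```cpp
// Minimal sketch of the specialization idea (hypothetical helper, not TransferBench code).
// When NUM_SRCS is a non-negative template constant, the loop bound is known at compile
// time: the loop is deleted entirely for NUM_SRCS == 0 and collapses to a single load
// for NUM_SRCS == 1. The sentinel -1 falls back to the runtime argument.
template <int NUM_SRCS>
__device__ float ReduceSrcs(float const* const* srcs, int numSrcsArg, int idx)
{
  int const numSrcs = (NUM_SRCS >= 0) ? NUM_SRCS : numSrcsArg;
  float sum = 0.0f;
  for (int i = 0; i < numSrcs; ++i)
    sum += srcs[i][idx];
  return sum;
}
```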
@@ -8,6 +8,12 @@ Documentation for TransferBench is available at
 - Added warp-level dispatch support via GFX_SE_TYPE environment variable
   - GFX_SE_TYPE=0 (default): Threadblock-level dispatch, each subexecutor is a threadblock
   - GFX_SE_TYPE=1: Warp-level dispatch, each subexecutor is a single warp
+- Added compile-time template specialization for numSrcs/numDsts in GpuReduceKernel
+  - Instantiates optimized kernels for common Transfer types:
+    - Copy (1 src → 1 dst): Optimized single-source data copy
+    - Read-only (1 src → 0 dst): Optimized memory read validation
+    - Write-only (0 src → 1 dst): Optimized memory write/initialization
+  - Compiler eliminates the dead loops in these specialized cases, improving performance by up to 7% for all-to-all workloads on MI3xx machines
 ## v1.64.00
 ### Added
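For context on the two GFX_SE_TYPE modes listed in the changelog above, a hedged sketch of how a kernel could map threads to subexecutors under each mode (`GetSubExecIdx` is a hypothetical helper, not part of TransferBench):

```cpp
// Hypothetical helper (not TransferBench code) illustrating the two dispatch modes:
// GFX_SE_TYPE=0 treats each threadblock as one subexecutor; GFX_SE_TYPE=1 treats each
// warp as one subexecutor, so a block contributes blockDim.x / warpSize subexecutors.
__device__ int GetSubExecIdx(int seType, int warpSize)
{
  if (seType == 1)
    return blockIdx.x * (blockDim.x / warpSize) + threadIdx.x / warpSize;
  return blockIdx.x;
}
```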
@@ -3015,9 +3015,10 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
 }
 
 // Kernel for GFX execution
-template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE>
-__global__ void __launch_bounds__(BLOCKSIZE)
-GpuReduceKernel(SubExecParam* params, int seType, int warpSize, int waveOrder, int numSubIterations)
+// NUM_SRCS/NUM_DSTS: If -1, use runtime numSrcs/numDsts args; otherwise use template values
+template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE,
+          int NUM_SRCS, int NUM_DSTS>
+__device__ void GpuReduceKernelImpl(SubExecParam* params, int seType, int warpSize, int waveOrder, int numSubIterations, int numSrcsArg, int numDstsArg)
 {
   int64_t startCycle;
   // For warp-level, each warp's first thread records timing; for threadblock-level, only first thread of block
@@ -3048,9 +3049,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
   if (p.preferredXccId != -1 && xccId != p.preferredXccId) return;
 #endif
 
-  // Collect data information
-  int32_t const numSrcs = p.numSrcs;
-  int32_t const numDsts = p.numDsts;
+  // Use template values if >= 0, otherwise use runtime arguments (NUM_SRCS/NUM_DSTS == -1)
+  int32_t const numSrcs = (NUM_SRCS >= 0) ? NUM_SRCS : numSrcsArg;
+  int32_t const numDsts = (NUM_DSTS >= 0) ? NUM_DSTS : numDstsArg;
   PACKED_FLOAT const* __restrict__ srcFloatPacked[MAX_SRCS];
   PACKED_FLOAT* __restrict__ dstFloatPacked[MAX_DSTS];
   for (int i = 0; i < numSrcs; i++) srcFloatPacked[i] = (PACKED_FLOAT const*)p.src[i];
@@ -3188,6 +3189,35 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
 }
 }
+
+// Dispatch wrapper: Selects specialized kernel based on runtime numSrcs/numDsts
+template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE>
+__global__ void __launch_bounds__(BLOCKSIZE)
+GpuReduceKernel(SubExecParam* params, int seType, int warpSize, int waveOrder, int numSubIterations)
+{
+  // Read numSrcs and numDsts from params
+  int const numSrcs = params[blockIdx.y].numSrcs;
+  int const numDsts = params[blockIdx.y].numDsts;
+
+  // Dispatch to specialized implementation for common cases
+  if (numSrcs == 1 && numDsts == 1) {
+    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, 1, 1>
+      (params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
+  }
+  else if (numSrcs == 0 && numDsts == 1) {
+    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, 0, 1>
+      (params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
+  }
+  else if (numSrcs == 1 && numDsts == 0) {
+    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, 1, 0>
+      (params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
+  }
+  else {
+    // Fallback: Use (-1,-1) template which uses runtime arguments for any combination
+    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, -1, -1>
+      (params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
+  }
+}
 #define GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, DWORD)   \
   {GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_NONE>, \
    GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_LOAD>, \
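Because the dispatch wrapper keeps GpuReduceKernel's public signature unchanged, host-side launch code needs no changes. A hedged launch sketch, assuming HIP and illustrative values for the template and launch parameters (float4, 256, 4, the grid dimensions, and the `LaunchReduce` wrapper are all hypothetical):

```cpp
#include <hip/hip_runtime.h>

// Hypothetical host-side launch (illustrative, not TransferBench code): the wrapper
// preserves the original GpuReduceKernel signature, so existing call sites are
// unaffected and the numSrcs/numDsts specialization is selected inside the kernel.
// Grid y spans transfers, matching the params[blockIdx.y] indexing in the wrapper.
void LaunchReduce(SubExecParam* params, int numTransfers, int numBlocks,
                  int seType, int warpSize, int waveOrder, int numSubIterations,
                  hipStream_t stream)
{
  hipLaunchKernelGGL((GpuReduceKernel<float4, 256, 4, TEMPORAL_NONE>),
                     dim3(numBlocks, numTransfers), dim3(256), 0, stream,
                     params, seType, warpSize, waveOrder, numSubIterations);
}
```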