one / TransferBench · Commits

Commit 93df3884 (unverified), authored Nov 04, 2025 by Weile; committed by GitHub, Nov 04, 2025
Parent: 44140eeb

Revert "Add NUM_SRCS/NUM_DSTS template parameters to GpuReduceKernel (#209)"

This reverts commit 44140eeb.

Showing 2 changed files, with 6 additions and 42 deletions:

- CHANGELOG.md: +0 −6
- src/header/TransferBench.hpp: +6 −36
CHANGELOG.md

@@ -8,12 +8,6 @@ Documentation for TransferBench is available at
 - Added warp-level dispatch support via GFX_SE_TYPE environment variable
   - GFX_SE_TYPE=0 (default): Threadblock-level dispatch, each subexecutor is a threadblock
   - GFX_SE_TYPE=1: Warp-level dispatch, each subexecutor is a single warp
-- Added compile-time template specialization for numSrcs/numDsts in GpuReduceKernel
-  - Instantiates optimized kernels for common Transfer types:
-    - Copy (1 src → 1 dst): Optimized single-source data copy
-    - Read-only (1 src → 0 dst): Optimized memory read validation
-    - Write-only (0 src → 1 dst): Optimized memory write/initialization
-  - Compiler eliminates dead code loops for these specialized cases, improving performance by up to 7% for all-to-all workloads on MI3xx machines

 ## v1.64.00
 ### Added
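The performance claim in the reverted changelog entry rests on constant folding: once NUM_SRCS/NUM_DSTS are template constants, the compiler can fully unroll or delete the per-source and per-destination loops. A minimal stand-alone C++ sketch of that sentinel-template pattern (the names sumFirst and runtimeN are illustrative, not TransferBench's):

```cpp
#include <cstdio>

// Sentinel convention: N == -1 means "count known only at runtime";
// any N >= 0 is a compile-time constant the optimizer can exploit.
template <int N>
long sumFirst(long const* data, int runtimeN) {
  int const n = (N >= 0) ? N : runtimeN;  // folds to a constant when N >= 0
  long total = 0;
  for (int i = 0; i < n; ++i)             // fully unrolled, or dead code when N == 0
    total += data[i];
  return total;
}

int main() {
  long data[4] = {1, 2, 3, 4};
  std::printf("%ld\n", sumFirst<2>(data, 4));   // specialized: bound is the constant 2
  std::printf("%ld\n", sumFirst<-1>(data, 4));  // generic: bound read from the argument
  return 0;
}
```

Instantiated with N == 0 (the write-only case), the loop is provably dead and the optimizer removes it; with N == -1 the generic path reads the bound at runtime, which is the convention the reverted kernel used.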
src/header/TransferBench.hpp

@@ -3015,10 +3015,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
 }
 
 // Kernel for GFX execution
-// NUM_SRCS/NUM_DSTS: If 0, use runtime numSrcs/numDsts args; otherwise use template values
-template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE, int NUM_SRCS, int NUM_DSTS>
-__device__ void GpuReduceKernelImpl(SubExecParam* params, int seType, int warpSize, int waveOrder,
-                                    int numSubIterations, int numSrcsArg, int numDstsArg)
+template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE>
+__global__ void __launch_bounds__(BLOCKSIZE)
+GpuReduceKernel(SubExecParam* params, int seType, int warpSize, int waveOrder, int numSubIterations)
 {
   int64_t startCycle;
   // For warp-level, each warp's first thread records timing; for threadblock-level, only first thread of block

@@ -3049,9 +3048,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
   if (p.preferredXccId != -1 && xccId != p.preferredXccId) return;
 #endif
 
-  // Use template values if >= 0, otherwise use runtime arguments (NUM_SRCS/NUM_DSTS == -1)
-  int32_t const numSrcs = (NUM_SRCS >= 0) ? NUM_SRCS : numSrcsArg;
-  int32_t const numDsts = (NUM_DSTS >= 0) ? NUM_DSTS : numDstsArg;
+  // Collect data information
+  int32_t const numSrcs = p.numSrcs;
+  int32_t const numDsts = p.numDsts;
 
   PACKED_FLOAT const* __restrict__ srcFloatPacked[MAX_SRCS];
   PACKED_FLOAT*       __restrict__ dstFloatPacked[MAX_DSTS];
   for (int i = 0; i < numSrcs; i++) srcFloatPacked[i] = (PACKED_FLOAT const*)p.src[i];

@@ -3189,35 +3188,6 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
 }
 }
 
-// Dispatch wrapper: Selects specialized kernel based on runtime numSrcs/numDsts
-template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE>
-__global__ void __launch_bounds__(BLOCKSIZE)
-GpuReduceKernel(SubExecParam* params, int seType, int warpSize, int waveOrder, int numSubIterations)
-{
-  // Read numSrcs and numDsts from params
-  int const numSrcs = params[blockIdx.y].numSrcs;
-  int const numDsts = params[blockIdx.y].numDsts;
-
-  // Dispatch to specialized implementation for common cases
-  if (numSrcs == 1 && numDsts == 1) {
-    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, 1, 1>(
-      params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
-  } else if (numSrcs == 0 && numDsts == 1) {
-    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, 0, 1>(
-      params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
-  } else if (numSrcs == 1 && numDsts == 0) {
-    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, 1, 0>(
-      params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
-  } else {
-    // Fallback: Use (-1,-1) template which uses runtime arguments for any combination
-    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, -1, -1>(
-      params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
-  }
-}
-
 #define GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, DWORD) \
   {GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_NONE>, \
    GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_LOAD>, \
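For context on the trailing macro: GPU_KERNEL_TEMPORAL_DECL builds rows of GpuReduceKernel instantiations indexed by temporal mode, so the revert only has to restore a kernel with the same name and signature for the table entries to stay valid. A hypothetical host-side C++ miniature of that table-of-instantiations idea (the enum, names, and modes below are invented for illustration):

```cpp
#include <cstdio>

enum { MODE_NONE = 0, MODE_LOAD = 1, NUM_MODES = 2 };

// Each instantiation is a distinct concrete function sharing one signature.
template <int BLOCKSIZE, int MODE>
void kernel(int n) {
  std::printf("blocksize %d, mode %d, n=%d\n", BLOCKSIZE, MODE, n);
}

using KernelFn = void (*)(int);

// One macro invocation expands to a row of instantiations for one blocksize.
#define KERNEL_TEMPORAL_DECL(BLOCKSIZE) \
  { kernel<BLOCKSIZE, MODE_NONE>, kernel<BLOCKSIZE, MODE_LOAD> }

int main() {
  KernelFn table[][NUM_MODES] = { KERNEL_TEMPORAL_DECL(256), KERNEL_TEMPORAL_DECL(512) };
  table[1][MODE_LOAD](1024);  // selects kernel<512, MODE_LOAD> at runtime
  return 0;
}
```

Because each template-id names one concrete function, the macro row decays to ordinary function pointers, and a runtime (blocksize, mode) pair indexes straight into a statically instantiated kernel.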