Unverified commit 93df3884 authored by Weile, committed by GitHub

Revert "Add NUM_SRCS/NUM_DSTS template parameters to GpuReduceKernel (#209)"

This reverts commit 44140eeb.
parent 44140eeb
@@ -8,12 +8,6 @@ Documentation for TransferBench is available at
 - Added warp-level dispatch support via GFX_SE_TYPE environment variable
   - GFX_SE_TYPE=0 (default): Threadblock-level dispatch, each subexecutor is a threadblock
   - GFX_SE_TYPE=1: Warp-level dispatch, each subexecutor is a single warp
-- Added compile-time template specialization for numSrcs/numDsts in GpuReduceKernel
-  - Instantiates optimized kernels for common Transfer types:
-    - Copy (1 src → 1 dst): Optimized single-source data copy
-    - Read-only (1 src → 0 dst): Optimized memory read validation
-    - Write-only (0 src → 1 dst): Optimized memory write/initialization
-  - Compiler eliminates dead code loops for these specialized cases, improving performance by up to 7% for all-to-all workloads on MI3xx machines
 
 ## v1.64.00
 ### Added
......
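For context on the GFX_SE_TYPE bullets above, here is a minimal sketch of how the two dispatch granularities might map threads onto subexecutors. This is an illustration only, not TransferBench's actual indexing code; the names GetSubExecLane, subExecId, laneId, and laneCount are hypothetical.

```cpp
// Hypothetical sketch of the two GFX_SE_TYPE dispatch granularities.
// seType == 1: each warp acts as an independent subexecutor.
// seType == 0: each threadblock is one subexecutor.
__device__ void GetSubExecLane(int seType, int warpSize,
                               int& subExecId, int& laneId, int& laneCount)
{
  if (seType == 1) {
    int const warpsPerBlock = blockDim.x / warpSize;
    subExecId = blockIdx.x * warpsPerBlock + threadIdx.x / warpSize;
    laneId    = threadIdx.x % warpSize;   // lane within this warp
    laneCount = warpSize;                 // subexecutor width = one warp
  } else {
    subExecId = blockIdx.x;
    laneId    = threadIdx.x;              // lane within the block
    laneCount = blockDim.x;               // subexecutor width = whole block
  }
}
```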
@@ -3015,10 +3015,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
 }
 
 // Kernel for GFX execution
-// NUM_SRCS/NUM_DSTS: If 0, use runtime numSrcs/numDsts args; otherwise use template values
-template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE,
-          int NUM_SRCS, int NUM_DSTS>
-__device__ void GpuReduceKernelImpl(SubExecParam* params, int seType, int warpSize, int waveOrder, int numSubIterations, int numSrcsArg, int numDstsArg)
+template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE>
+__global__ void __launch_bounds__(BLOCKSIZE)
+GpuReduceKernel(SubExecParam* params, int seType, int warpSize, int waveOrder, int numSubIterations)
 {
   int64_t startCycle;
   // For warp-level, each warp's first thread records timing; for threadblock-level, only first thread of block
@@ -3049,9 +3048,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
   if (p.preferredXccId != -1 && xccId != p.preferredXccId) return;
 #endif
 
-  // Use template values if >= 0, otherwise use runtime arguments (NUM_SRCS/NUM_DSTS == -1)
-  int32_t const numSrcs = (NUM_SRCS >= 0) ? NUM_SRCS : numSrcsArg;
-  int32_t const numDsts = (NUM_DSTS >= 0) ? NUM_DSTS : numDstsArg;
+  // Collect data information
+  int32_t const numSrcs = p.numSrcs;
+  int32_t const numDsts = p.numDsts;
   PACKED_FLOAT const* __restrict__ srcFloatPacked[MAX_SRCS];
   PACKED_FLOAT* __restrict__ dstFloatPacked[MAX_DSTS];
   for (int i = 0; i < numSrcs; i++) srcFloatPacked[i] = (PACKED_FLOAT const*)p.src[i];
@@ -3189,35 +3188,6 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
   }
 }
 
-// Dispatch wrapper: Selects specialized kernel based on runtime numSrcs/numDsts
-template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE>
-__global__ void __launch_bounds__(BLOCKSIZE)
-GpuReduceKernel(SubExecParam* params, int seType, int warpSize, int waveOrder, int numSubIterations)
-{
-  // Read numSrcs and numDsts from params
-  int const numSrcs = params[blockIdx.y].numSrcs;
-  int const numDsts = params[blockIdx.y].numDsts;
-
-  // Dispatch to specialized implementation for common cases
-  if (numSrcs == 1 && numDsts == 1) {
-    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, 1, 1>
-      (params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
-  }
-  else if (numSrcs == 0 && numDsts == 1) {
-    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, 0, 1>
-      (params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
-  }
-  else if (numSrcs == 1 && numDsts == 0) {
-    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, 1, 0>
-      (params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
-  }
-  else {
-    // Fallback: Use (-1,-1) template which uses runtime arguments for any combination
-    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, -1, -1>
-      (params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
-  }
-}
-
 #define GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, DWORD) \
   {GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_NONE>, \
    GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_LOAD>, \
......
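The reverted change hinged on turning numSrcs/numDsts into compile-time constants so the compiler can unroll or delete the source/destination loops. Below is a minimal standalone sketch of that pattern, using the diff's own convention that a negative template value falls back to the runtime argument; SumSources and its parameters are hypothetical names, not TransferBench code.

```cpp
// Hypothetical sketch of the specialization pattern from the reverted commit.
// When NUM_SRCS >= 0 the trip count is a compile-time constant: the compiler
// fully unrolls the loop (NUM_SRCS == 1) or deletes it outright (NUM_SRCS == 0).
// NUM_SRCS == -1 keeps the generic runtime loop as a fallback.
template <int NUM_SRCS>
__device__ float SumSources(float const* const* srcs, int idx, int numSrcsArg)
{
  int const numSrcs = (NUM_SRCS >= 0) ? NUM_SRCS : numSrcsArg;
  float sum = 0.0f;
  for (int i = 0; i < numSrcs; i++)   // constant trip count when NUM_SRCS >= 0
    sum += srcs[i][idx];
  return sum;
}
```

An if/else on the runtime values, like the removed GpuReduceKernel wrapper above, then routes each Transfer shape (copy, read-only, write-only) to the matching instantiation, with the <-1> instantiation as the catch-all for everything else.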