Unverified commit 93df3884, authored by Weile, committed by GitHub

Revert "Add NUM_SRCS/NUM_DSTS template parameters to GpuReduceKernel (#209)"

This reverts commit 44140eeb.
parent 44140eeb
@@ -8,12 +8,6 @@ Documentation for TransferBench is available at
 - Added warp-level dispatch support via GFX_SE_TYPE environment variable
   - GFX_SE_TYPE=0 (default): Threadblock-level dispatch, each subexecutor is a threadblock
   - GFX_SE_TYPE=1: Warp-level dispatch, each subexecutor is a single warp
-- Added compile-time template specialization for numSrcs/numDsts in GpuReduceKernel
-  - Instantiates optimized kernels for common Transfer types:
-    - Copy (1 src → 1 dst): Optimized single-source data copy
-    - Read-only (1 src → 0 dst): Optimized memory read validation
-    - Write-only (0 src → 1 dst): Optimized memory write/initialization
-  - Compiler eliminates dead code loops for these specialized cases, improving performance by up to 7% for all-to-all workloads on MI3xx machines
 ## v1.64.00
 ### Added
...
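The changelog lines removed above describe the technique this commit reverts: making the source/destination counts compile-time template parameters so the compiler can specialize or delete the per-source and per-destination loops. A minimal HIP sketch of that pattern follows; the names (`ReduceSketch`, `numElems`) are illustrative, not the project's actual kernel signature.

```cpp
#include <hip/hip_runtime.h>

// NUM_SRCS/NUM_DSTS: -1 means "use the runtime arguments"; any value >= 0 is
// a compile-time constant the optimizer can propagate into the loops below.
template <int NUM_SRCS, int NUM_DSTS>
__global__ void ReduceSketch(float const* const* srcs, float* const* dsts,
                             int numSrcsArg, int numDstsArg, size_t numElems)
{
  int const numSrcs = (NUM_SRCS >= 0) ? NUM_SRCS : numSrcsArg;
  int const numDsts = (NUM_DSTS >= 0) ? NUM_DSTS : numDstsArg;

  size_t const idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= numElems) return;

  // With NUM_SRCS == 1 this collapses to a single load; with NUM_SRCS == 0
  // (write-only transfers) the loop is dead code and disappears entirely.
  float sum = 0.0f;
  for (int i = 0; i < numSrcs; i++) sum += srcs[i][idx];

  // Likewise NUM_DSTS == 0 (read-only transfers) eliminates the store loop.
  // (The real kernel keeps the accumulated value live for validation;
  // that bookkeeping is omitted in this sketch.)
  for (int i = 0; i < numDsts; i++) dsts[i][idx] = sum;
}
```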
@@ -3015,10 +3015,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
 }
 // Kernel for GFX execution
-// NUM_SRCS/NUM_DSTS: If -1, use runtime numSrcs/numDsts args; otherwise use template values
-template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE,
-          int NUM_SRCS, int NUM_DSTS>
-__device__ void GpuReduceKernelImpl(SubExecParam* params, int seType, int warpSize, int waveOrder, int numSubIterations, int numSrcsArg, int numDstsArg)
+template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE>
+__global__ void __launch_bounds__(BLOCKSIZE)
+GpuReduceKernel(SubExecParam* params, int seType, int warpSize, int waveOrder, int numSubIterations)
 {
   int64_t startCycle;
   // For warp-level, each warp's first thread records timing; for threadblock-level, only first thread of block
@@ -3049,9 +3048,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
   if (p.preferredXccId != -1 && xccId != p.preferredXccId) return;
 #endif
-  // Use template values if >= 0, otherwise use runtime arguments (NUM_SRCS/NUM_DSTS == -1)
-  int32_t const numSrcs = (NUM_SRCS >= 0) ? NUM_SRCS : numSrcsArg;
-  int32_t const numDsts = (NUM_DSTS >= 0) ? NUM_DSTS : numDstsArg;
+  // Collect data information
+  int32_t const numSrcs = p.numSrcs;
+  int32_t const numDsts = p.numDsts;
   PACKED_FLOAT const* __restrict__ srcFloatPacked[MAX_SRCS];
   PACKED_FLOAT*       __restrict__ dstFloatPacked[MAX_DSTS];
   for (int i = 0; i < numSrcs; i++) srcFloatPacked[i] = (PACKED_FLOAT const*)p.src[i];
@@ -3189,35 +3188,6 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
   }
 }
-// Dispatch wrapper: Selects specialized kernel based on runtime numSrcs/numDsts
-template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE>
-__global__ void __launch_bounds__(BLOCKSIZE)
-GpuReduceKernel(SubExecParam* params, int seType, int warpSize, int waveOrder, int numSubIterations)
-{
-  // Read numSrcs and numDsts from params
-  int const numSrcs = params[blockIdx.y].numSrcs;
-  int const numDsts = params[blockIdx.y].numDsts;
-  // Dispatch to specialized implementation for common cases
-  if (numSrcs == 1 && numDsts == 1) {
-    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, 1, 1>
-      (params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
-  }
-  else if (numSrcs == 0 && numDsts == 1) {
-    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, 0, 1>
-      (params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
-  }
-  else if (numSrcs == 1 && numDsts == 0) {
-    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, 1, 0>
-      (params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
-  }
-  else {
-    // Fallback: Use (-1,-1) template which uses runtime arguments for any combination
-    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, -1, -1>
-      (params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
-  }
-}
 #define GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, DWORD)   \
   {GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_NONE>, \
    GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_LOAD>, \
...
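The deleted `GpuReduceKernel` wrapper above is the usual runtime-to-compile-time dispatch idiom: branch once on the runtime shape, then forward to a template instantiation, with a generic (-1, -1) instantiation as the fallback. It branches on the device because a single launch can mix Transfers of different shapes (each `blockIdx.y` indexes its own `SubExecParam`). For illustration only, here is a host-side sketch of the same selection logic, reusing the hypothetical `ReduceSketch` kernel from the sketch earlier on this page.

```cpp
// Host-side analogue of the deleted device-side dispatch (illustrative names).
using KernelFn = void (*)(float const* const*, float* const*, int, int, size_t);

// Map a runtime (numSrcs, numDsts) pair to a specialized instantiation,
// falling back to the generic (-1, -1) version for uncommon shapes.
KernelFn SelectKernel(int numSrcs, int numDsts)
{
  if (numSrcs == 1 && numDsts == 1) return ReduceSketch<1, 1>;  // Copy
  if (numSrcs == 1 && numDsts == 0) return ReduceSketch<1, 0>;  // Read-only
  if (numSrcs == 0 && numDsts == 1) return ReduceSketch<0, 1>;  // Write-only
  return ReduceSketch<-1, -1>;                                  // Generic fallback
}
```

One design note on the reverted approach: dispatching inside a single `__global__` wrapper keeps one kernel symbol per (BLOCKSIZE, UNROLL, TEMPORAL_MODE) combination, so the GPU_KERNEL_TEMPORAL_DECL tables need no changes; the trade-off is that each wrapper carries all four `GpuReduceKernelImpl` instantiations.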