Unverified Commit 44140eeb authored by Weile, committed by GitHub

Add NUM_SRCS/NUM_DSTS template parameters to GpuReduceKernel (#209)

- Instantiates optimized kernels for common Transfer types:
  - Copy (1 src → 1 dst): Optimized single-source data copy
  - Read-only (1 src → 0 dst): Optimized memory read validation
  - Write-only (0 src → 1 dst): Optimized memory write/initialization
- The compiler eliminates the dead loops in these specialized cases, improving performance by up to 7% for all-to-all workloads on MI3xx machines (see the sketch below)
  - Update CHANGELOG
parent a73bf6de
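
The core idea, as a minimal sketch (illustrative, not TransferBench code; `ReduceSrcs` and its parameters are hypothetical): when the loop bound is a non-negative template constant, the compiler knows it at instantiation time and can unroll or delete the loop outright, while a sentinel of -1 falls back to the runtime argument.

```cpp
// Minimal sketch of the specialization idea (hypothetical helper, not TransferBench code).
// When NUM_SRCS is a non-negative template constant, the loop bound is known at compile
// time: the loop is deleted entirely for NUM_SRCS == 0 and collapses to a single load
// for NUM_SRCS == 1. The sentinel -1 falls back to the runtime argument.
template <int NUM_SRCS>
__device__ float ReduceSrcs(float const* const* srcs, int numSrcsArg, int idx)
{
  int const numSrcs = (NUM_SRCS >= 0) ? NUM_SRCS : numSrcsArg;
  float sum = 0.0f;
  for (int i = 0; i < numSrcs; ++i)
    sum += srcs[i][idx];
  return sum;
}
```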
@@ -8,6 +8,12 @@ Documentation for TransferBench is available at
 - Added warp-level dispatch support via GFX_SE_TYPE environment variable
   - GFX_SE_TYPE=0 (default): Threadblock-level dispatch, each subexecutor is a threadblock
   - GFX_SE_TYPE=1: Warp-level dispatch, each subexecutor is a single warp
+- Added compile-time template specialization for numSrcs/numDsts in GpuReduceKernel
+  - Instantiates optimized kernels for common Transfer types:
+    - Copy (1 src → 1 dst): Optimized single-source data copy
+    - Read-only (1 src → 0 dst): Optimized memory read validation
+    - Write-only (0 src → 1 dst): Optimized memory write/initialization
+  - Compiler eliminates the dead loops in these specialized cases, improving performance by up to 7% for all-to-all workloads on MI3xx machines
 ## v1.64.00
 ### Added
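For context on the two GFX_SE_TYPE modes listed in the changelog above, a hedged sketch of how a kernel could map threads to subexecutors under each mode (`GetSubExecIdx` is a hypothetical helper, not part of TransferBench):

```cpp
// Hypothetical helper (not TransferBench code) illustrating the two dispatch modes:
// GFX_SE_TYPE=0 treats each threadblock as one subexecutor; GFX_SE_TYPE=1 treats each
// warp as one subexecutor, so a block contributes blockDim.x / warpSize subexecutors.
__device__ int GetSubExecIdx(int seType, int warpSize)
{
  if (seType == 1)
    return blockIdx.x * (blockDim.x / warpSize) + threadIdx.x / warpSize;
  return blockIdx.x;
}
```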
@@ -3015,9 +3015,10 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
 }
 
 // Kernel for GFX execution
-template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE>
-__global__ void __launch_bounds__(BLOCKSIZE)
-GpuReduceKernel(SubExecParam* params, int seType, int warpSize, int waveOrder, int numSubIterations)
+// NUM_SRCS/NUM_DSTS: If -1, use runtime numSrcs/numDsts args; otherwise use template values
+template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE,
+          int NUM_SRCS, int NUM_DSTS>
+__device__ void GpuReduceKernelImpl(SubExecParam* params, int seType, int warpSize, int waveOrder, int numSubIterations, int numSrcsArg, int numDstsArg)
 {
   int64_t startCycle;
   // For warp-level, each warp's first thread records timing; for threadblock-level, only first thread of block
@@ -3048,9 +3049,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
   if (p.preferredXccId != -1 && xccId != p.preferredXccId) return;
 #endif
 
-  // Collect data information
-  int32_t const numSrcs = p.numSrcs;
-  int32_t const numDsts = p.numDsts;
+  // Use template values if >= 0, otherwise use runtime arguments (NUM_SRCS/NUM_DSTS == -1)
+  int32_t const numSrcs = (NUM_SRCS >= 0) ? NUM_SRCS : numSrcsArg;
+  int32_t const numDsts = (NUM_DSTS >= 0) ? NUM_DSTS : numDstsArg;
   PACKED_FLOAT const* __restrict__ srcFloatPacked[MAX_SRCS];
   PACKED_FLOAT* __restrict__ dstFloatPacked[MAX_DSTS];
   for (int i = 0; i < numSrcs; i++) srcFloatPacked[i] = (PACKED_FLOAT const*)p.src[i];
@@ -3188,6 +3189,35 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
 }
 }
+
+// Dispatch wrapper: Selects specialized kernel based on runtime numSrcs/numDsts
+template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE>
+__global__ void __launch_bounds__(BLOCKSIZE)
+GpuReduceKernel(SubExecParam* params, int seType, int warpSize, int waveOrder, int numSubIterations)
+{
+  // Read numSrcs and numDsts from params
+  int const numSrcs = params[blockIdx.y].numSrcs;
+  int const numDsts = params[blockIdx.y].numDsts;
+
+  // Dispatch to specialized implementation for common cases
+  if (numSrcs == 1 && numDsts == 1) {
+    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, 1, 1>
+      (params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
+  }
+  else if (numSrcs == 0 && numDsts == 1) {
+    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, 0, 1>
+      (params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
+  }
+  else if (numSrcs == 1 && numDsts == 0) {
+    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, 1, 0>
+      (params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
+  }
+  else {
+    // Fallback: Use (-1,-1) template which uses runtime arguments for any combination
+    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, -1, -1>
+      (params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
+  }
+}
 #define GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, DWORD)   \
   {GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_NONE>, \
    GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_LOAD>, \
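Because the dispatch wrapper keeps GpuReduceKernel's public signature unchanged, host-side launch code needs no changes. A hedged launch sketch, assuming HIP and illustrative values for the template and launch parameters (float4, 256, 4, the grid dimensions, and the `LaunchReduce` wrapper are all hypothetical):

```cpp
#include <hip/hip_runtime.h>

// Hypothetical host-side launch (illustrative, not TransferBench code): the wrapper
// preserves the original GpuReduceKernel signature, so existing call sites are
// unaffected and the numSrcs/numDsts specialization is selected inside the kernel.
// Grid y spans transfers, matching the params[blockIdx.y] indexing in the wrapper.
void LaunchReduce(SubExecParam* params, int numTransfers, int numBlocks,
                  int seType, int warpSize, int waveOrder, int numSubIterations,
                  hipStream_t stream)
{
  hipLaunchKernelGGL((GpuReduceKernel<float4, 256, 4, TEMPORAL_NONE>),
                     dim3(numBlocks, numTransfers), dim3(256), 0, stream,
                     params, seType, warpSize, waveOrder, numSubIterations);
}
```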