one / TransferBench · Commits

Commit 93df3884 (unverified), authored Nov 04, 2025 by Weile; committed by GitHub, Nov 04, 2025
Parent: 44140eeb

Revert "Add NUM_SRCS/NUM_DSTS template parameters to GpuReduceKernel (#209)"

This reverts commit 44140eeb.

Showing 2 changed files, with 6 additions and 42 deletions:

- CHANGELOG.md: +0 −6
- src/header/TransferBench.hpp: +6 −36
CHANGELOG.md

@@ -8,12 +8,6 @@ Documentation for TransferBench is available at
 - Added warp-level dispatch support via GFX_SE_TYPE environment variable
   - GFX_SE_TYPE=0 (default): Threadblock-level dispatch, each subexecutor is a threadblock
   - GFX_SE_TYPE=1: Warp-level dispatch, each subexecutor is a single warp
-- Added compile-time template specialization for numSrcs/numDsts in GpuReduceKernel
-  - Instantiates optimized kernels for common Transfer types:
-    - Copy (1 src → 1 dst): Optimized single-source data copy
-    - Read-only (1 src → 0 dst): Optimized memory read validation
-    - Write-only (0 src → 1 dst): Optimized memory write/initialization
-  - Compiler eliminates dead code loops for these specialized cases, improving performance by up to 7% for all-to-all workloads on MI3xx machines

 ## v1.64.00
 ### Added
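The performance claim in the reverted changelog entry rests on constant folding: once NUM_SRCS/NUM_DSTS are template constants, the compiler can fully unroll or delete the per-source and per-destination loops. A minimal stand-alone C++ sketch of that sentinel-template pattern (the names sumFirst and runtimeN are illustrative, not TransferBench's):

```cpp
#include <cstdio>

// Sentinel convention: N == -1 means "count known only at runtime";
// any N >= 0 is a compile-time constant the optimizer can exploit.
template <int N>
long sumFirst(long const* data, int runtimeN) {
  int const n = (N >= 0) ? N : runtimeN;  // folds to a constant when N >= 0
  long total = 0;
  for (int i = 0; i < n; ++i)             // fully unrolled, or dead code when N == 0
    total += data[i];
  return total;
}

int main() {
  long data[4] = {1, 2, 3, 4};
  std::printf("%ld\n", sumFirst<2>(data, 4));   // specialized: bound is the constant 2
  std::printf("%ld\n", sumFirst<-1>(data, 4));  // generic: bound read from the argument
  return 0;
}
```

Instantiated with N == 0 (the write-only case), the loop is provably dead and the optimizer removes it; with N == -1 the generic path reads the bound at runtime, which is the convention the reverted kernel used.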
src/header/TransferBench.hpp

@@ -3015,10 +3015,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
 }
 
 // Kernel for GFX execution
-// NUM_SRCS/NUM_DSTS: If 0, use runtime numSrcs/numDsts args; otherwise use template values
-template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE, int NUM_SRCS, int NUM_DSTS>
-__device__ void GpuReduceKernelImpl(SubExecParam* params, int seType, int warpSize, int waveOrder,
-                                    int numSubIterations, int numSrcsArg, int numDstsArg)
+template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE>
+__global__ void __launch_bounds__(BLOCKSIZE)
+GpuReduceKernel(SubExecParam* params, int seType, int warpSize, int waveOrder, int numSubIterations)
 {
   int64_t startCycle;
   // For warp-level, each warp's first thread records timing; for threadblock-level, only first thread of block

@@ -3049,9 +3048,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
   if (p.preferredXccId != -1 && xccId != p.preferredXccId) return;
 #endif
 
-  // Use template values if >= 0, otherwise use runtime arguments (NUM_SRCS/NUM_DSTS == -1)
-  int32_t const numSrcs = (NUM_SRCS >= 0) ? NUM_SRCS : numSrcsArg;
-  int32_t const numDsts = (NUM_DSTS >= 0) ? NUM_DSTS : numDstsArg;
+  // Collect data information
+  int32_t const numSrcs = p.numSrcs;
+  int32_t const numDsts = p.numDsts;
 
   PACKED_FLOAT const* __restrict__ srcFloatPacked[MAX_SRCS];
   PACKED_FLOAT*       __restrict__ dstFloatPacked[MAX_DSTS];
   for (int i = 0; i < numSrcs; i++) srcFloatPacked[i] = (PACKED_FLOAT const*)p.src[i];

@@ -3189,35 +3188,6 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
 }
 }
 
-// Dispatch wrapper: Selects specialized kernel based on runtime numSrcs/numDsts
-template <typename PACKED_FLOAT, int BLOCKSIZE, int UNROLL, int TEMPORAL_MODE>
-__global__ void __launch_bounds__(BLOCKSIZE)
-GpuReduceKernel(SubExecParam* params, int seType, int warpSize, int waveOrder, int numSubIterations)
-{
-  // Read numSrcs and numDsts from params
-  int const numSrcs = params[blockIdx.y].numSrcs;
-  int const numDsts = params[blockIdx.y].numDsts;
-
-  // Dispatch to specialized implementation for common cases
-  if (numSrcs == 1 && numDsts == 1) {
-    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, 1, 1>(
-      params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
-  } else if (numSrcs == 0 && numDsts == 1) {
-    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, 0, 1>(
-      params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
-  } else if (numSrcs == 1 && numDsts == 0) {
-    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, 1, 0>(
-      params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
-  } else {
-    // Fallback: Use (-1,-1) template which uses runtime arguments for any combination
-    GpuReduceKernelImpl<PACKED_FLOAT, BLOCKSIZE, UNROLL, TEMPORAL_MODE, -1, -1>(
-      params, seType, warpSize, waveOrder, numSubIterations, numSrcs, numDsts);
-  }
-}
-
 #define GPU_KERNEL_TEMPORAL_DECL(BLOCKSIZE, UNROLL, DWORD) \
   {GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_NONE>, \
    GpuReduceKernel<DWORD, BLOCKSIZE, UNROLL, TEMPORAL_LOAD>, \
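For context on the trailing macro: GPU_KERNEL_TEMPORAL_DECL builds rows of GpuReduceKernel instantiations indexed by temporal mode, so the revert only has to restore a kernel with the same name and signature for the table entries to stay valid. A hypothetical host-side C++ miniature of that table-of-instantiations idea (the enum, names, and modes below are invented for illustration):

```cpp
#include <cstdio>

enum { MODE_NONE = 0, MODE_LOAD = 1, NUM_MODES = 2 };

// Each instantiation is a distinct concrete function sharing one signature.
template <int BLOCKSIZE, int MODE>
void kernel(int n) {
  std::printf("blocksize %d, mode %d, n=%d\n", BLOCKSIZE, MODE, n);
}

using KernelFn = void (*)(int);

// One macro invocation expands to a row of instantiations for one blocksize.
#define KERNEL_TEMPORAL_DECL(BLOCKSIZE) \
  { kernel<BLOCKSIZE, MODE_NONE>, kernel<BLOCKSIZE, MODE_LOAD> }

int main() {
  KernelFn table[][NUM_MODES] = { KERNEL_TEMPORAL_DECL(256), KERNEL_TEMPORAL_DECL(512) };
  table[1][MODE_LOAD](1024);  // selects kernel<512, MODE_LOAD> at runtime
  return 0;
}
```

Because each template-id names one concrete function, the macro row decays to ordinary function pointers, and a runtime (blocksize, mode) pair indexes straight into a statically instantiated kernel.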