Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
one
TransferBench
Commits
3f8d00df
Unverified
Commit
3f8d00df
authored
Nov 12, 2025
by
gilbertlee-amd
Committed by
GitHub
Nov 12, 2025
Browse files
v1.65 Adding warp-subexecutor support via GFX_SE_TYPE (#212)
Co-authored-by:
Weile
<
weile.wei@amd.com
>
parent
42b7d85b
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
111 additions
and
23 deletions
+111
-23
CHANGELOG.md
CHANGELOG.md
+6
-0
CMakeLists.txt
CMakeLists.txt
+1
-1
src/client/EnvVars.hpp
src/client/EnvVars.hpp
+6
-0
src/header/TransferBench.hpp
src/header/TransferBench.hpp
+98
-22
No files found.
CHANGELOG.md
View file @
3f8d00df
...
...
@@ -3,6 +3,12 @@
Documentation for TransferBench is available at
[
https://rocm.docs.amd.com/projects/TransferBench
](
https://rocm.docs.amd.com/projects/TransferBench
)
.
## v1.65.00
### Added
-
Added warp-level dispatch support via GFX_SE_TYPE environment variable
-
GFX_SE_TYPE=0 (default): Threadblock-level dispatch, each subexecutor is a threadblock
-
GFX_SE_TYPE=1: Warp-level dispatch, each subexecutor is a single warp
## v1.64.00
### Added
-
Added BLOCKSIZES to a2asweep preset to allow also sweeping over threadblock sizes
...
...
CMakeLists.txt
View file @
3f8d00df
...
...
@@ -9,7 +9,7 @@ if (NOT CMAKE_TOOLCHAIN_FILE)
message
(
STATUS
"CMAKE_TOOLCHAIN_FILE:
${
CMAKE_TOOLCHAIN_FILE
}
"
)
endif
()
set
(
VERSION_STRING
"1.6
4
.00"
)
set
(
VERSION_STRING
"1.6
5
.00"
)
project
(
TransferBench VERSION
${
VERSION_STRING
}
LANGUAGES CXX
)
## Load CMake modules
...
...
src/client/EnvVars.hpp
View file @
3f8d00df
...
...
@@ -89,6 +89,7 @@ public:
int
gfxBlockSize
;
// Size of each threadblock (must be multiple of 64)
vector
<
uint32_t
>
cuMask
;
// Bit-vector representing the CU mask
vector
<
vector
<
int
>>
prefXccTable
;
// Specifies XCC to use for given exe->dst pair
int
gfxSeType
;
// GFX subexecutor type (0=threadblock, 1=warp)
int
gfxTemporal
;
// Non-temporal load/store mode (0=none, 1=load, 2=store, 3=both)
int
gfxUnroll
;
// GFX-kernel unroll factor
int
useHipEvents
;
// Use HIP events for timing GFX/DMA Executor
...
...
@@ -141,6 +142,7 @@ public:
fillCompress
=
GetEnvVarArray
(
"FILL_COMPRESS"
,
{});
gfxBlockOrder
=
GetEnvVar
(
"GFX_BLOCK_ORDER"
,
0
);
gfxBlockSize
=
GetEnvVar
(
"GFX_BLOCK_SIZE"
,
256
);
gfxSeType
=
GetEnvVar
(
"GFX_SE_TYPE"
,
0
);
gfxSingleTeam
=
GetEnvVar
(
"GFX_SINGLE_TEAM"
,
1
);
gfxTemporal
=
GetEnvVar
(
"GFX_TEMPORAL"
,
0
);
gfxUnroll
=
GetEnvVar
(
"GFX_UNROLL"
,
defaultGfxUnroll
);
...
...
@@ -320,6 +322,7 @@ public:
printf
(
" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits
\n
"
);
printf
(
" GFX_BLOCK_ORDER - How blocks for transfers are ordered. 0=sequential, 1=interleaved
\n
"
);
printf
(
" GFX_BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)
\n
"
);
printf
(
" GFX_SE_TYPE - SubExecutor granularity type (0=threadblock, 1=warp)
\n
"
);
printf
(
" GFX_TEMPORAL - Use of non-temporal loads or stores (0=none 1=loads 2=stores 3=both)
\n
"
);
printf
(
" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d
\n
"
,
TransferBench
::
GetIntAttribute
(
ATR_GFX_MAX_UNROLL
));
printf
(
" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on disjoint subarrays
\n
"
);
...
...
@@ -411,6 +414,8 @@ public:
"Thread block ordering: %s"
,
gfxBlockOrder
==
0
?
"Sequential"
:
"Interleaved"
);
Print
(
"GFX_BLOCK_SIZE"
,
gfxBlockSize
,
"Threadblock size of %d"
,
gfxBlockSize
);
Print
(
"GFX_SE_TYPE"
,
gfxSeType
,
"SubExecutor granularity: %s"
,
gfxSeType
==
0
?
"Threadblock"
:
"Warp"
);
Print
(
"GFX_SINGLE_TEAM"
,
gfxSingleTeam
,
"%s"
,
(
gfxSingleTeam
?
"Combining CUs to work across entire data array"
:
"Each CUs operates on its own disjoint subarray"
));
...
...
@@ -619,6 +624,7 @@ public:
cfg
.
gfx
.
blockSize
=
gfxBlockSize
;
cfg
.
gfx
.
cuMask
=
cuMask
;
cfg
.
gfx
.
prefXccTable
=
prefXccTable
;
cfg
.
gfx
.
seType
=
gfxSeType
;
cfg
.
gfx
.
unrollFactor
=
gfxUnroll
;
cfg
.
gfx
.
temporalMode
=
gfxTemporal
;
cfg
.
gfx
.
useHipEvents
=
useHipEvents
;
...
...
src/header/TransferBench.hpp
View file @
3f8d00df
...
...
@@ -66,7 +66,7 @@ namespace TransferBench
using
std
::
set
;
using
std
::
vector
;
constexpr
char
VERSION
[]
=
"1.6
4
"
;
constexpr
char
VERSION
[]
=
"1.6
5
"
;
/**
* Enumeration of supported Executor types
...
...
@@ -180,6 +180,7 @@ namespace TransferBench
int
blockSize
=
256
;
///< Size of each threadblock (must be multiple of 64)
vector
<
uint32_t
>
cuMask
=
{};
///< Bit-vector representing the CU mask
vector
<
vector
<
int
>>
prefXccTable
=
{};
///< 2D table with preferred XCD to use for a specific [src][dst] GPU device
int
seType
=
0
;
///< SubExecutor granularity type (0=threadblock, 1=warp)
int
temporalMode
=
0
;
///< Non-temporal load/store mode 0=none, 1=load, 2=store, 3=both
int
unrollFactor
=
4
;
///< GFX-kernel unroll factor
int
useHipEvents
=
1
;
///< Use HIP events for timing GFX Executor
...
...
@@ -459,6 +460,7 @@ namespace TransferBench
// Enumerations
#define hipDeviceAttributeClockRate cudaDevAttrClockRate
#define hipDeviceAttributeMultiprocessorCount cudaDevAttrMultiProcessorCount
#define hipDeviceAttributeWarpSize cudaDevAttrWarpSize
#define hipErrorPeerAccessAlreadyEnabled cudaErrorPeerAccessAlreadyEnabled
#define hipFuncCachePreferShared cudaFuncCachePreferShared
#define hipMemcpyDefault cudaMemcpyDefault
...
...
@@ -608,6 +610,38 @@ namespace {
int
constexpr
MEMSET_CHAR
=
75
;
// Value to memset (char)
float
constexpr
MEMSET_VAL
=
13323083.0
f
;
// Value to memset (double)
int
GetWarpSize
(
std
::
vector
<
ErrResult
>*
errors
=
nullptr
)
{
int
warpSize
=
0
;
hipError_t
err
=
hipDeviceGetAttribute
(
&
warpSize
,
hipDeviceAttributeWarpSize
,
0
);
if
(
err
==
hipSuccess
)
{
return
warpSize
;
}
// Query failed, report error and fall back to compile-time default
if
(
errors
)
{
errors
->
push_back
({
ERR_WARN
,
"Failed to query device warp size (hipDeviceGetAttribute error: %d). "
"Falling back to compile-time default"
,
err
});
}
#if defined(__NVCC__)
return
32
;
#else
return
64
;
#endif
}
// Calculate grid Y dimension based on SE_TYPE
int
CalculateGridY
(
int
seType
,
int
blockSize
,
int
numSubExecs
)
{
// Warp-level: each subexecutor is a warp, pack warps into threadblocks
if
(
seType
==
1
)
{
int
warpsPerBlock
=
blockSize
/
GetWarpSize
();
return
(
numSubExecs
+
warpsPerBlock
-
1
)
/
warpsPerBlock
;
}
// Default: Threadblock-level, each subexecutor is a threadblock
return
numSubExecs
;
}
// Parsing-related functions
//========================================================================================
...
...
@@ -1303,11 +1337,17 @@ namespace {
{
// Check total number of subexecutors requested
int
numGpuSubExec
=
GetNumSubExecutors
(
exeDevice
);
// For warp-level dispatch, multiply by warps per threadblock
if
(
cfg
.
gfx
.
seType
==
1
)
{
int
warpsPerBlock
=
cfg
.
gfx
.
blockSize
/
GetWarpSize
(
&
errors
);
numGpuSubExec
*=
warpsPerBlock
;
}
if
(
totalSubExecs
[
exeDevice
]
>
numGpuSubExec
)
errors
.
push_back
({
ERR_WARN
,
"GPU %d requests %d total
CU
s however only %d available. "
"GPU %d requests %d total
%
s however only %d available. "
"Serialization will occur"
,
exeDevice
.
exeIndex
,
totalSubExecs
[
exeDevice
],
numGpuSubExec
});
exeDevice
.
exeIndex
,
totalSubExecs
[
exeDevice
],
cfg
.
gfx
.
seType
==
0
?
"CUs"
:
"warps"
,
numGpuSubExec
});
// Check that if executor subindices are used, all Transfers specify executor subindices
if
(
useSubIndexCount
[
exeDevice
]
>
0
&&
useSubIndexCount
[
exeDevice
]
!=
transferCount
[
exeDevice
])
{
errors
.
push_back
({
ERR_FATAL
,
...
...
@@ -2977,12 +3017,29 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Kernel for GFX execution
template
<
typename
PACKED_FLOAT
,
int
BLOCKSIZE
,
int
UNROLL
,
int
TEMPORAL_MODE
>
__global__
void
__launch_bounds__
(
BLOCKSIZE
)
GpuReduceKernel
(
SubExecParam
*
params
,
int
waveOrder
,
int
numSubIterations
)
GpuReduceKernel
(
SubExecParam
*
params
,
int
seType
,
int
waveOrder
,
int
numSubIterations
)
{
int64_t
startCycle
;
if
(
threadIdx
.
x
==
0
)
startCycle
=
GetTimestamp
();
// For warp-level, each warp's first thread records timing; for threadblock-level, only first thread of block
bool
shouldRecordTiming
=
(
seType
==
1
)
?
(
threadIdx
.
x
%
warpSize
==
0
)
:
(
threadIdx
.
x
==
0
);
if
(
shouldRecordTiming
)
startCycle
=
GetTimestamp
();
// seType: 0=threadblock, 1=warp
int
subExecIdx
;
if
(
seType
==
0
)
{
// Threadblock-level: each threadblock is a subexecutor
subExecIdx
=
blockIdx
.
y
;
}
else
{
// Warp-level: each warp is a subexecutor
int
warpIdx
=
threadIdx
.
x
/
warpSize
;
int
warpsPerBlock
=
BLOCKSIZE
/
warpSize
;
subExecIdx
=
blockIdx
.
y
*
warpsPerBlock
+
warpIdx
;
}
SubExecParam
&
p
=
params
[
subExecIdx
];
SubExecParam
&
p
=
params
[
blockIdx
.
y
];
// For warp-level dispatch, inactive warps should return early
if
(
seType
==
1
&&
p
.
N
==
0
)
return
;
// Filter by XCC
#if !defined(__NVCC__)
...
...
@@ -3002,8 +3059,16 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Operate on wavefront granularity
int32_t
const
nTeams
=
p
.
teamSize
;
// Number of threadblocks working together on this subarray
int32_t
const
teamIdx
=
p
.
teamIdx
;
// Index of this threadblock within the team
int32_t
const
nWaves
=
BLOCKSIZE
/
warpSize
;
// Number of wavefronts within this threadblock
int32_t
const
waveIdx
=
threadIdx
.
x
/
warpSize
;
// Index of this wavefront within the threadblock
int32_t
nWaves
,
waveIdx
;
if
(
seType
==
0
)
{
// Threadblock-level: all wavefronts in block work together
nWaves
=
BLOCKSIZE
/
warpSize
;
// Number of wavefronts within this threadblock
waveIdx
=
threadIdx
.
x
/
warpSize
;
// Index of this wavefront within the threadblock
}
else
{
// Warp-level: each warp works independently
nWaves
=
1
;
waveIdx
=
0
;
}
int32_t
const
tIdx
=
threadIdx
.
x
%
warpSize
;
// Thread index within wavefront
size_t
const
numPackedFloat
=
p
.
N
/
(
sizeof
(
PACKED_FLOAT
)
/
sizeof
(
float
));
...
...
@@ -3106,8 +3171,15 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
}
// Wait for all threads to finish
if
(
seType
==
1
)
{
// For warp-level, sync within warp only
__syncwarp
();
}
else
{
// For threadblock-level, sync all threads
__syncthreads
();
if
(
threadIdx
.
x
==
0
)
{
}
if
(
shouldRecordTiming
)
{
__threadfence_system
();
p
.
stopCycle
=
GetTimestamp
();
p
.
startCycle
=
startCycle
;
...
...
@@ -3137,8 +3209,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 7), \
GPU_KERNEL_DWORD_DECL(BLOCKSIZE, 8)}
// Table of all GPU Reduction kernel functions (templated blocksize / unroll / dword size)
typedef
void
(
*
GpuKernelFuncPtr
)(
SubExecParam
*
,
int
,
int
);
// Table of all GPU Reduction kernel functions (templated blocksize / unroll / dword size
/ temporal
)
typedef
void
(
*
GpuKernelFuncPtr
)(
SubExecParam
*
,
int
,
int
,
int
);
GpuKernelFuncPtr
GpuKernelTable
[
MAX_WAVEGROUPS
][
MAX_UNROLL
][
3
][
4
]
=
{
GPU_KERNEL_UNROLL_DECL
(
64
),
...
...
@@ -3161,6 +3233,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
#undef GPU_KERNEL_UNROLL_DECL
#undef GPU_KERNEL_DWORD_DECL
#undef GPU_KERNEL_TEMPORAL_DECL
#undef GPU_KERNEL_SE_TYPE_DECL
// Execute a single GPU Transfer (when using 1 stream per Transfer)
static
ErrResult
ExecuteGpuTransfer
(
int
const
iteration
,
...
...
@@ -3174,7 +3247,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
auto
cpuStart
=
std
::
chrono
::
high_resolution_clock
::
now
();
int
numSubExecs
=
rss
.
subExecParamCpu
.
size
();
dim3
const
gridSize
(
xccDim
,
numSubExecs
,
1
);
int
gridY
=
CalculateGridY
(
cfg
.
gfx
.
seType
,
cfg
.
gfx
.
blockSize
,
numSubExecs
);
dim3
const
gridSize
(
xccDim
,
gridY
,
1
);
dim3
const
blockSize
(
cfg
.
gfx
.
blockSize
,
1
);
int
wordSizeIdx
=
cfg
.
gfx
.
wordSize
==
1
?
0
:
...
...
@@ -3185,12 +3259,12 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
#if defined(__NVCC__)
if
(
startEvent
!=
NULL
)
ERR_CHECK
(
hipEventRecord
(
startEvent
,
stream
));
gpuKernel
<<<
gridSize
,
blockSize
,
0
,
stream
>>>
(
rss
.
subExecParamGpuPtr
,
cfg
.
gfx
.
waveOrder
,
cfg
.
general
.
numSubIterations
);
gpuKernel
<<<
gridSize
,
blockSize
,
0
,
stream
>>>
(
rss
.
subExecParamGpuPtr
,
cfg
.
gfx
.
seType
,
cfg
.
gfx
.
waveOrder
,
cfg
.
general
.
numSubIterations
);
if
(
stopEvent
!=
NULL
)
ERR_CHECK
(
hipEventRecord
(
stopEvent
,
stream
));
#else
hipExtLaunchKernelGGL
(
gpuKernel
,
gridSize
,
blockSize
,
0
,
stream
,
startEvent
,
stopEvent
,
0
,
rss
.
subExecParamGpuPtr
,
cfg
.
gfx
.
waveOrder
,
cfg
.
general
.
numSubIterations
);
0
,
rss
.
subExecParamGpuPtr
,
cfg
.
gfx
.
seType
,
cfg
.
gfx
.
waveOrder
,
cfg
.
general
.
numSubIterations
);
#endif
ERR_CHECK
(
hipStreamSynchronize
(
stream
));
...
...
@@ -3249,7 +3323,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
}
else
{
// Combine all the Transfers into a single kernel launch
int
numSubExecs
=
exeInfo
.
totalSubExecs
;
dim3
const
gridSize
(
xccDim
,
numSubExecs
,
1
);
int
gridY
=
CalculateGridY
(
cfg
.
gfx
.
seType
,
cfg
.
gfx
.
blockSize
,
numSubExecs
);
dim3
const
gridSize
(
xccDim
,
gridY
,
1
);
dim3
const
blockSize
(
cfg
.
gfx
.
blockSize
,
1
);
hipStream_t
stream
=
exeInfo
.
streams
[
0
];
...
...
@@ -3261,14 +3336,14 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
#if defined(__NVCC__)
if
(
cfg
.
gfx
.
useHipEvents
)
ERR_CHECK
(
hipEventRecord
(
exeInfo
.
startEvents
[
0
],
stream
));
gpuKernel
<<<
gridSize
,
blockSize
,
0
,
stream
>>>
(
exeInfo
.
subExecParamGpu
,
cfg
.
gfx
.
waveOrder
,
cfg
.
general
.
numSubIterations
);
gpuKernel
<<<
gridSize
,
blockSize
,
0
,
stream
>>>
(
exeInfo
.
subExecParamGpu
,
cfg
.
gfx
.
seType
,
cfg
.
gfx
.
waveOrder
,
cfg
.
general
.
numSubIterations
);
if
(
cfg
.
gfx
.
useHipEvents
)
ERR_CHECK
(
hipEventRecord
(
exeInfo
.
stopEvents
[
0
],
stream
));
#else
hipExtLaunchKernelGGL
(
gpuKernel
,
gridSize
,
blockSize
,
0
,
stream
,
cfg
.
gfx
.
useHipEvents
?
exeInfo
.
startEvents
[
0
]
:
NULL
,
cfg
.
gfx
.
useHipEvents
?
exeInfo
.
stopEvents
[
0
]
:
NULL
,
0
,
exeInfo
.
subExecParamGpu
,
cfg
.
gfx
.
waveOrder
,
cfg
.
general
.
numSubIterations
);
exeInfo
.
subExecParamGpu
,
cfg
.
gfx
.
seType
,
cfg
.
gfx
.
waveOrder
,
cfg
.
general
.
numSubIterations
);
#endif
ERR_CHECK
(
hipStreamSynchronize
(
stream
));
}
...
...
@@ -4026,6 +4101,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
#undef hipDeviceAttributeClockRate
#undef hipDeviceAttributeMaxSharedMemoryPerMultiprocessor
#undef hipDeviceAttributeMultiprocessorCount
#undef hipDeviceAttributeWarpSize
#undef hipErrorPeerAccessAlreadyEnabled
#undef hipFuncCachePreferShared
#undef hipMemcpyDefault
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment