Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
TransformerEngine
Commits
abec28e8
Commit
abec28e8
authored
Jan 09, 2026
by
wenjh
Browse files
Merge branch 'develop_v2.10' into release_v2.10
parents
8f468f42
e6f2caf5
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
18 additions
and
1 deletion
+18
-1
tests/cpp/operator/test_normalization.cu
tests/cpp/operator/test_normalization.cu
+7
-0
transformer_engine/common/CMakeLists.txt
transformer_engine/common/CMakeLists.txt
+1
-1
transformer_engine/common/swizzle/swizzle.cu
transformer_engine/common/swizzle/swizzle.cu
+10
-0
No files found.
tests/cpp/operator/test_normalization.cu
View file @
abec28e8
...
...
@@ -43,9 +43,16 @@ void performTest(const size_t N, const size_t H, const bool zero_centered_gamma,
<<
"in fused norm backward+add"
;
}
#ifdef __HIP_PLATFORM_AMD__
if
(
use_cudnn
)
{
GTEST_SKIP
()
<<
"cuDNN normalizations not supported on pre-Hopper GPUs yet!"
;
return
;
}
#else
if
(
getDeviceComputeCapability
()
<
hopperComputeCapability
&&
use_cudnn
)
{
GTEST_SKIP
()
<<
"cuDNN normalizations not supported on pre-Hopper GPUs yet!"
;
}
#endif
using
WeightType
=
InputType
;
DType
itype
=
TypeInfo
<
InputType
>::
dtype
;
...
...
transformer_engine/common/CMakeLists.txt
View file @
abec28e8
...
...
@@ -519,7 +519,7 @@ else()
string_code_transpose_rtc_cast_transpose_cu
)
make_string_header_from_file
(
transpose/rtc/transpose.hip
string_code_transpose_rtc_transpose_cu
)
make_string_header_from_file
(
transpose/rtc/swap_first_dims.
cu
make_string_header_from_file
(
transpose/rtc/swap_first_dims.
hip
string_code_transpose_rtc_swap_first_dims_cu
)
endif
()
...
...
transformer_engine/common/swizzle/swizzle.cu
View file @
abec28e8
...
...
@@ -21,6 +21,15 @@ namespace {
constexpr
int
MXFP8_BLOCK_SIZE
=
32
;
constexpr
int
NVFP4_BLOCK_SIZE
=
16
;
#ifdef __HIP_PLATFORM_AMD__
constexpr
int
TB_DIM
=
32
;
constexpr
int
NEW_SF_TILE_DIM_K
=
16
;
constexpr
int
N_SF_PER_TD_PER_TILE
=
4
;
// output is in ~K-major interleaved blocks
constexpr
int
NEW_SF_TILE_DIM_K_I32
=
NEW_SF_TILE_DIM_K
/
4
;
constexpr
int
NEW_SF_TILE_DIM_M_I32
=
32
;
#else
constexpr
__device__
__host__
int
TB_DIM
=
32
;
constexpr
__device__
__host__
int
NEW_SF_TILE_DIM_K
=
16
;
constexpr
__device__
__host__
int
N_SF_PER_TD_PER_TILE
=
4
;
...
...
@@ -28,6 +37,7 @@ constexpr __device__ __host__ int N_SF_PER_TD_PER_TILE = 4;
// output is in ~K-major interleaved blocks
constexpr
__device__
__host__
int
NEW_SF_TILE_DIM_K_I32
=
NEW_SF_TILE_DIM_K
/
4
;
constexpr
__device__
__host__
int
NEW_SF_TILE_DIM_M_I32
=
32
;
#endif
template
<
typename
LType
>
__device__
inline
void
regs_shuffle_with_bit_shifts
(
LType
*
regs_vec
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment