Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
TransformerEngine
Commits
dbaa02d0
Unverified
Commit
dbaa02d0
authored
Dec 09, 2025
by
Przemyslaw Tredak
Committed by
GitHub
Dec 09, 2025
Browse files
Fix the sm120 compilation with CUDA 12 (#2482)
Signed-off-by:
Przemek Tredak
<
ptredak@nvidia.com
>
parent
e05f87e1
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
13 additions
and
13 deletions
+13
-13
transformer_engine/common/util/ptx.cuh
transformer_engine/common/util/ptx.cuh
+13
-13
No files found.
transformer_engine/common/util/ptx.cuh
View file @
dbaa02d0
...
@@ -867,19 +867,19 @@ __device__ __forceinline__ void fma_f32_bf16(float &out, uint16_t const &a, uint
...
@@ -867,19 +867,19 @@ __device__ __forceinline__ void fma_f32_bf16(float &out, uint16_t const &a, uint
}
}
__device__
__forceinline__
void
reduce_sync_max_abs_f32
(
float
&
out
,
float
const
&
in
)
{
__device__
__forceinline__
void
reduce_sync_max_abs_f32
(
float
&
out
,
float
const
&
in
)
{
#if ((__CUDA_ARCH_HAS_FEATURE__(SM100_ALL)) || (__CUDA_ARCH_HAS_FEATURE__(SM101_ALL)) || \
constexpr
bool
is_sm_100f
=
NVTE_CUDA_ARCH_MATCHES
(
ptx
::
FamilySpecific
<
100
>
);
(__CUDA_ARCH_HAS_FEATURE__(SM120_ALL)))
if
constexpr
(
is_sm_100f
)
{
asm
volatile
(
"redux.sync.max.abs.f32 %0, %1, 0xFFFFFFFF;"
:
"=f"
(
out
)
:
"f"
(
in
));
asm
volatile
(
"redux.sync.max.abs.f32 %0, %1, 0xFFFFFFFF;"
:
"=f"
(
out
)
:
"f"
(
in
));
#
else
}
else
{
asm
volatile
(
asm
volatile
(
"{
\n\t
"
"{
\n\t
"
".reg.b32 val;
\n
"
".reg.b32 val;
\n
"
"abs.f32 val, %1;
\n
"
"abs.f32 val, %1;
\n
"
"redux.sync.max.u32 %0, val, 0xFFFFFFFF;
\n
"
"redux.sync.max.u32 %0, val, 0xFFFFFFFF;
\n
"
"}
\n\t
"
"}
\n\t
"
:
"=r"
(
reinterpret_cast
<
uint32_t
&>
(
out
))
:
"=r"
(
reinterpret_cast
<
uint32_t
&>
(
out
))
:
"f"
(
in
));
:
"f"
(
in
));
#endif
}
}
}
__device__
__forceinline__
bf16
get_amax
(
bf16
a
,
bf16
b
)
{
__device__
__forceinline__
bf16
get_amax
(
bf16
a
,
bf16
b
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment