Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
TransformerEngine
Commits
214228c1
Commit
214228c1
authored
Dec 03, 2025
by
wenjh
Browse files
Don't compile ptx code
Signed-off-by:
wenjh
<
wenjh@sugon.com
>
parent
3a0747a9
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
21 additions
and
8 deletions
+21
-8
transformer_engine/common/util/ptx.cuh
transformer_engine/common/util/ptx.cuh
+21
-8
No files found.
transformer_engine/common/util/ptx.cuh
View file @
214228c1
...
...
@@ -63,7 +63,11 @@ struct FamilySpecific {
template
<
int
Arch
,
int
ArchSpecific
,
int
FamilySpecific
,
class
T
,
class
...
U
>
constexpr
bool
is_supported_arch
()
{
if
constexpr
(
T
::
template
compatible
<
Arch
,
ArchSpecific
,
FamilySpecific
>())
{
#ifdef __HIP_PLATFORM_AMD__
return
false
;
#else
return
true
;
#endif
}
else
if
constexpr
(
sizeof
...(
U
)
!=
0
)
{
return
is_supported_arch
<
Arch
,
ArchSpecific
,
FamilySpecific
,
U
...
>
();
}
else
{
...
...
@@ -71,6 +75,13 @@ constexpr bool is_supported_arch() {
}
}
#ifdef __HIP_PLATFORM_AMD__
// HIP/ROCm build: nvcc's __CUDA_ARCH_HAS_FEATURE__ is not available, so
// provide a fallback that approximates the family-specific feature checks
// (SM100_ALL / SM101_ALL / SM120_ALL) from the architecture number alone.
// NOTE(review): nvcc encodes sm_100 as __CUDA_ARCH__ == 1000 (major*100 +
// minor*10); confirm the 100/101/120 thresholds match whatever the HIP
// toolchain defines __CUDA_ARCH__ to on this platform — TODO verify.
// NOTE(review): an identical definition of this macro appears again later
// in this file; identical redefinition is benign per the standard, but one
// copy could be removed.
#define __CUDA_ARCH_HAS_FEATURE__(FEATURE) \
((__CUDA_ARCH__ >= 100 && FEATURE == SM100_ALL) || \
(__CUDA_ARCH__ >= 101 && FEATURE == SM101_ALL) || \
(__CUDA_ARCH__ >= 120 && FEATURE == SM120_ALL))
#endif
#if CUDA_VERSION < 12090
#if __CUDA_ARCH_HAS_FEATURE__(SM90_ALL)
#define __CUDA_ARCH_SPECIFIC__ 900
...
...
@@ -246,14 +257,6 @@ __device__ __forceinline__ void mbarrier_wait_parity(uint64_t *mbar, const uint3
// IEEE-754 binary32 (float) bit-layout constants used by the e8m0
// exponent-manipulation helpers below.
constexpr uint32_t FP32_MANTISSA_BITS = 23;   // explicitly stored mantissa bits
constexpr uint32_t FP32_EXPONENT_BIAS = 127;  // bias of the stored exponent field
#ifdef __HIP_PLATFORM_AMD__
// HIP/ROCm fallback for nvcc's __CUDA_ARCH_HAS_FEATURE__ (see the earlier,
// identical definition in this file — the duplication is benign because the
// replacement lists are identical, but one copy could be removed).
// NOTE(review): confirm the 100/101/120 thresholds match the HIP
// toolchain's encoding of __CUDA_ARCH__ — nvcc would use 1000/1010/1200.
#define __CUDA_ARCH_HAS_FEATURE__(FEATURE) \
((__CUDA_ARCH__ >= 100 && FEATURE == SM100_ALL) || \
(__CUDA_ARCH__ >= 101 && FEATURE == SM101_ALL) || \
(__CUDA_ARCH__ >= 120 && FEATURE == SM120_ALL))
#endif
__device__
__forceinline__
float
exp2f_rcp
(
e8m0_t
biased_exp
)
{
return
(
biased_exp
==
0
)
?
1
:
__int_as_float
((
254
-
biased_exp
)
...
...
@@ -265,6 +268,9 @@ __device__ __forceinline__ float exp2f(e8m0_t biased_exp) {
}
__device__
__forceinline__
e8m0_t
float_to_e8m0
(
float
val
)
{
#ifdef __HIP_PLATFORM_AMD__
NVTE_DEVICE_ERROR
(
"float_to_e8m0 is not supported on rocm platform."
);
#else
constexpr
bool
is_blackwell
=
ARCH_BLACKWELL_FAMILY
;
if
constexpr
(
is_blackwell
)
{
uint16_t
out
;
...
...
@@ -296,6 +302,7 @@ __device__ __forceinline__ e8m0_t float_to_e8m0(float val) {
}
return
exponent
;
}
#endif
}
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
...
...
@@ -407,6 +414,8 @@ __device__ __forceinline__ void fence_proxy_async_shared_cta() {
#endif // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
}
#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
template
<
typename
T
>
struct
alignas
(
2
*
sizeof
(
T
))
FPx2
{
T
x
;
...
...
@@ -457,6 +466,8 @@ static_assert(sizeof(fp16x2) == 4);
// Packed two-lane FP8 vector types must be exactly 2 bytes (1 byte per
// lane); the diagnostic message pinpoints the offending type if a
// toolchain ever pads or resizes them.
static_assert(sizeof(fp8e4m3x2) == 2, "fp8e4m3x2 must pack two 1-byte lanes");
static_assert(sizeof(fp8e5m2x2) == 2, "fp8e5m2x2 must pack two 1-byte lanes");
#if CUDA_VERSION >= 12080
using
fp4e2m1
=
__nv_fp4_e2m1
;
using
fp4e2m1x2
=
__nv_fp4x2_e2m1
;
...
...
@@ -651,6 +662,8 @@ __device__ __forceinline__ void abs_max_2x(fp16x2 &dst, const fp16x2 &p1, const
#endif // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
}
#endif
}
// namespace ptx
namespace
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment