Commit 214228c1 authored by wenjh
Browse files

Don't compile ptx code


Signed-off-by: wenjh <wenjh@sugon.com>
parent 3a0747a9
......@@ -63,7 +63,11 @@ struct FamilySpecific {
template <int Arch, int ArchSpecific, int FamilySpecific, class T, class... U>
constexpr bool is_supported_arch() {
if constexpr (T::template compatible<Arch, ArchSpecific, FamilySpecific>()) {
#ifdef __HIP_PLATFORM_AMD__
return false;
#else
return true;
#endif
} else if constexpr (sizeof...(U) != 0) {
return is_supported_arch<Arch, ArchSpecific, FamilySpecific, U...>();
} else {
......@@ -71,6 +75,13 @@ constexpr bool is_supported_arch() {
}
}
#ifdef __HIP_PLATFORM_AMD__
#define __CUDA_ARCH_HAS_FEATURE__(FEATURE) \
((__CUDA_ARCH__ >= 100 && FEATURE == SM100_ALL) || \
(__CUDA_ARCH__ >= 101 && FEATURE == SM101_ALL) || \
(__CUDA_ARCH__ >= 120 && FEATURE == SM120_ALL))
#endif
#if CUDA_VERSION < 12090
#if __CUDA_ARCH_HAS_FEATURE__(SM90_ALL)
#define __CUDA_ARCH_SPECIFIC__ 900
......@@ -246,14 +257,6 @@ __device__ __forceinline__ void mbarrier_wait_parity(uint64_t *mbar, const uint3
constexpr uint32_t FP32_MANTISSA_BITS = 23;
constexpr uint32_t FP32_EXPONENT_BIAS = 127;
#ifdef __HIP_PLATFORM_AMD__
#define __CUDA_ARCH_HAS_FEATURE__(FEATURE) \
((__CUDA_ARCH__ >= 100 && FEATURE == SM100_ALL) || \
(__CUDA_ARCH__ >= 101 && FEATURE == SM101_ALL) || \
(__CUDA_ARCH__ >= 120 && FEATURE == SM120_ALL))
#endif
__device__ __forceinline__ float exp2f_rcp(e8m0_t biased_exp) {
return (biased_exp == 0) ? 1
: __int_as_float((254 - biased_exp)
......@@ -265,6 +268,9 @@ __device__ __forceinline__ float exp2f(e8m0_t biased_exp) {
}
__device__ __forceinline__ e8m0_t float_to_e8m0(float val) {
#ifdef __HIP_PLATFORM_AMD__
NVTE_DEVICE_ERROR("float_to_e8m0 is not supported on rocm platform.");
#else
constexpr bool is_blackwell = ARCH_BLACKWELL_FAMILY;
if constexpr (is_blackwell) {
uint16_t out;
......@@ -296,6 +302,7 @@ __device__ __forceinline__ e8m0_t float_to_e8m0(float val) {
}
return exponent;
}
#endif
}
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
......@@ -407,6 +414,8 @@ __device__ __forceinline__ void fence_proxy_async_shared_cta() {
#endif // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
}
#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
template <typename T>
struct alignas(2 * sizeof(T)) FPx2 {
T x;
......@@ -457,6 +466,8 @@ static_assert(sizeof(fp16x2) == 4);
static_assert(sizeof(fp8e4m3x2) == 2);
static_assert(sizeof(fp8e5m2x2) == 2);
#if CUDA_VERSION >= 12080
using fp4e2m1 = __nv_fp4_e2m1;
using fp4e2m1x2 = __nv_fp4x2_e2m1;
......@@ -651,6 +662,8 @@ __device__ __forceinline__ void abs_max_2x(fp16x2 &dst, const fp16x2 &p1, const
#endif // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
}
#endif
} // namespace ptx
namespace {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment