Commit 214228c1 authored by wenjh
Browse files

Don't compile ptx code


Signed-off-by: wenjh <wenjh@sugon.com>
parent 3a0747a9
......@@ -63,7 +63,11 @@ struct FamilySpecific {
template <int Arch, int ArchSpecific, int FamilySpecific, class T, class... U>
constexpr bool is_supported_arch() {
if constexpr (T::template compatible<Arch, ArchSpecific, FamilySpecific>()) {
#ifdef __HIP_PLATFORM_AMD__
return false;
#else
return true;
#endif
} else if constexpr (sizeof...(U) != 0) {
return is_supported_arch<Arch, ArchSpecific, FamilySpecific, U...>();
} else {
......@@ -71,6 +75,13 @@ constexpr bool is_supported_arch() {
}
}
#ifdef __HIP_PLATFORM_AMD__
#define __CUDA_ARCH_HAS_FEATURE__(FEATURE) \
((__CUDA_ARCH__ >= 100 && FEATURE == SM100_ALL) || \
(__CUDA_ARCH__ >= 101 && FEATURE == SM101_ALL) || \
(__CUDA_ARCH__ >= 120 && FEATURE == SM120_ALL))
#endif
#if CUDA_VERSION < 12090
#if __CUDA_ARCH_HAS_FEATURE__(SM90_ALL)
#define __CUDA_ARCH_SPECIFIC__ 900
......@@ -246,14 +257,6 @@ __device__ __forceinline__ void mbarrier_wait_parity(uint64_t *mbar, const uint3
constexpr uint32_t FP32_MANTISSA_BITS = 23;
constexpr uint32_t FP32_EXPONENT_BIAS = 127;
#ifdef __HIP_PLATFORM_AMD__
#define __CUDA_ARCH_HAS_FEATURE__(FEATURE) \
((__CUDA_ARCH__ >= 100 && FEATURE == SM100_ALL) || \
(__CUDA_ARCH__ >= 101 && FEATURE == SM101_ALL) || \
(__CUDA_ARCH__ >= 120 && FEATURE == SM120_ALL))
#endif
__device__ __forceinline__ float exp2f_rcp(e8m0_t biased_exp) {
return (biased_exp == 0) ? 1
: __int_as_float((254 - biased_exp)
......@@ -265,6 +268,9 @@ __device__ __forceinline__ float exp2f(e8m0_t biased_exp) {
}
__device__ __forceinline__ e8m0_t float_to_e8m0(float val) {
#ifdef __HIP_PLATFORM_AMD__
NVTE_DEVICE_ERROR("float_to_e8m0 is not supported on rocm platform.");
#else
constexpr bool is_blackwell = ARCH_BLACKWELL_FAMILY;
if constexpr (is_blackwell) {
uint16_t out;
......@@ -296,6 +302,7 @@ __device__ __forceinline__ e8m0_t float_to_e8m0(float val) {
}
return exponent;
}
#endif
}
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
......@@ -407,6 +414,8 @@ __device__ __forceinline__ void fence_proxy_async_shared_cta() {
#endif // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
}
#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
template <typename T>
struct alignas(2 * sizeof(T)) FPx2 {
T x;
......@@ -457,6 +466,8 @@ static_assert(sizeof(fp16x2) == 4);
static_assert(sizeof(fp8e4m3x2) == 2);
static_assert(sizeof(fp8e5m2x2) == 2);
#if CUDA_VERSION >= 12080
using fp4e2m1 = __nv_fp4_e2m1;
using fp4e2m1x2 = __nv_fp4x2_e2m1;
......@@ -651,6 +662,8 @@ __device__ __forceinline__ void abs_max_2x(fp16x2 &dst, const fp16x2 &p1, const
#endif // (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 890)
}
#endif
} // namespace ptx
namespace {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment