Add native conversions

1e3eb1c6 · Rostyslav Geyyer · 8f84a012 · 1e3eb1c6 · 1e3eb1c6
Commit 1e3eb1c6 authored Sep 11, 2023 by Rostyslav Geyyer
Hide whitespace changes
Inline Side-by-side

Showing with 56 additions and 4 deletions

include/ck/utility/f8_utils.hpp include/ck/utility/f8_utils.hpp +3 -0

include/ck/utility/type_convert.hpp include/ck/utility/type_convert.hpp +53 -4

No files found.
--- a/include/ck/utility/f8_utils.hpp
+++ b/include/ck/utility/f8_utils.hpp
@@ -5,6 +5,8 @@

 #include "ck/utility/data_type.hpp"

+// these conversions are disabled if native conversions available
+#if !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__)
 namespace ck {

 // fp8 rounding modes
@@ -248,3 +250,4 @@ __host__ __device__ T cast_from_f8(f8_t x)
 }

 } // namespace ck::utils
+#endif
--- a/include/ck/utility/type_convert.hpp
+++ b/include/ck/utility/type_convert.hpp
@@ -84,40 +84,72 @@ inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, int8_t>(int8_
 template <>
 inline __host__ __device__ f8_t type_convert<f8_t, float>(float x)
 {
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+    union
+    {
+        float fval;
+        uint32_t i32val;
+        uint8_t i8val[4]; // not endian independent
+    } val;
+    val.fval      = x;
+    uint32_t ival = 0;
+    ival       = __builtin_amdgcn_cvt_pk_fp8_f32(val.fval, val.fval, ival, false); // false -> WORD0
+    val.i32val = ival;
+    return val.i8val[0];
+#else
    constexpr bool negative_zero_nan = true;
    constexpr bool clip              = true;
    constexpr f8_rounding_mode rm    = f8_rounding_mode::standard;
    constexpr uint32_t rng           = 0;
    return utils::cast_to_f8<float, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
        x, rng);
+#endif
 }

 // convert fp8 to fp32
 template <>
 inline __host__ __device__ float type_convert<float, f8_t>(f8_t x)
 {
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+    float fval;
+    uint32_t i32val = static_cast<uint32_t>(x);
+    fval            = __builtin_amdgcn_cvt_f32_fp8(i32val, 0);
+    // asm volatile("v_cvt_f32_fp8 %0, %1 src0_sel:BYTE_0" : "=v"(fval) : "v"(i32val));
+    return fval;
+#else
    constexpr bool negative_zero_nan = true;
    return utils::cast_from_f8<float, negative_zero_nan>(x);
+#endif
 }

 // convert fp16 to fp8
 template <>
 inline __host__ __device__ f8_t type_convert<f8_t, half_t>(half_t x)
 {
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+    // convert to float and use native conversion
+    return type_convert<f8_t>(type_convert<float>(x));
+#else
    constexpr bool negative_zero_nan = true;
    constexpr bool clip              = true;
    constexpr f8_rounding_mode rm    = f8_rounding_mode::standard;
    constexpr uint32_t rng           = 0;
    return utils::cast_to_f8<half_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
        x, rng);
+#endif
 }

 // convert fp8 to fp16
 template <>
 inline __host__ __device__ half_t type_convert<half_t, f8_t>(f8_t x)
 {
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+    // use native conversion to float and convert to fp16
+    return type_convert<half_t>(type_convert<float>(x));
+#else
    constexpr bool negative_zero_nan = true;
    return utils::cast_from_f8<half_t, negative_zero_nan>(x);
+#endif
 }

 // Declare a template function for bf16 conversion using RTN
@@ -185,28 +217,45 @@ __host__ __device__ constexpr Y f8_convert_sr(X x);
 template <>
 inline __host__ __device__ f8_t f8_convert_sr<f8_t, float>(float x)
 {
+    constexpr int seed = 42;
+    uint32_t rng       = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+    union
+    {
+        float fval;
+        uint32_t i32val;
+        uint8_t i8val[4]; // not endian independent
+    } val;
+    val.fval      = x;
+    uint32_t ival = 0;
+    ival          = __builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, ival, 0); // 0 pos
+    val.i32val    = ival;
+    return val.i8val[0]; // little endian
+#else
    constexpr bool negative_zero_nan = true;
    constexpr bool clip              = true;
    constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
-    constexpr int seed               = 42;
-    // as thread id is not available on host, use 0 for prn generation
-    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
    return utils::cast_to_f8<float, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
        x, rng);
+#endif
 }

 // convert fp16 to fp8 with stochastic rounding
 template <>
 inline __host__ __device__ f8_t f8_convert_sr<f8_t, half_t>(half_t x)
 {
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+    // convert to float and use native conversion
+    return f8_convert_sr<f8_t>(type_convert<float>(x));
+#else
    constexpr bool negative_zero_nan = true;
    constexpr bool clip              = true;
    constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
    constexpr int seed               = 42;
-    // as thread id is not available on host, use 0 for prn generation
    uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<uintptr_t>(&x), x);
    return utils::cast_to_f8<half_t, negative_zero_nan, clip, (rm == f8_rounding_mode::stochastic)>(
        x, rng);
+#endif
 }

 } // namespace ck