FP8 GPU implementation (#2455)

eafd55de · Umang Yadav · GitHub · 785ff7d7 · eafd55de · eafd55de
Unverified Commit eafd55de authored Dec 01, 2023 by Umang Yadav Committed by GitHub Dec 01, 2023
20 changed files
--- a/src/include/migraphx/bit_cast.hpp
+++ b/src/include/migraphx/bit_cast.hpp
@@ -21,10 +21,13 @@
 * ************************************************************************ */
 #ifndef MIGRAPHX_GUARD_RTGLIB_BITCAST_HPP
 #define MIGRAPHX_GUARD_RTGLIB_BITCAST_HPP
+#include <type_traits>
 #if defined(__GNUC__) && !defined(__clang__)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #endif
+#include <migraphx/requires.hpp>
 #include <migraphx/config.hpp>
 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
@@ -32,7 +35,10 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-template <typename To, typename From>
+template <typename To,
+          typename From,
+          MIGRAPHX_REQUIRES(std::is_trivially_copyable<To>{} and
+                            std::is_trivially_copyable<From>{})>
 inline constexpr To bit_cast(From fr) noexcept
 {
    static_assert(sizeof(To) == sizeof(From));

--- a/src/targets/cpu/dnnl.cpp
+++ b/src/targets/cpu/dnnl.cpp
@@ -68,6 +68,7 @@ dnnl::memory::data_type to_dnnl_memory_data_type(shape::type_t t)
    case st::int32_type: return dt::s32;
    case st::int8_type: return dt::s8;
    case st::uint8_type: return dt::u8;
+    case st::fp8e4m3fnuz_type: MIGRAPHX_THROW("fp8e4m3fnuz unsupported in DNNL");
    default: MIGRAPHX_THROW("Unsupported data type");
    }
 }

--- a/src/targets/cpu/lowering.cpp
+++ b/src/targets/cpu/lowering.cpp
@@ -340,7 +340,6 @@ struct cpu_apply
                              {"reduce_min", "reduction_min"},
                              {"reduce_sum", "reduction_sum"},
                          });
        extend_op("concat", "dnnl::concat");
        extend_op("contiguous", "dnnl::reorder");
        extend_op("convolution", "dnnl::convolution");
@@ -376,6 +375,12 @@ struct cpu_apply
        // Apply these operators first so the inputs can be const folded
        for(auto it : iterator_for(*modl))
        {
+            // skip lowering if input has fp8 as one of the inputs since oneDNN doesn't have fp8
+            // supported yet.
+            if(std::any_of(it->inputs().begin(), it->inputs().end(), [](const auto& i) {
+                   return i->get_shape().type() == migraphx::shape::fp8e4m3fnuz_type;
+               }))
+                continue;
            if(it->name() == "pow")
            {
                apply_pow(it);
@@ -383,6 +388,12 @@ struct cpu_apply
        }
        for(auto it : iterator_for(*modl))
        {
+            // skip lowering if input has fp8 as one of the inputs since oneDNN doesn't have fp8
+            // supported yet.
+            if(std::any_of(it->inputs().begin(), it->inputs().end(), [](const auto& i) {
+                   return i->get_shape().type() == migraphx::shape::fp8e4m3fnuz_type;
+               }))
+                continue;
            if(it->name() == "pooling")
            {
                apply_pooling(it);

--- a/src/targets/gpu/compile_gen.cpp
+++ b/src/targets/gpu/compile_gen.cpp
@@ -54,6 +54,11 @@ vectorize vectorize::elements(std::size_t axis,
                              const std::vector<shape>& inputs,
                              const std::vector<std::size_t>& sizes)
 {
+    // disable vectorization for fp8 types
+    if(std::any_of(inputs.begin(), inputs.end(), [&](auto ishape) {
+           return ishape.type() == migraphx::shape::fp8e4m3fnuz_type;
+       }))
+        return {1, axis};
    if(std::all_of(
           inputs.begin(), inputs.end(), [&](const auto& s) { return s.lens()[axis] == 1; }))
        return {1, axis};
@@ -86,6 +91,11 @@ vectorize vectorize::elements(std::size_t axis,
 vectorize vectorize::elements(context& ctx, std::size_t axis, const std::vector<shape>& inputs)
 {
+    // disable vectorization for fp8 types
+    if(std::any_of(inputs.begin(), inputs.end(), [&](auto ishape) {
+           return ishape.type() == migraphx::shape::fp8e4m3fnuz_type;
+       }))
+        return {1, axis};
    if(inputs.empty())
        return {1, axis};
    std::size_t n = std::max_element(inputs.begin(),

--- a/src/targets/gpu/kernels/include/migraphx/kernels/bit_cast.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/bit_cast.hpp
+/* ************************************************************************
+ * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+ * ies of the Software, and to permit persons to whom the Software is furnished
+ * to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+ * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+ * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ************************************************************************ */
+#ifndef MIGRAPHX_GUARD_KERNELS_BITCAST_HPP
+#define MIGRAPHX_GUARD_KERNELS_BITCAST_HPP
+#include <migraphx/kernels/type_traits.hpp>
+namespace migraphx {
+// Reinterprets the object representation of `fr` as a value of type `To`.
+// Only participates in overload resolution when both `To` and `From` are trivially copyable,
+// and fails to compile when the two types differ in size.
+template <typename To,
+          typename From,
+          MIGRAPHX_REQUIRES(is_trivially_copyable<To>{} and is_trivially_copyable<From>{})>
+inline constexpr To bit_cast(From fr) noexcept
+{
+    static_assert(sizeof(To) == sizeof(From));
+    return __builtin_bit_cast(To, fr);
+}
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_KERNELS_BITCAST_HPP
--- a/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp
--- a/src/targets/gpu/kernels/include/migraphx/kernels/float8_impl.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/float8_impl.hpp
+/* ************************************************************************
+ * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+ * ies of the Software, and to permit persons to whom the Software is furnished
+ * to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+ * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+ * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ************************************************************************ */
+#ifndef MIGRAPHX_GUARD_KERNELS_FP8_IMPL_HPP
+#define MIGRAPHX_GUARD_KERNELS_FP8_IMPL_HPP
+#include <migraphx/kernels/bit_cast.hpp>
+#include <migraphx/kernels/type_traits.hpp>
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-identifier"
+#endif
+namespace migraphx {
+namespace fp8 {
+namespace impl {
+// NOLINTBEGIN
+// Converts `f_x` into the bit pattern of an 8-bit float with `Wm` mantissa bits and `We`
+// exponent bits (Wm + We must be 7).
+//
+// Template parameters:
+//   Wm, We          - mantissa / exponent widths of the target format.
+//   T               - source type; a 4-byte T is decoded as fp32, a 2-byte T as fp16 (half is
+//                     noted below as not supported for now).
+//   NegativeZeroNan - FNUZ encoding: 0x80 is the only NaN, there is no infinity and negative
+//                     zero is encoded as positive zero.
+//   Clip            - on overflow return the largest finite value of the same sign instead of
+//                     NaN/infinity.
+// Parameters:
+//   f_x   - value to convert.
+//   stoch - when true, round by adding `rng` to the dropped mantissa bits instead of using
+//           round-to-nearest-even.
+//   rng   - random bits used only when `stoch` is true.
+// Returns the encoded 8-bit value.
+template <int Wm, int We, typename T, bool NegativeZeroNan, bool Clip>
+__device__ constexpr uint8_t cast_to_f8(T f_x, bool stoch = false, uint32_t rng = 0)
+{
+    constexpr bool is_float = true;
+    // half is not supported for now
+    constexpr bool is_half = false;
+    static_assert(Wm + We == 7, "Wm+We==7");
+    static_assert(is_float or is_half, "Only float can be cast to f8");
+    const uint32_t mfmt = (sizeof(T) == 4) ? 23 : 10;
+    typename migraphx::conditional_t<sizeof(T) == 2, uint16_t, uint32_t> x;
+    if constexpr(sizeof(T) == 4)
+        x = migraphx::bit_cast<uint32_t>(f_x);
+    else
+        x = migraphx::bit_cast<uint16_t>(f_x);
+    uint32_t head     = 0;
+    uint32_t mantissa = 0;
+    int exponent      = 0;
+    uint32_t bias     = 0;
+    uint32_t sign     = 0;
+    if constexpr(sizeof(T) == 4)
+    {
+        head     = x & 0xFF800000;
+        mantissa = x & 0x7FFFFF;
+        exponent = (head >> 23) & 0xFF;
+        sign     = head >> 31;
+        bias     = 127;
+    }
+    else
+    {
+        head     = x & 0xFC00;
+        mantissa = x & 0x3FF;
+        exponent = (head >> 10) & 0x1F;
+        sign     = head >> 15;
+        bias     = 15;
+    }
+    uint32_t signed_inf      = (sign << 7) + (((1 << We) - 1) << Wm);
+    uint32_t signed_all_ones = (sign << 7) + ((((1 << We) - 1) << Wm) + ((1 << Wm) - 1));
+    // Calculate maximum signed value FLT_MAX, FLT_MIN
+    uint32_t signed_max = signed_all_ones;
+    if(not NegativeZeroNan)
+        signed_max = (Wm == 2) ? (signed_max - 4) : (signed_max - 1);
+    // Deal with inf and NaNs
+    if(NegativeZeroNan) // For the FNUZ cases, it is simple just return NaNs
+    {
+        if((sizeof(T) == 4 and ((x & 0x7F800000) == 0x7F800000)) or
+           (sizeof(T) == 2 and ((x & 0x7C00) == 0x7C00)))
+            return 0x80;
+    }
+    else
+    {
+        // calculate most common NaN mantissa for FP8, which is all Ones in binary
+        uint32_t nan_mantissa = 1;
+        for(auto i = 1; i < Wm; ++i)
+        {
+            nan_mantissa |= (nan_mantissa << 1);
+        }
+        if((sizeof(T) == 4 and ((x & 0x7F800000) == 0x7F800000)) or
+           (sizeof(T) == 2 and ((x & 0x7C00) == 0x7C00)))
+        {
+            // infinity
+            if(mantissa == 0)
+            {
+                if(sign == 0)
+                    return (Wm == 2) ? 0x7B : 0x7E;
+                else
+                    return (Wm == 2) ? 0xFB : 0xFE;
+            }
+            else // NaNs
+                return signed_inf + nan_mantissa;
+        }
+    }
+    // handle positive zero
+    if(x == 0)
+        return 0;
+    // handle negative zero
+    else if((sizeof(T) == 4 and x == 0x80000000) or (sizeof(T) == 2 and x == 0x8000))
+    {
+        return NegativeZeroNan ? 0 : 0x80; // For FNUZ types neg zero is just positive zero
+    }
+    /* First need to check if it is normal or denorm as there is a difference of implicit 1
+    Then need to adjust the exponent to align with the F8 exponent, in the meanwhile, shift
+    The mantissa. Then for stochastic rounding, add rng to mantissa and truncate. And for
+    RNE, no need to add rng. Then probably need to check whether there is carry and adjust
+    exponent and mantissa again*/
+    // For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent bits
+    const int f8_bias                  = (1 << (We - 1u)) - 1 + (NegativeZeroNan ? 1 : 0);
+    const int f8_denormal_act_exponent = 1 - f8_bias; // actual exponent of f8 denormal
+    /* act_exponent is the actual exponent of fp32/fp16 (after subtracting bias)
+    f8_exponent is the converted f8 exponent with bias encoding
+    exponent_diff is the diff between fp32/fp16 exponent and f8 exponent,
+    the difference needs to be adjusted and mantissa shifted*/
+    int act_exponent  = 0;
+    int f8_exponent   = 0;
+    int exponent_diff = 0;
+    if(exponent == 0 and mantissa != 0)
+    { // fp32/fp16 is in denormal.
+        /* fp32 denormal is below 2^-127 so it is usually not a concern here, we mostly concern fp16
+        here. In this case, f8 is usually in denormal. But there could be exceptions. fp16 denormal
+        has exponent bias 15 while bf8 with FNUZ has exponent bias 16. It means that there are some
+        numbers in fp16 denormal but they are bf8 (FNUZ) normals - smallest bf8 (FNUZ) normal is
+        2^-15. fp16 numbers where exponent==0 (actual exponent -14) and highest bit of mantissa is 1
+        are bf8 (FNUZ) normal. In this case, the fp16 mantissa should be shift left by 1  */
+        act_exponent  = 1 - bias;
+        exponent_diff = f8_denormal_act_exponent -
+                        act_exponent; // actual exponent is exponent-bias+1 as it is denormal
+    }
+    else
+    { // fp32/fp16 is normal with implicit 1
+        act_exponent = exponent - bias;
+        if(act_exponent <= f8_denormal_act_exponent)
+        {
+            /* This is the case where fp32/fp16 is normal but it is in f8 denormal range.
+            For example fp8 FNUZ mode, denormal exponent is -7, but if the fp32/fp16
+            actual exponent is -7, it is actually larger due to the implicit 1,
+            Therefore it needs to be adjust to -6 and mantissa shift right by 1.
+            So for fp32/fp16, exponent -8 is the cut point to convert to fp8 FNUZ */
+            exponent_diff = f8_denormal_act_exponent - act_exponent;
+        }
+        else
+        {          // both fp32/fp16 and f8 are in normal range
+            exponent_diff =
+                0; // exponent_diff=0 does not mean there is no difference for this case,
+            // act_exponent could be larger. Just that it does not need shift mantissa
+        }
+        mantissa += (1 << mfmt); // Add the implicit 1 into mantissa
+    }
+    // need to know whether the number is right in the middle of two adjacent fp8 numbers. use  max
+    // value of 31 to avoid undefined behaviour: for very small inputs exponent_diff can exceed
+    // 100, and shifting a 32-bit value by 32 or more bits is undefined.
+    const uint32_t max_shift     = 31;
+    const uint32_t midpoint_bits = mfmt - Wm + exponent_diff;
+    const uint32_t mask_shift    = (midpoint_bits < max_shift) ? midpoint_bits : max_shift;
+    const uint32_t half_shift = ((midpoint_bits - 1) < max_shift) ? (midpoint_bits - 1) : max_shift;
+    bool midpoint = (mantissa & ((1u << mask_shift) - 1)) == (1u << half_shift);
+    /* This part is a bit tricky. The judgment of whether it is a tie needs to be done before we
+    shift right as shift right could rip off some residual part and make something not midpoint look
+    like midpoint. For example, the fp16 number 0x1002 (0 00100 0000000010), it is larger than
+    midpoint, but after shift right by 4 bits, it would look like midpoint.
+    */
+    if(exponent_diff > 0)
+        // mantissa holds fewer than 31 significant bits, so a clamped shift of 31 already clears
+        // it, which is the result any larger shift would produce
+        mantissa >>= (exponent_diff < 31) ? exponent_diff : 31;
+    else if(exponent_diff == -1)
+        mantissa <<= -exponent_diff;
+    bool implicit_one = mantissa & (1 << mfmt);
+    // if there is no implicit 1, it  means the f8 is denormal and need to adjust to denorm exponent
+    f8_exponent =
+        (act_exponent + exponent_diff) /*actual f8 exponent*/ + f8_bias - (implicit_one ? 0 : 1);
+    // Now we have the exponent and mantissa adjusted
+    uint32_t drop_mask = (1 << (mfmt - Wm)) - 1;
+    bool odd =
+        mantissa & (1 << (mfmt - Wm)); // if the least significant bit that is not truncated is 1
+    /*
+    This part is doing rounding by adding mantissa part that is going to get dropped.
+    e.g. if the dropped part for less than 0.5 than it would round down.
+    if the dropped part is more than 0.5 then it would round up by rolling carry to LSB of retained
+    mantissa.
+    For the mid point when bit pattern is like this for Odd: `xy1:10000000` for Odd and
+    `xy0:10000000` for the Even.  where `:` is delimiter for dropped v/s retained part.
+    For the odd case :
+    this will add xy1:10000000 + 000:10000000 which would roll over carry to LSB of retained
+    part making it RNE.
+    For the even case : this will add xy0:10000000 + 000:01111111 which would
+    round down and keep number Even
+    */
+    mantissa += (stoch ? rng : (midpoint ? (odd ? mantissa : mantissa - 1) : mantissa)) & drop_mask;
+    // Now we deal with overflow
+    if(f8_exponent == 0 and ((1 << mfmt) & mantissa))
+    {
+        f8_exponent = 1; // denormal overflow to become normal, promote exponent
+    }
+    else if((1 << (mfmt + 1)) & mantissa)
+    {
+        mantissa >>= 1;
+        f8_exponent++;
+    }
+    mantissa >>= (mfmt - Wm);
+    // above range: quantize to maximum possible float of the same sign
+    // for e5m2 case, max_exp is 14, since exp = 15 is reserved for Infs and Nans
+    const int max_exp = (1 << We) - ((NegativeZeroNan or Wm == 3) ? 1 : 2);
+    if(f8_exponent > max_exp)
+    {
+        if(Clip)
+            return signed_max;
+        else
+        {
+            // https://onnx.ai/onnx/technical/float8.html#cast
+            if(NegativeZeroNan)
+                return 0x80;
+            else
+                return (Wm == 2) ? signed_inf : signed_all_ones;
+        }
+    }
+    if(f8_exponent == 0 and mantissa == 0)
+        return NegativeZeroNan ? 0 : (sign << 7);
+    mantissa &= (1 << Wm) - 1;
+    return (sign << 7) | (f8_exponent << Wm) | mantissa;
+}
+// NOLINTEND
+// Decodes the 8-bit float bit pattern `x` (with `Wm` mantissa bits and `We` exponent bits)
+// into a value of type `T`.
+//
+// Special encodings handled before the general path:
+//   - x == 0 returns 0.
+//   - NegativeZeroNan (FNUZ): 0x80 decodes to NaN.
+//   - otherwise: 0x80 decodes to negative zero; for Wm == 2 an all-ones exponent decodes to
+//     +/-infinity (zero mantissa) or NaN; for Wm == 3 the patterns 0x7F and 0xFF decode to NaN.
+// All remaining patterns are rebuilt as sign | exponent | mantissa in the layout of `T` and
+// bit-cast to `T`.
+template <int Wm, int We, typename T, bool NegativeZeroNan>
+__device__ constexpr T cast_from_f8(uint8_t x)
+{
+    // half is not supported for now
+    constexpr bool is_half  = false;
+    constexpr bool is_float = true;
+    static_assert(is_float or is_half, "Only float are supported");
+    // exponent / mantissa widths of the output format
+    constexpr int weo = is_half ? 5 : 8;
+    constexpr int wmo = is_half ? 10 : (is_float ? 23 : 7);
+    // NOLINTNEXTLINE
+    T f_inf, f_neg_inf, f_nan, f_neg0;
+    if constexpr(is_float)
+    {
+        // fp32 bit patterns for the special values returned below
+        const uint32_t if_inf     = 0x7F800000;
+        const uint32_t if_neg_inf = 0xFF800000;
+        const uint32_t if_nan     = 0x7F800001;
+        const uint32_t if_neg0    = 0x80000000;
+        f_inf                     = migraphx::bit_cast<float>(if_inf);
+        f_neg_inf                 = migraphx::bit_cast<float>(if_neg_inf);
+        f_nan                     = migraphx::bit_cast<float>(if_nan);
+        f_neg0                    = migraphx::bit_cast<float>(if_neg0);
+    }
+    if(x == 0)
+        return 0;
+    // split the input into its sign, mantissa and biased-exponent fields
+    uint32_t sign     = x >> 7;              // NOLINT
+    uint32_t mantissa = x & ((1 << Wm) - 1); // NOLINT
+    int exponent      = (x & 0x7F) >> Wm;    // NOLINT
+    if(NegativeZeroNan)
+    {
+        if(x == 0x80)
+            return f_nan;
+    }
+    else
+    {
+        if(x == 0x80)
+            return f_neg0;
+        if(exponent == ((1 << We) - 1) and Wm == 2) // NOLINT
+            return (mantissa == 0) ? (sign ? f_neg_inf : f_inf) : f_nan;
+        else if(Wm == 3 and (x == 0x7F or x == 0xFF))
+            return f_nan;
+    }
+    typename migraphx::conditional_t<sizeof(T) == 2, uint16_t, uint32_t> retval;
+    // difference between the output bias and the f8 bias, plus one (undone again below)
+    const int exp_low_cutoff =
+        (1 << (weo - 1)) - (1 << (We - 1)) + 1 - (NegativeZeroNan ? 1 : 0); // NOLINT
+    // subnormal input
+    if(exponent == 0)
+    {
+        // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above
+        // sh is the left shift that moves the leading set bit just above the Wm-bit field
+        int sh = 1 + __builtin_clz(mantissa) - (32 - Wm);
+        mantissa <<= sh;             // NOLINT
+        exponent += 1 - sh;
+        mantissa &= ((1 << Wm) - 1); // NOLINT
+    }
+    exponent += exp_low_cutoff - 1;
+    mantissa <<= wmo - Wm; // NOLINT
+    // subnormal output (occurs when T=half, We=5, negative_zero_nan=true)
+    if(exponent <= 0)
+    {
+        mantissa |= 1 << wmo;      // NOLINT
+        mantissa >>= 1 - exponent; // NOLINT
+        exponent = 0;
+    }
+    if(sizeof(T) == 2)
+        retval = (sign << 15) | (exponent << 10) | mantissa; // NOLINT
+    else
+        retval = (sign << 31) | (exponent << 23) | mantissa; // NOLINT
+    return migraphx::bit_cast<T>(retval);
+}
+} // namespace impl
+} // namespace fp8
+} // namespace migraphx
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif
+#endif // MIGRAPHX_GUARD_KERNELS_FP8_IMPL_HPP
--- a/src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp
@@ -52,22 +52,25 @@ __device__ void generic_binary_layernorm(
    block::template run<reduce_output>([&](auto, auto r) {
        auto input       = r.inner([&](auto x1, auto x2) { return op(x1, x2); })(input1, input2);
        using value_type = typename Input1::type;
+        using vec_value_type       = vec_type<value_type>;
        constexpr auto relements   = r.template elements<Input1>();
-        constexpr auto relements_r = vec_type<value_type>{1.0 / relements};
+        constexpr auto relements_r = vec_value_type{1.0 / relements};
        auto relements_rsqrt       = sqrt(relements_r);
-        auto means = r.reduce(op::sum{}, make_array<vec_type<value_type>>(0, 0), [&](auto x) {
+        auto means = r.reduce(op::sum{},
-            auto x_out = x * relements_r;
+                              make_array<vec_value_type>(vec_value_type{0}, vec_value_type{0}),
-            // dividing x by sqrt(relements) before squaring allows computing higher values
+                              [&](auto x) {
-            // before overflow in low precision
+                                  auto x_out = x * relements_r;
-            auto x2_sqrt = x * relements_rsqrt;
+                                  // dividing x by sqrt(relements) before squaring allows computing
-            return make_array(x_out, x2_sqrt * x2_sqrt);
+                                  // higher values before overflow in low precision
-        })(input);
+                                  auto x2_sqrt = x * relements_rsqrt;
+                                  return make_array(x_out, x2_sqrt * x2_sqrt);
+                              })(input);
        auto mean_x        = means[0];
        auto mean_x2       = means[1];
        auto variance      = mean_x2 - (mean_x * mean_x);
-        value_type eps_val = eps; // implicit conversion for eps
+        value_type eps_val = implicit_conversion(eps);
        r.inner([&](auto& y, auto x, auto... xs) {
            auto m = x - mean_x;

--- a/src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
@@ -29,11 +29,15 @@
 #include <migraphx/kernels/functional.hpp>
 #include <migraphx/kernels/type_traits.hpp>
 #include <migraphx/kernels/hip.hpp>
+#include <migraphx/kernels/float8.hpp>
 namespace migraphx {
 namespace math {
 constexpr float as_float(migraphx::half x) { return x; }
+constexpr float as_float(migraphx::fp8::fp8e4m3fnuz x) { return x; }
 template <class T>
 constexpr T as_float(T x)
 {
@@ -57,14 +61,14 @@ constexpr T as_float(T x)
 // NOLINTNEXTLINE
 #define MIGRAPHX_DEVICE_MATH_FOR(type, name, fname)                    \
    template <class... Ts, MIGRAPHX_REQUIRES(not is_any_vec<Ts...>())> \
-    auto __device__ name(type x, Ts... xs)->type                       \
+    auto __device__ name(type x, Ts... xs) -> type                     \
    {                                                                  \
        return fname(x, xs...);                                        \
    }
 // NOLINTNEXTLINE
 #define MIGRAPHX_DEVICE_MATH_BINARY_FOR(type, name, fname) \
-    inline auto __device__ name(type x, type y)->type { return fname(x, y); }
+    inline auto __device__ name(type x, type y) -> type { return fname(x, y); }
 // NOLINTNEXTLINE
 #define MIGRAPHX_DEVICE_MATH_HALF(name, fname)                         \
@@ -72,6 +76,12 @@ constexpr T as_float(T x)
    auto __device__ name(migraphx::half x, Ts... xs)                   \
        MIGRAPHX_RETURNS(fname(math::as_float(x), math::as_float(xs)...))
+// Generates a device overload of `name` whose first argument is fp8e4m3fnuz and whose remaining
+// arguments are not vector types: every argument is passed through math::as_float, forwarded to
+// `fname`, and the result is converted back to fp8e4m3fnuz.
+// NOLINTNEXTLINE
+#define MIGRAPHX_DEVICE_MATH_FP8(name, fname)                                      \
+    template <class... Ts, MIGRAPHX_REQUIRES(not is_any_vec<Ts...>())>             \
+    auto __device__ name(migraphx::fp8::fp8e4m3fnuz x, Ts... xs) MIGRAPHX_RETURNS( \
+        migraphx::fp8::fp8e4m3fnuz(fname(math::as_float(x), math::as_float(xs)...)))
 // Template with two overloads for math functions, one for half2 type and one for more generic
 // <half, N> vectorization where N is 4 or another even number.
@@ -162,6 +172,33 @@ MIGRAPHX_DEVICE_MATH_HALF(tan, ::tan)
 MIGRAPHX_DEVICE_MATH_HALF(tanh, ::tanh)
 MIGRAPHX_DEVICE_MATH_HALF(fmod, ::fmod)
+// use float to compute fp8 overload
+MIGRAPHX_DEVICE_MATH_FP8(abs, ::abs)
+MIGRAPHX_DEVICE_MATH_FP8(acos, ::acos)
+MIGRAPHX_DEVICE_MATH_FP8(acosh, ::acosh)
+MIGRAPHX_DEVICE_MATH_FP8(asin, ::asin)
+MIGRAPHX_DEVICE_MATH_FP8(asinh, ::asinh)
+MIGRAPHX_DEVICE_MATH_FP8(atan, ::atan)
+MIGRAPHX_DEVICE_MATH_FP8(atanh, ::atanh)
+MIGRAPHX_DEVICE_MATH_FP8(ceil, ::ceil)
+MIGRAPHX_DEVICE_MATH_FP8(cos, ::cos)
+MIGRAPHX_DEVICE_MATH_FP8(cosh, ::cosh)
+MIGRAPHX_DEVICE_MATH_FP8(erf, ::erf)
+MIGRAPHX_DEVICE_MATH_FP8(exp, ::exp)
+MIGRAPHX_DEVICE_MATH_FP8(floor, ::floor)
+MIGRAPHX_DEVICE_MATH_FP8(isnan, ::isnan)
+MIGRAPHX_DEVICE_MATH_FP8(log, ::log)
+MIGRAPHX_DEVICE_MATH_FP8(pow, ::pow)
+MIGRAPHX_DEVICE_MATH_FP8(remainder, ::remainder)
+MIGRAPHX_DEVICE_MATH_FP8(round, ::round)
+MIGRAPHX_DEVICE_MATH_FP8(rsqrt, ::rsqrt)
+MIGRAPHX_DEVICE_MATH_FP8(sin, ::sin)
+MIGRAPHX_DEVICE_MATH_FP8(sinh, ::sinh)
+MIGRAPHX_DEVICE_MATH_FP8(sqrt, ::sqrt)
+MIGRAPHX_DEVICE_MATH_FP8(tan, ::tan)
+MIGRAPHX_DEVICE_MATH_FP8(tanh, ::tanh)
+MIGRAPHX_DEVICE_MATH_FP8(fmod, ::fmod)
 // Map math functions to hip half2 functions
 // The half2 type is defined in include/hip/amd_detail/hip_fp16_gcc.h and is 2 16-bit floats
 // packed into a 32-bit number.  See include/hip/amd_detail/hip_fp16_math_fwd.h for the HIP names
@@ -253,7 +290,7 @@ MIGRAPHX_DEVICE_MATH_VEC(where)
 template <class T, class U>
 constexpr auto convert(U v)
 {
-    return vec_transform(v)([](auto x) -> T { return x; });
+    return vec_transform(v)([](auto x) -> T { return static_cast<T>(x); });
 }
 } // namespace migraphx

--- a/src/targets/gpu/kernels/include/migraphx/kernels/pad.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/pad.hpp
@@ -28,6 +28,7 @@
 #include <migraphx/kernels/index.hpp>
 #include <migraphx/kernels/algorithm.hpp>
 #include <migraphx/kernels/ranges.hpp>
+#include <migraphx/kernels/vec.hpp>
 namespace migraphx {
@@ -53,9 +54,9 @@ __device__ void pad(const index& idx,
        if(any_of(range_multi.begin(), range_multi.end(), [&](auto j) {
               return multi[j] < offsets[j] or input_idx[j] >= input_bounds[j];
           }))
-            output[multi] = pad_val;
+            output[multi] = implicit_conversion(pad_val);
        else
-            output[multi] = input[input_idx];
+            output[multi] = implicit_conversion(input[input_idx]);
    });
 }

--- a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
@@ -106,7 +106,7 @@ __device__ auto block_reduce(index idx, Op op, T init, Index n, F f)
 #endif
    using type = decltype(index::invoke_loop(f, 0, _c<0>));
    __shared__ type buffer[idx.max_nlocal() / lanes_per_thread];
-    type x = init;
+    type x = type(init);
    idx.local_stride(n, [&](auto i, auto d) { x = op(x, index::invoke_loop(f, i, d)); });
    dpp_reduce(x, op);
@@ -117,7 +117,7 @@ __device__ auto block_reduce(index idx, Op op, T init, Index n, F f)
    }
    __syncthreads();
-    type y = init;
+    type y = type(init);
    for(index_int i = 0; i < idx.nlocal() / lanes_per_thread; i++)
    {
        y = op(y, buffer[i]);
@@ -244,9 +244,8 @@ struct reducer_base
        {
            auto&& derived = static_cast<const Derived&>(*this);
            auto t         = derived.slice(x);
-            return make_storage_access<typename decltype(t)::type>([=](auto i, auto...) -> auto& {
+            return make_storage_access<typename decltype(t)::type>(
-                return t[i];
+                [=](auto i, auto...) -> auto& { return t[i]; });
-            });
        }
    }
@@ -393,7 +392,7 @@ struct block
        {
            using max_iterations = decltype(idx.max_local_stride_iterations(n));
            inner_storage<R, max_iterations{}, N> storage;
-            idx.local_stride(n, [&](auto j, auto d) { storage(j, d) = f(xs(j, d)...); });
+            idx.local_stride(n, [&](auto j, auto d) { storage(j, d) = R{f(xs(j, d)...)}; });
            return storage;
        }
    };
@@ -482,7 +481,7 @@ struct lane
        __device__ auto reduce_impl(Op op, T init, Read read, N n, U&& x, Us&&... xs) const
        {
            using type = remove_reference_t<decltype(x(0, _c<0>))>;
-            type r     = init;
+            type r     = type(init);
            for(index_int j = 0; j < n; j++)
            {
                r = op(r, read(x(j, _c<0>), xs(j, _c<0>)...));

--- a/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
@@ -62,7 +62,7 @@ struct avg_pool
    template <class T>
    MIGRAPHX_DEVICE_CONSTEXPR T final(T x, index_int y)
    {
-        return (y == 0) ? 0.0 : (x / y);
+        return (y == 0) ? T{0.0} : T{x / y};
    }
 };
@@ -76,7 +76,7 @@ MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
    {
        if(xy[ii] < -1.0f or xy[ii] > dims[ii])
        {
-            return 0;
+            return implicit_conversion(0);
        }
        xy[ii]   = migraphx::max(xy[ii], 0.0f);
@@ -92,15 +92,16 @@ MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
                                high[0] * dims[1] + low[1],
                                high[0] * dims[1] + high[1]};
-    float ly                                   = xy[0] - low[0];
+    float ly = xy[0] - low[0];
-    float lx                                   = xy[1] - low[1];
+    float lx = xy[1] - low[1];
-    float hy                                   = 1.0f - ly;
+    float hy = 1.0f - ly;
-    float hx                                   = 1.0f - lx;
+    float hx = 1.0f - lx;
-    array<typename Iterator::value_type, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx};
+    // do calculations in floating point and convert final result to required type
+    array<float, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx};
    auto v01 = pooling(data[locs[0]] * ws[0], data[locs[1]] * ws[1]);
    auto v23 = pooling(data[locs[2]] * ws[2], data[locs[3]] * ws[3]);
-    return pooling(v01, v23);
+    return implicit_conversion(pooling(v01, v23));
 }
 template <class Iterator, class Op>
@@ -113,8 +114,9 @@ MIGRAPHX_DEVICE_CONSTEXPR auto calc_pooling(const Iterator& data,
                                            float roi_offset,
                                            Op op)
 {
-    typename Iterator::value_type output_val = op.init();
+    using in_dtype      = typename Iterator::value_type;
-    const int64_t count                      = bin_grid_size[0] * bin_grid_size[1];
+    in_dtype output_val = in_dtype{op.init()};
+    const int64_t count = bin_grid_size[0] * bin_grid_size[1];
    dfor(bin_grid_size[0], bin_grid_size[1])([&](auto iy, auto ix) {
        array<index_int, 2> id = {iy, ix};
        array<float, 2> locs =
@@ -148,7 +150,6 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
    const auto x    = x_t.begin();
    const auto rois = rois_t.begin();
    const auto ind  = ind_t.begin();
    // input shape
    auto x_lens      = x_t.get_shape().lens;
    auto channel_num = x_lens[1];
@@ -176,10 +177,12 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
        const auto offset_rois = rois + (n * roi_column_num);
        const int batch_ind    = ind[n];
-        array<float, 2> roi_starts = {offset_rois[1] * s.spatial_scale,
+        array<float, 2> roi_starts = {
-                                      offset_rois[0] * s.spatial_scale};
+            static_cast<float>(offset_rois[1]) * static_cast<float>(s.spatial_scale),
-        array<float, 2> roi_ends   = {offset_rois[3] * s.spatial_scale,
+            static_cast<float>(offset_rois[0]) * static_cast<float>(s.spatial_scale)};
-                                    offset_rois[2] * s.spatial_scale};
+        array<float, 2> roi_ends = {
+            static_cast<float>(offset_rois[3]) * static_cast<float>(s.spatial_scale),
+            static_cast<float>(offset_rois[2]) * static_cast<float>(s.spatial_scale)};
        array<float, 2> roi_size{};
        array<float, 2> bin_size{};

--- a/src/targets/gpu/kernels/include/migraphx/kernels/softmax.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/softmax.hpp
@@ -43,7 +43,7 @@ __device__ void softmax(Input input1, Output output)
        auto exp_in = r.inner([&](auto x) { return migraphx::exp(x - c); })(input);
        auto batch_sum =
            r.reduce(op::sum{}, 0, [](auto x) { return migraphx::convert<float>(x); })(exp_in);
-        r.inner([&](auto& y, auto x) { y = x / batch_sum; })(output, exp_in);
+        r.inner([&](auto& y, auto x) { y = implicit_conversion(x / batch_sum); })(output, exp_in);
    });
 }

--- a/src/targets/gpu/kernels/include/migraphx/kernels/tensor_view.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/tensor_view.hpp
@@ -27,6 +27,7 @@
 #include <migraphx/kernels/shape.hpp>
 #include <migraphx/kernels/debug.hpp>
 #include <migraphx/kernels/iota_iterator.hpp>
+#include <migraphx/kernels/float8.hpp>
 namespace migraphx {

--- a/src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp
@@ -251,7 +251,7 @@ constexpr T numeric_max()
 }
 template <class T>
-constexpr T numeric_lowest()
+constexpr auto numeric_lowest() -> decltype(numeric_max<T>())
 {
    if constexpr(is_integral<T>{})
    {

--- a/src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp
@@ -207,7 +207,7 @@ struct implicit_conversion_op
    template <class U>
    constexpr operator U() const // conversion operator templated on the destination type U
    {
-        return x;
+        return static_cast<U>(x); // explicit cast to U; replaces the plain `return x;`, which relied on an implicit conversion
    }
 };

--- a/src/targets/gpu/target.cpp
+++ b/src/targets/gpu/target.cpp
@@ -98,6 +98,7 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
    ctx.set_exhaustive_tune_flag(options.exhaustive_tune);
    std::set<shape::type_t> unsupported_types(shape::types().begin(), shape::types().end());
    unsupported_types.erase(shape::type_t::float_type);
+    unsupported_types.erase(shape::type_t::fp8e4m3fnuz_type);
    unsupported_types.erase(shape::type_t::half_type);
    unsupported_types.erase(shape::type_t::bool_type);
    unsupported_types.erase(shape::type_t::int8_type);

--- a/test/gpu/jit.cpp
+++ b/test/gpu/jit.cpp
@@ -350,18 +350,19 @@ TEST_CASE(compile_math)
    auto vec_sizes = {2, 4, 6};
    for(auto&& t : migraphx::shape::types())
    {
-        if(contains({migraphx::shape::bool_type,
+        if(contains({migraphx::shape::bool_type, migraphx::shape::tuple_type}, t))
-                     migraphx::shape::fp8e4m3fnuz_type,
-                     migraphx::shape::tuple_type},
-                    t))
            continue;
        auto name = migraphx::shape::cpp_type(t);
        if(t == migraphx::shape::half_type)
            name.insert(0, "migraphx::");
        data_types.push_back(name);
-        migraphx::transform(vec_sizes, std::back_inserter(data_types), [&](auto i) {
+        // fp8 doesn't have vectorization support yet, therefore skip it for now.
-            return "migraphx::vec<" + name + ", " + std::to_string(i) + ">";
+        if(t != migraphx::shape::fp8e4m3fnuz_type)
-        });
+        {
+            migraphx::transform(vec_sizes, std::back_inserter(data_types), [&](auto i) {
+                return "migraphx::vec<" + name + ", " + std::to_string(i) + ">";
+            });
+        }
    }
    migraphx::shape input{migraphx::shape::float_type, {5, 2}};
    migraphx::gpu::hip_compile_options options;
@@ -431,7 +432,6 @@ TEST_CASE(assert_type_min_max)
                min = std::to_string(as.min());
                max = std::to_string(as.max());
            }
            auto src = migraphx::interpolate_string(assert_template,
                                                    {{"type", name}, {"max", max}, {"min", min}});
            migraphx::shape input{migraphx::shape::float_type, {5, 2}};

--- a/test/simplify_algebra_test.cpp
+++ b/test/simplify_algebra_test.cpp
@@ -1017,6 +1017,40 @@ TEST_CASE(simplify_concat_add_relu_broadcast_same_axis)
    EXPECT(m1 == m2);
 }
+TEST_CASE(concat_convert_fusion) // expects two per-input converts feeding a concat to become one convert after the concat
+{
+    auto s = migraphx::shape{migraphx::shape::float_type, {64}}; // float shape shared by both parameters
+    migraphx::module m1; // module that is handed to run_pass
+    {
+        auto x  = m1.add_parameter("x", s);
+        auto y  = m1.add_parameter("y", s);
+        auto xh = m1.add_instruction( // x converted to half before the concat
+            migraphx::make_op("convert",
+                              {{"target_type", migraphx::to_value(migraphx::shape::half_type)}}),
+            x);
+        auto yh = m1.add_instruction( // y converted to half before the concat
+            migraphx::make_op("convert",
+                              {{"target_type", migraphx::to_value(migraphx::shape::half_type)}}),
+            y);
+        auto concat = m1.add_instruction(migraphx::make_op("concat", {{"axis", 0}}), xh, yh);
+        m1.add_instruction(pass_op{}, concat);
+    }
+    run_pass(m1); // NOTE(review): the pass applied by run_pass is defined outside this hunk — confirm which one
+    migraphx::module m2; // hand-built expected result
+    {
+        auto x       = m2.add_parameter("x", s);
+        auto y       = m2.add_parameter("y", s);
+        auto concat  = m2.add_instruction(migraphx::make_op("concat", {{"axis", 0}}), x, y); // concat of the unconverted float inputs
+        auto concath = m2.add_instruction( // single convert to half applied to the concatenated value
+            migraphx::make_op("convert",
+                              {{"target_type", migraphx::to_value(migraphx::shape::half_type)}}),
+            concat);
+        m2.add_instruction(pass_op{}, concath);
+    }
+    EXPECT(m1 == m2); // the transformed module must equal the expected one
+}
 TEST_CASE(simplify_div_const)
 {
    migraphx::module m1;

--- a/test/verify/test_abs.cpp
+++ b/test/verify/test_abs.cpp
@@ -27,14 +27,19 @@
 #include <migraphx/generate.hpp>
 #include <migraphx/make_op.hpp>
-struct test_abs : verify_program<test_abs>
+template <migraphx::shape::type_t DType> // element type of the input is now a template parameter
+struct test_abs : verify_program<test_abs<DType>>
 {
    migraphx::program create_program() const // builds a program with one parameter "x" fed to a single abs instruction
    {
        migraphx::program p;
        auto* mm = p.get_main_module();
-        auto x = mm->add_parameter("x", migraphx::shape{migraphx::shape::float_type, {4, 3, 3, 3}});
+        auto x   = mm->add_parameter("x", migraphx::shape{DType, {4, 3, 3, 3}}); // same {4, 3, 3, 3} lens as before, element type taken from DType
        mm->add_instruction(migraphx::make_op("abs"), x);
        return p;
    }
 };
+template struct test_abs<migraphx::shape::fp8e4m3fnuz_type>; // explicit instantiations: fp8, half and float variants
+template struct test_abs<migraphx::shape::half_type>; // NOTE(review): presumably each instantiation registers a test through verify_program — confirm
+template struct test_abs<migraphx::shape::float_type>;