port gpu changes

a9dd42f7 · Umang Yadav · 9e6d866d · a9dd42f7 · a9dd42f7 · a9dd42f7
Commit a9dd42f7 authored Nov 17, 2023 by Umang Yadav
17 changed files
--- a/src/targets/gpu/compile_gen.cpp
+++ b/src/targets/gpu/compile_gen.cpp
@@ -54,6 +54,11 @@ vectorize vectorize::elements(std::size_t axis,
                              const std::vector<shape>& inputs,
                              const std::vector<std::size_t>& sizes)
 {
+    // disable vectorization for fp8 types (shapes are inspected by const reference, no copies)
+    if(std::any_of(inputs.begin(), inputs.end(), [](const auto& ishape) {
+           return ishape.type() == migraphx::shape::fp8e4m3fnuz_type;
+       }))
+        return {1, axis};
    if(std::all_of(
           inputs.begin(), inputs.end(), [&](const auto& s) { return s.lens()[axis] == 1; }))
        return {1, axis};
@@ -86,6 +91,11 @@ vectorize vectorize::elements(std::size_t axis,

 vectorize vectorize::elements(context& ctx, std::size_t axis, const std::vector<shape>& inputs)
 {
+    // disable vectorization for fp8 types (shapes are inspected by const reference, no copies)
+    if(std::any_of(inputs.begin(), inputs.end(), [](const auto& ishape) {
+           return ishape.type() == migraphx::shape::fp8e4m3fnuz_type;
+       }))
+        return {1, axis};
    if(inputs.empty())
        return {1, axis};
    std::size_t n = std::max_element(inputs.begin(),
@@ -305,7 +315,7 @@ std::string generate_reduce(const module& m, const std::string& name)
            std::transform(
                params.begin(), params.end(), params.begin(), [](auto s) { return "auto " + s; });
            return interpolate_string(inner_template,
-                                      {{"inner", inner_name},
+                                             {{"inner", inner_name},
                                       {"params", join_strings(params, ", ")},
                                       {"args", join_strings(args, ", ")},
                                       {"call", call_function}});

--- a/src/targets/gpu/compile_hip.cpp
+++ b/src/targets/gpu/compile_hip.cpp
@@ -199,7 +199,7 @@ std::vector<std::vector<char>> compile_hip_src_with_hiprtc(std::vector<hiprtc_sr
 {
    hiprtc_program prog(std::move(srcs));
    auto options = split_string(params, ' ');
-    options.push_back("-DMIGRAPHX_USE_HIPRTC=1");
+    options.push_back("-DMIGRAPHX_JIT_USE_HIPRTC=1");
    // remove following three compilation flags for HIPRTC once fixes from hipRTC are available in
    if(enabled(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS{}))
    {
@@ -251,21 +251,10 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
            std::cout << std::string(src.content) << std::endl;
        }
    }
-    auto fname = fs::path{"migraphx-hiprtc-driver"};
-#ifdef _WIN32
-    fname.replace_extension(".exe");
-#endif
    auto p      = dynamic_loader::path(&compile_hip_src_with_hiprtc);
-    auto driver = p.parent_path() / fname;
-
-    bool found = fs::exists(driver);
-    if(not found)
-    {
-        driver = p.parent_path().parent_path() / "bin" / fname;
-        found  = fs::exists(driver);
-    }
+    auto driver = p.parent_path().parent_path() / "bin" / "migraphx-hiprtc-driver";

-    if(found)
+    if(fs::exists(driver))
    {
        value v;
        v["srcs"]   = to_value(hsrcs);

--- a/src/targets/gpu/compile_hip_code_object.cpp
+++ b/src/targets/gpu/compile_hip_code_object.cpp
@@ -197,6 +197,7 @@ operation compile_hip_code_object(const std::string& content, hip_compile_option

    options.params += " -DMIGRAPHX_NGLOBAL=" + std::to_string(options.global);
    options.params += " -DMIGRAPHX_NLOCAL=" + std::to_string(options.local);
+    options.params += " -D__HIP_NO_F8_CONVERSIONS__=1";
    options.params += " " + join_strings(compiler_warnings(), " ");
    options.params += " -ftemplate-backtrace-limit=0";
    options.params += " -Werror";

--- a/src/targets/gpu/kernels/include/migraphx/kernels/bit_cast.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/bit_cast.hpp
+/* ************************************************************************
+ * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+ * ies of the Software, and to permit persons to whom the Software is furnished
+ * to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+ * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+ * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ************************************************************************ */
+#ifndef MIGRAPHX_GUARD_KERNELS_BITCAST_HPP
+#define MIGRAPHX_GUARD_KERNELS_BITCAST_HPP
+
+namespace migraphx {
+// Reinterprets the object representation of `fr` as a value of type `To`.
+// Both types must have the same size; this is enforced at compile time by the static_assert.
+// The conversion itself is delegated to the compiler builtin __builtin_bit_cast.
+template <typename To, typename From>
+inline constexpr To bit_cast(From fr) noexcept
+{
+    static_assert(sizeof(To) == sizeof(From));
+    return __builtin_bit_cast(To, fr);
+}
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_KERNELS_BITCAST_HPP
--- a/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp
+/* ************************************************************************
+ * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+ * ies of the Software, and to permit persons to whom the Software is furnished
+ * to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+ * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+ * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ************************************************************************ */
+
+#ifndef MIGRAPHX_GUARD_KERNELS_FLOAT8_HPP
+#define MIGRAPHX_GUARD_KERNELS_FLOAT8_HPP
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wold-style-cast"
+#pragma clang diagnostic ignored "-Wfloat-equal"
+#pragma clang diagnostic ignored "-Wmacro-redefined"
+#pragma clang diagnostic ignored "-Wc++20-extensions"
+#endif // __clang__
+
+#if(defined(__HIP_PLATFORM_HCC__) || defined(__HIP_PLATFORM_AMD__))
+// need to include hip_runtime.h otherwise it complains about __host__ and __device__
+#if defined(MIGRAPHX_JIT_USE_HIPRTC)
+#include <migraphx/kernels/hip.hpp>
+#else
+#include <hip/hip_runtime.h>
+#endif
+#define MIGRAPHX_HIP_HOST_DEVICE __host__ __device__
+#define MIGRAPHX_HIP_HOST __host__
+#else
+#define MIGRAPHX_HIP_HOST_DEVICE
+#define MIGRAPHX_HIP_HOST
+#endif // HIP_PLATFORM_AMD
+
+#define MIGRAPHX_HIP_DEVICE __device__
+
+#ifndef MIGRAPHX_FP8_FNUZ
+#define MIGRAPHX_FP8_FNUZ true
+#endif // MIGRAPHX_FP8_FNUZ
+
+// We are clipping in down conversion by default
+#define MIGRAPHX_F8_DOWNCAST_CLIPPING 1
+#if defined(MIGRAPHX_JIT_USE_HIPRTC)
+#include <migraphx/kernels/types.hpp>
+using uint8_t  = migraphx::uint8_t;
+using uint16_t = migraphx::uint16_t;
+using uint32_t = migraphx::uint32_t;
+#else
+#include <cmath>
+#include <cstdint>
+#include <climits>
+#include <cstring>
+#include <iosfwd>
+#include <limits>
+#include <sstream>
+#include <iostream>
+#include <string>
+#include <utility>
+#endif
+
+#include <migraphx/kernels/float8_impl.hpp>
+
+namespace migraphx {
+namespace fp8 {
+
+// Rounding behaviour requested when a float is narrowed to an 8-bit float.
+enum class rounding_mode
+{
+    standard, // standard rounding is doing RNE -- round to nearest even
+    stochastic // stochastic rounding; the float8 constructor forwards its rng argument for this
+};
+
+// Selects the bit layout used by float8<T>. The trailing comments give the
+// sign/exponent/mantissa bit widths of each layout.
+enum class f8_type
+{
+    bf8 = 0, // s1e5m2
+    fp8 = 1  // s1e4m3
+};
+
+template <typename T>
+class numeric_limits;
+
+template <migraphx::fp8::f8_type T = migraphx::fp8::f8_type::fp8>
+struct float8
+{
+    uint8_t data;
+    // default constructor
+    MIGRAPHX_HIP_HOST_DEVICE constexpr float8() = default;
+    // default copy constructor
+    MIGRAPHX_HIP_HOST_DEVICE constexpr float8(const float8<T>& y) = default;
+    struct from_bits_t
+    {
+    };
+    static constexpr MIGRAPHX_HIP_HOST_DEVICE from_bits_t from_bits() { return from_bits_t(); }
+
+    MIGRAPHX_HIP_HOST_DEVICE explicit constexpr float8(uint8_t bits, from_bits_t) : data(bits) {}
+
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+    // device specific optimized F8 down-conversion code
+
+    // Narrows a 32-bit float to the 8-bit encoding selected by T using AMDGPU conversion
+    // builtins and returns the resulting byte. Only compiled for gfx940/gfx941/gfx942.
+    //   stochastic_rounding - when true the __builtin_amdgcn_cvt_sr_* builtins are used and
+    //                         `rng` is passed to them; otherwise the __builtin_amdgcn_cvt_pk_*
+    //                         builtins are used.
+    //   v                   - value to convert.
+    //   rng                 - only read on the stochastic-rounding path.
+    template <bool stochastic_rounding = false>
+    static MIGRAPHX_HIP_DEVICE uint8_t cast_to_f8_from_f32(float v, uint32_t rng = 0)
+    {
+        uint8_t i8data;
+        // The union gives access to the same 32 bits as a float, as a word and as bytes.
+        union
+        {
+            float fval;
+            uint32_t i32val;
+            uint8_t i8val[4]; // NOTE: not endian independent
+        } val;
+
+        uint32_t ival = 0;
+        val.fval      = v;
+
+#ifdef MIGRAPHX_F8_DOWNCAST_CLIPPING
+        // Inputs whose exponent bits are all ones (NaN/Inf) bypass the fmed3f call below.
+        // NOTE(review): fmed3f presumably returns the median of its three operands, which
+        // would make these calls a clamp to [-240, 240] / [-57344, 57344] -- confirm.
+        if constexpr(T == migraphx::fp8::f8_type::fp8)
+        {
+            if((val.i32val & 0x7F800000) != 0x7F800000) // propagate NAN/INF, no clipping
+                val.fval = __builtin_amdgcn_fmed3f(val.fval, 240.0, -240.0);
+        }
+        else
+        {
+            if((val.i32val & 0x7F800000) != 0x7F800000) // propagate NAN/INF, no clipping
+                val.fval = __builtin_amdgcn_fmed3f(val.fval, 57344.0, -57344.0);
+        }
+#endif
+        if(stochastic_rounding)
+        {
+            if constexpr(T == migraphx::fp8::f8_type::fp8)
+            {
+                ival = __builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, ival, 0); // 0 pos
+            }
+            else
+            {
+                ival = __builtin_amdgcn_cvt_sr_bf8_f32(val.fval, rng, ival, 0); // 0 pos
+            }
+        }
+        else // RNE CVT
+        {
+            if constexpr(T == migraphx::fp8::f8_type::fp8)
+            {
+                ival = __builtin_amdgcn_cvt_pk_fp8_f32(
+                    val.fval, val.fval, ival, false); // false -> WORD0
+            }
+            else
+            {
+                ival = __builtin_amdgcn_cvt_pk_bf8_f32(
+                    val.fval, val.fval, ival, false); // false -> WORD0
+            }
+        }
+        // The converted value is read back from the first byte of the result word.
+        val.i32val = ival;
+        i8data     = val.i8val[0]; // little endian
+
+        return i8data;
+    }
+#endif // __gfx940__
+
+    // Constructors from float.
+    // On gfx940/gfx941/gfx942 two overloads are declared: a device-only one that uses
+    // cast_to_f8_from_f32 and a host-only one that uses the software conversion. On every
+    // other target a single constexpr host/device constructor uses the software conversion.
+    //   v   - value to convert.
+    //   rm  - rounding mode; `stochastic` enables stochastic rounding.
+    //   rng - forwarded to the conversion routine.
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+
+    // NOTE: ON-DEVICE... always optimal bias
+    explicit MIGRAPHX_HIP_DEVICE
+    float8(float v,
+           migraphx::fp8::rounding_mode rm = migraphx::fp8::rounding_mode::standard,
+           uint32_t rng                    = 0)
+    {
+        // runtime branch, use cast_to_f8_from_f32 if want to avoid it
+        if(rm == migraphx::fp8::rounding_mode::stochastic)
+            data = cast_to_f8_from_f32<true>(v, rng);
+        else
+            data = cast_to_f8_from_f32<false>(v);
+    }
+
+    // Host only implementation using s/w simulation
+    explicit MIGRAPHX_HIP_HOST
+#else
+    // both Host and DEVICE for non-gfx940 using s/w simulation
+    explicit constexpr MIGRAPHX_HIP_HOST_DEVICE
+#endif
+    float8(float v,
+           migraphx::fp8::rounding_mode rm = migraphx::fp8::rounding_mode::standard,
+           uint32_t rng                    = 0)
+    {
+        // The first two template arguments of cast_to_f8 are the mantissa and exponent
+        // widths: <3, 4> for the fp8 layout and <2, 5> for the bf8 layout. The last template
+        // argument selects clipping and follows MIGRAPHX_F8_DOWNCAST_CLIPPING.
+        if constexpr(T == migraphx::fp8::f8_type::fp8)
+        {
+#ifdef MIGRAPHX_F8_DOWNCAST_CLIPPING
+            data = migraphx::fp8::impl::
+                cast_to_f8<3, 4, float, MIGRAPHX_FP8_FNUZ /*negative_zero_nan*/, true /*clip*/>(
+                    v, (rm == migraphx::fp8::rounding_mode::stochastic), rng);
+#else  // MIGRAPHX_F8_DOWNCAST_CLIPPING
+            data = migraphx::fp8::impl::
+                cast_to_f8<3, 4, float, MIGRAPHX_FP8_FNUZ /*negative_zero_nan*/, false /*clip*/>(
+                    v, (rm == migraphx::fp8::rounding_mode::stochastic), rng);
+#endif // MIGRAPHX_F8_DOWNCAST_CLIPPING
+        }
+        else
+        {
+#ifdef MIGRAPHX_F8_DOWNCAST_CLIPPING
+            data = migraphx::fp8::impl::
+                cast_to_f8<2, 5, float, MIGRAPHX_FP8_FNUZ /*negative_zero_nan*/, true /*clip*/>(
+                    v, (rm == migraphx::fp8::rounding_mode::stochastic), rng);
+#else  // MIGRAPHX_F8_DOWNCAST_CLIPPING
+            data = migraphx::fp8::impl::
+                cast_to_f8<2, 5, float, MIGRAPHX_FP8_FNUZ /*negative_zero_nan*/, false /*clip*/>(
+                    v, (rm == migraphx::fp8::rounding_mode::stochastic), rng);
+#endif // MIGRAPHX_F8_DOWNCAST_CLIPPING
+        }
+    }
+
+    /*
+        // Constructor from half
+        explicit constexpr MIGRAPHX_HIP_HOST_DEVICE
+        float8(migraphx::half v,
+               migraphx::fp8::rounding_mode rm =
+                   migraphx::fp8::rounding_mode::standard,
+               uint32_t rng = 0)
+            : float8((float)v, rm, rng)
+        {
+        }
+
+    // constructor from int
+    explicit constexpr MIGRAPHX_HIP_HOST_DEVICE
+    float8(int v,
+           migraphx::fp8::rounding_mode rm =
+               migraphx::fp8::rounding_mode::standard,
+           uint32_t rng = 0)
+        : float8((float)v, rm, rng)
+    {
+    }
+
+    // constructor from double
+    explicit constexpr MIGRAPHX_HIP_HOST_DEVICE
+    float8(double v,
+           migraphx::fp8::rounding_mode rm =
+               migraphx::fp8::rounding_mode::standard,
+           uint32_t rng = 0)
+        : float8((float)v, rm, rng)
+    {
+    }
+    */
+    /**/
+    // Conversion to float.
+    // The device-intrinsic version below is compiled out with `#if 0` because it cannot be
+    // constexpr; the active definition is the constexpr host/device software conversion in
+    // the #else branch.
+// #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+#if 0 // need constexpr operator(). This version can't be constexpr
+    // upcast using device specific intrinsic
+    inline MIGRAPHX_HIP_DEVICE operator float() const
+    {
+        float fval;
+        uint32_t i32val = static_cast<uint32_t>(data);
+
+        // upcast
+        if constexpr(T == migraphx::fp8::f8_type::fp8)
+        {
+            asm volatile("v_cvt_f32_fp8 %0, %1 src0_sel:BYTE_0" : "=v"(fval) : "v"(i32val));
+        }
+        else
+        {
+            asm volatile("v_cvt_f32_bf8 %0, %1 src0_sel:BYTE_0" : "=v"(fval) : "v"(i32val));
+        }
+
+        return fval;
+    }
+
+    inline constexpr MIGRAPHX_HIP_HOST operator float() const
+#else // non gfx940
+    inline constexpr MIGRAPHX_HIP_HOST_DEVICE operator float() const
+#endif
+    {
+        // cast_from_f8 takes the mantissa/exponent widths: <3, 4> for fp8, <2, 5> for bf8.
+        if constexpr(T == migraphx::fp8::f8_type::fp8)
+        {
+            return migraphx::fp8::impl::
+                cast_from_f8<3, 4, float, MIGRAPHX_FP8_FNUZ /*negative_zero_nan*/>(data);
+        } // else
+        return migraphx::fp8::impl::
+            cast_from_f8<2, 5, float, MIGRAPHX_FP8_FNUZ /*negative_zero_nan*/>(data);
+    }
+
+    /*
+        // convert to half
+        explicit inline MIGRAPHX_HIP_HOST_DEVICE operator migraphx::half() const
+        {
+            return migraphx::half(float(*this)); // convert to float, then convert to f16
+        }
+    */
+
+    // check for zero
+    // Returns true when the stored bits encode zero.
+    // With MIGRAPHX_FP8_FNUZ only 0x00 counts as zero; otherwise 0x00 and 0x80 (the same
+    // bits with the top bit set) are both treated as zero.
+    inline MIGRAPHX_HIP_HOST_DEVICE constexpr bool is_zero() const
+    {
+        if constexpr(MIGRAPHX_FP8_FNUZ)
+        {
+            return data == 0x00;
+        }
+        else
+        {
+            return (data == 0x00) || (data == 0x80);
+        }
+    }
+
+    // check for nan
+    // Returns true when the stored bits are one of the NaN encodings.
+    // With MIGRAPHX_FP8_FNUZ the only NaN encoding is 0x80. Otherwise the accepted bit
+    // patterns depend on the layout: 0x7d-0x7f / 0xfd-0xff for bf8 and 0x79-0x7f / 0xf9-0xff
+    // for fp8.
+    inline MIGRAPHX_HIP_HOST_DEVICE constexpr bool is_nan() const
+    {
+        if constexpr(MIGRAPHX_FP8_FNUZ)
+        {
+            return data == 0x80;
+        }
+        else
+        {
+            if(T == migraphx::fp8::f8_type::bf8)
+            {
+                return (data == 0x7d) || (data == 0x7e) || (data == 0x7f) || (data == 0xfd) ||
+                       (data == 0xfe) || (data == 0xff);
+            }
+            else
+            {
+                return (data == 0x79) || (data == 0x7a) || (data == 0x7b) || (data == 0x7c) ||
+                       (data == 0x7d) || (data == 0x7e) || (data == 0x7f) || (data == 0xf9) ||
+                       (data == 0xfa) || (data == 0xfb) || (data == 0xfc) || (data == 0xfd) ||
+                       (data == 0xfe) || (data == 0xff);
+            }
+        }
+    }
+
+    // check for inf
+    // Returns true when the stored bits are an infinity encoding.
+    // With MIGRAPHX_FP8_FNUZ this tests for 0x80, the same bit pattern is_nan() tests for, so
+    // the two predicates agree in that mode. Otherwise infinity is 0x7c/0xfc for bf8 and
+    // 0x78/0xf8 for fp8.
+    inline MIGRAPHX_HIP_HOST_DEVICE constexpr bool is_inf() const
+    {
+        if constexpr(MIGRAPHX_FP8_FNUZ)
+        {
+            return data == 0x80;
+        }
+        else
+        {
+            if(T == migraphx::fp8::f8_type::bf8)
+            {
+                return (data == 0x7c) || (data == 0xfc);
+            }
+            else
+            {
+                return (data == 0x78) || (data == 0xf8);
+            }
+        }
+    }
+
+// Generates a pair of compound-assignment member operators (despite the "UNARY" in the name):
+// one taking a float8 and one taking a float on the right-hand side.
+//   unary_op  - the compound-assignment operator token to define, e.g. *=
+//   binary_op - the arithmetic operator applied to the two float values, e.g. *
+// Both operands are converted to float, combined, and the result is converted back to float8
+// and stored in *this.
+#define MIGRAPHX_FP8_UNARY_OP(unary_op, binary_op)                                    \
+    constexpr float8& MIGRAPHX_HIP_HOST_DEVICE operator unary_op(const float8& rhs)   \
+    {                                                                                 \
+        const auto tmp = static_cast<float>(*this) binary_op static_cast<float>(rhs); \
+        *this          = static_cast<float8>(tmp);                                    \
+        return *this;                                                                 \
+    }                                                                                 \
+    constexpr float8& MIGRAPHX_HIP_HOST_DEVICE operator unary_op(const float& rhs)    \
+    {                                                                                 \
+        const auto tmp = static_cast<float>(*this) binary_op static_cast<float>(rhs); \
+        *this          = static_cast<float8>(tmp);                                    \
+        return *this;                                                                 \
+    }
+
+    MIGRAPHX_FP8_UNARY_OP(*=, *)
+    MIGRAPHX_FP8_UNARY_OP(-=, -)
+    MIGRAPHX_FP8_UNARY_OP(+=, +)
+    MIGRAPHX_FP8_UNARY_OP(/=, /)
+
+    inline MIGRAPHX_HIP_HOST_DEVICE constexpr float8& operator=(const float8& rhs) = default;
+    inline MIGRAPHX_HIP_HOST_DEVICE constexpr float8& operator=(float8&& rhs)      = default;
+
+#if !defined(__HIP_NO_F8_CONVERSIONS__)
+    // for the device kernels, this needs to be disabled since implicit_conversion op can type cast
+    // any type to any other type and that results in conflicts in candidate overload resolutions.
+    inline constexpr float8& MIGRAPHX_HIP_HOST_DEVICE operator=(float rhs)
+    {
+        *this = static_cast<float8>(rhs);
+        return *this;
+    }
+#endif
+
+    // Equality comparison.
+    // NaN and Inf operands never compare equal, so they are rejected first. Previously this
+    // check ran after the tolerance test, where it had no effect, and a NaN operand could be
+    // reported as equal. Otherwise two values are equal when both are zero or when the
+    // magnitude of their difference is below numeric_limits<float8<T>>::epsilon().
+    inline MIGRAPHX_HIP_HOST_DEVICE constexpr bool operator==(const float8& rhs) const
+    {
+        if(rhs.is_nan() || rhs.is_inf() || this->is_nan() || this->is_inf())
+            return false;
+
+        return (rhs.is_zero() && this->is_zero()) ||
+               (fabs(rhs - *this) < migraphx::fp8::numeric_limits<float8<T>>::epsilon());
+    }
+
+    // Less-than comparison, performed on the float values of both operands.
+    inline MIGRAPHX_HIP_HOST_DEVICE constexpr bool operator<(const float8& rhs) const
+    {
+        const auto we   = static_cast<float>(*this);
+        const auto them = static_cast<float>(rhs);
+        return we < them;
+    }
+
+    // Greater-than comparison, performed on the float values of both operands.
+    inline MIGRAPHX_HIP_HOST_DEVICE constexpr bool operator>(const float8& rhs) const
+    {
+        const auto we   = static_cast<float>(*this);
+        const auto them = static_cast<float>(rhs);
+        return we > them;
+    }
+};
+
+#ifndef MIGRAPHX_JIT_USE_HIPRTC
+// Special operator overloading
+// Streams a float8 by converting it to float first. Only available when
+// MIGRAPHX_JIT_USE_HIPRTC is not defined (see the enclosing #ifndef).
+template <migraphx::fp8::f8_type T>
+inline std::ostream& operator<<(std::ostream& os, const migraphx::fp8::float8<T>& rhs)
+{
+    return os << static_cast<float>(rhs);
+}
+#endif
+
+// Generates a free binary operator template for two float8<T> operands.
+//   binary_op - operator token to define.
+//   U         - result type; the float result of the operation is converted to U.
+// Both operands are converted to float before the operator is applied.
+// NOTE(review): float8 also declares member operator==, operator< and operator>; confirm
+// which overload is meant to be selected for those three.
+// NOLINTNEXTLINE
+#define MIGRAPHX_FP8_BINARY_OP(binary_op, U)                                      \
+    template <migraphx::fp8::f8_type T>                                           \
+    inline constexpr U MIGRAPHX_HIP_HOST_DEVICE operator binary_op(               \
+        const migraphx::fp8::float8<T>& lhs, const migraphx::fp8::float8<T>& rhs) \
+    {                                                                             \
+        return U(static_cast<float>(lhs) binary_op static_cast<float>(rhs));      \
+    }
+
+// TODO: these should return floats
+MIGRAPHX_FP8_BINARY_OP(*, migraphx::fp8::float8<T>)
+MIGRAPHX_FP8_BINARY_OP(-, migraphx::fp8::float8<T>)
+MIGRAPHX_FP8_BINARY_OP(/, migraphx::fp8::float8<T>)
+MIGRAPHX_FP8_BINARY_OP(+, migraphx::fp8::float8<T>)
+// TODO: Comparison ops shouldn't convert to float, maybe need to take care of rounding effects.
+MIGRAPHX_FP8_BINARY_OP(==, bool)
+MIGRAPHX_FP8_BINARY_OP(>=, bool)
+MIGRAPHX_FP8_BINARY_OP(<=, bool)
+MIGRAPHX_FP8_BINARY_OP(>, bool)
+MIGRAPHX_FP8_BINARY_OP(<, bool)
+MIGRAPHX_FP8_BINARY_OP(!=, bool)
+
+// Absolute value of a float8, computed by clearing the top (sign) bit of the encoding.
+// NaN inputs are returned unchanged: with MIGRAPHX_FP8_FNUZ the NaN encoding is 0x80, and
+// clearing its top bit would produce 0x00, which is_zero() reports as zero.
+// Declared constexpr so it can be used from the constexpr float8::operator==.
+template <migraphx::fp8::f8_type T>
+inline MIGRAPHX_HIP_HOST_DEVICE constexpr migraphx::fp8::float8<T>
+fabs(migraphx::fp8::float8<T> v)
+{
+    if(v.is_nan())
+        return v;
+    v.data = v.data & 0x7f;
+    return v;
+}
+
+// Builds a T directly from the bit pattern 0x7F (all bits set except the top one).
+// T must provide from_bits() and the matching (bits, from_bits_t) constructor.
+template <class T>
+MIGRAPHX_HIP_HOST_DEVICE constexpr T F8_Max()
+{
+    return T{0x7F, T::from_bits()};
+}
+
+// Builds a T directly from the bit pattern 0xFF (the F8_Max pattern with the top bit set).
+// T must provide from_bits() and the matching (bits, from_bits_t) constructor.
+template <class T>
+MIGRAPHX_HIP_HOST_DEVICE constexpr T F8_Lowest()
+{
+    return T{0xFF, T::from_bits()};
+}
+
+using fp8e4m3fnuz = float8<migraphx::fp8::f8_type::fp8>;
+
+// numeric_limits for the fp8 layout of float8. Most members return values built
+// directly from raw bit patterns via from_bits().
+template <>
+class numeric_limits<migraphx::fp8::float8<migraphx::fp8::f8_type::fp8>>
+{
+    public:
+    // TODO :figure out epsilon in Hex to make it constexpr
+    // Currently returns the value with raw bits 0x28.
+    static constexpr MIGRAPHX_HIP_HOST_DEVICE migraphx::fp8::float8<migraphx::fp8::f8_type::fp8>
+    epsilon()
+    {
+        return migraphx::fp8::float8<migraphx::fp8::f8_type::fp8>(
+            0x28, migraphx::fp8::float8<>::from_bits());
+    }
+
+    // NaN encoding: 0x80 with MIGRAPHX_FP8_FNUZ, 0x7F otherwise.
+    static constexpr MIGRAPHX_HIP_HOST_DEVICE migraphx::fp8::float8<migraphx::fp8::f8_type::fp8>
+    quiet_NaN()
+    {
+        return migraphx::fp8::float8<migraphx::fp8::f8_type::fp8>(
+            MIGRAPHX_FP8_FNUZ ? 0x80 : 0x7F, migraphx::fp8::float8<>::from_bits());
+    }
+
+    // Value with raw bits 0x7F (see F8_Max).
+    static constexpr MIGRAPHX_HIP_HOST_DEVICE migraphx::fp8::float8<migraphx::fp8::f8_type::fp8>
+    max()
+    {
+        return migraphx::fp8::F8_Max<migraphx::fp8::float8<migraphx::fp8::f8_type::fp8>>();
+    }
+
+    // TODO figure out Hex value
+    // NOTE(review): this computes -1 * max, i.e. a negative value, and is not constexpr.
+    // std::numeric_limits<T>::min() for floating-point types is the smallest positive
+    // normal value -- confirm which meaning callers expect.
+    static MIGRAPHX_HIP_HOST_DEVICE migraphx::fp8::float8<migraphx::fp8::f8_type::fp8> min()
+    {
+        return static_cast<migraphx::fp8::float8<migraphx::fp8::f8_type::fp8>>(-1.0f) *
+               migraphx::fp8::F8_Max<migraphx::fp8::float8<migraphx::fp8::f8_type::fp8>>();
+    }
+
+    // Value with raw bits 0xFF (see F8_Lowest).
+    static constexpr MIGRAPHX_HIP_HOST_DEVICE migraphx::fp8::float8<migraphx::fp8::f8_type::fp8>
+    lowest()
+    {
+        return migraphx::fp8::F8_Lowest<migraphx::fp8::float8<migraphx::fp8::f8_type::fp8>>();
+    }
+
+    // Infinity encoding: 0x80 with MIGRAPHX_FP8_FNUZ, 0x7F otherwise.
+    // NOTE(review): float8::is_inf() tests 0x78/0xf8 for fp8 when MIGRAPHX_FP8_FNUZ is
+    // false, which does not match the 0x7F returned here -- confirm.
+    static constexpr MIGRAPHX_HIP_HOST_DEVICE migraphx::fp8::float8<migraphx::fp8::f8_type::fp8>
+    infinity()
+    {
+        return migraphx::fp8::float8<migraphx::fp8::f8_type::fp8>(
+            MIGRAPHX_FP8_FNUZ ? 0x80 : 0x7F, migraphx::fp8::float8<>::from_bits());
+    }
+};
+
+// numeric_limits for the bf8 layout of float8. Most members return values built
+// directly from raw bit patterns via from_bits().
+template <>
+class numeric_limits<migraphx::fp8::float8<migraphx::fp8::f8_type::bf8>>
+{
+    public:
+    // Returns the value with raw bits 0x34.
+    static constexpr MIGRAPHX_HIP_HOST_DEVICE migraphx::fp8::float8<migraphx::fp8::f8_type::bf8>
+    epsilon()
+    {
+        return migraphx::fp8::float8<migraphx::fp8::f8_type::bf8>(
+            0x34, migraphx::fp8::float8<migraphx::fp8::f8_type::bf8>::from_bits());
+    }
+
+    // NaN encoding: 0x80 with MIGRAPHX_FP8_FNUZ, 0x7d otherwise.
+    static constexpr MIGRAPHX_HIP_HOST_DEVICE migraphx::fp8::float8<migraphx::fp8::f8_type::bf8>
+    quiet_NaN()
+    {
+        return migraphx::fp8::float8<migraphx::fp8::f8_type::bf8>(
+            MIGRAPHX_FP8_FNUZ ? 0x80 : 0x7d,
+            migraphx::fp8::float8<migraphx::fp8::f8_type::bf8>::from_bits());
+    }
+
+    // Value with raw bits 0x7F (see F8_Max).
+    static constexpr MIGRAPHX_HIP_HOST_DEVICE migraphx::fp8::float8<migraphx::fp8::f8_type::bf8>
+    max()
+    {
+        return static_cast<migraphx::fp8::float8<migraphx::fp8::f8_type::bf8>>(
+            migraphx::fp8::F8_Max<migraphx::fp8::float8<migraphx::fp8::f8_type::bf8>>());
+    }
+    // TODO figure  out constexpr value
+    // NOTE(review): this computes -1 * max, i.e. a negative value, and is not constexpr.
+    // std::numeric_limits<T>::min() for floating-point types is the smallest positive
+    // normal value -- confirm which meaning callers expect.
+    static MIGRAPHX_HIP_HOST_DEVICE migraphx::fp8::float8<migraphx::fp8::f8_type::bf8> min()
+    {
+        return static_cast<migraphx::fp8::float8<migraphx::fp8::f8_type::bf8>>(float(-1.0f)) *
+               migraphx::fp8::F8_Max<migraphx::fp8::float8<migraphx::fp8::f8_type::bf8>>();
+    }
+    // Value with raw bits 0xFF (see F8_Lowest).
+    static constexpr MIGRAPHX_HIP_HOST_DEVICE migraphx::fp8::float8<migraphx::fp8::f8_type::bf8>
+    lowest()
+    {
+        return migraphx::fp8::F8_Lowest<migraphx::fp8::float8<migraphx::fp8::f8_type::bf8>>();
+    }
+
+    // Infinity encoding: 0x80 with MIGRAPHX_FP8_FNUZ, 0x7c otherwise.
+    static constexpr MIGRAPHX_HIP_HOST_DEVICE migraphx::fp8::float8<migraphx::fp8::f8_type::bf8>
+    infinity()
+    {
+        return migraphx::fp8::float8<migraphx::fp8::f8_type::bf8>(
+            MIGRAPHX_FP8_FNUZ ? 0x80 : 0x7c,
+            migraphx::fp8::float8<migraphx::fp8::f8_type::bf8>::from_bits());
+    }
+};
+/*
+// Use h/w intrinsic and optimized version when __gfx940__
+template <typename T,
+          typename Ta,
+          bool stochastic_rounding,
+          typename std::enable_if<(!(migraphx::is_same<T, Ta>{}) &&
+                                   (migraphx::is_same<T, migraphx_f8>{} ||
+                                    migraphx::is_same<T, migraphx_bf8>{})),
+                                  int>::type = 0>
+inline __host__ __device__ T explicit_downcast(Ta a, uint32_t rng)
+{
+#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
+    // NOTE: we are directly calling cast_to_f8_from_f32 instead of constructor to optimize
+    // away one runtime branch
+    T val;
+    if(migraphx::is_same<T, migraphx_f8>::value)
+        val.data = migraphx_f8::cast_to_f8_from_f32<stochastic_rounding>(float(a), rng);
+    else
+        val.data = migraphx_bf8::cast_to_bf8_from_f32<stochastic_rounding>(float(a), rng);
+    return val;
+#else  // non gfx940
+    return T(float(a),
+             stochastic_rounding ? migraphx::fp8::rounding_mode::stochastic
+                                 : migraphx::fp8::rounding_mode::standard,
+             rng);
+#endif // __gfx940__
+}
+
+// NOTE NOTE: The above code is good if we don't consider HIP-GEMM code and only consider
+// the quantization However, if we need HIP-GEMM for fall-back, we would need explicit_cast
+// handles Tacc=f32 to To=f16/bf16 conversion
+template <typename T,
+          typename Ta,
+          bool stochastic_rounding,
+          typename std::enable_if<(!(migraphx::is_same<T, Ta>{}) &&
+                                   !(migraphx::is_same<T, migraphx_f8>{} ||
+                                     migraphx::is_same<T, migraphx_bf8>{})),
+                                  int>::type = 0>
+inline __host__ __device__ T explicit_downcast(Ta a, uint32_t rng)
+{
+    // the return type is not a F8 types, no SR for those types
+    // not sure if we have direct conversion, so converting to float first
+    // no effect if the input type is float
+    return T(float(a));
+}
+*/
+} // namespace fp8
+} // namespace migraphx
+// define numeric limits for the new data type
+#ifndef MIGRAPHX_JIT_USE_HIPRTC
+namespace std {
+// Reports whether x is finite, i.e. neither an infinity nor a NaN encoding.
+// The previous body returned x.is_inf(), which is the opposite of the intended result.
+inline bool isfinite(migraphx::fp8::float8<migraphx::fp8::f8_type::fp8> x) // NOLINT
+{
+    return !x.is_inf() && !x.is_nan();
+}
+
+// Reports whether x is finite, i.e. neither an infinity nor a NaN encoding.
+// The previous body returned x.is_inf(), which is the opposite of the intended result.
+inline bool isfinite(migraphx::fp8::float8<migraphx::fp8::f8_type::bf8> x) // NOLINT
+{
+    return !x.is_inf() && !x.is_nan();
+}
+
+// Reports whether x holds a NaN encoding; delegates to float8::is_nan().
+inline bool isnan(migraphx::fp8::float8<migraphx::fp8::f8_type::fp8> x) // NOLINT
+{
+    return x.is_nan();
+}
+
+// Reports whether x holds a NaN encoding; delegates to float8::is_nan().
+inline bool isnan(migraphx::fp8::float8<migraphx::fp8::f8_type::bf8> x) // NOLINT
+{
+    return x.is_nan();
+}
+
+// std::numeric_limits for both float8 layouts. Each specialization inherits every member
+// from the corresponding migraphx::fp8::numeric_limits specialization and adds nothing.
+template <>
+class numeric_limits<migraphx::fp8::float8<migraphx::fp8::f8_type::fp8>>
+    : public migraphx::fp8::numeric_limits<migraphx::fp8::float8<migraphx::fp8::f8_type::fp8>>
+{
+};
+
+template <>
+class numeric_limits<migraphx::fp8::float8<migraphx::fp8::f8_type::bf8>>
+    : public migraphx::fp8::numeric_limits<migraphx::fp8::float8<migraphx::fp8::f8_type::bf8>>
+{
+};
+
+// std::common_type rules for fp8e4m3fnuz.
+// Paired with any other type T, on either side, the result is std::common_type<float, T>.
+template <class T>
+struct common_type<migraphx::fp8::fp8e4m3fnuz, T> : std::common_type<float, T> // NOLINT
+{
+};
+
+template <class T>
+struct common_type<T, migraphx::fp8::fp8e4m3fnuz> : std::common_type<float, T> // NOLINT
+{
+};
+
+// Two fp8e4m3fnuz operands: the common type is float. This full specialization also covers
+// the case that both partial specializations above would otherwise match.
+template <>
+struct common_type<migraphx::fp8::fp8e4m3fnuz, migraphx::fp8::fp8e4m3fnuz>
+{
+    using type = float;
+};
+
+} // namespace std
+#endif
+// =================================================================================================
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif
+#endif // MIGRAPHX_GUARD_KERNELS_FLOAT8_HPP
--- a/src/targets/gpu/kernels/include/migraphx/kernels/float8_impl.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/float8_impl.hpp
+/* ************************************************************************
+ * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+ * ies of the Software, and to permit persons to whom the Software is furnished
+ * to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+ * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+ * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ************************************************************************ */
+
+#ifndef MIGRAPHX_GUARD_KERNELS_FP8_IMPL_HPP
+#define MIGRAPHX_GUARD_KERNELS_FP8_IMPL_HPP
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-identifier"
+#endif
+
+#define CONST_FOLD(x) (__builtin_constant_p(x) ? (x) : (x))
+namespace migraphx {
+namespace detail {
+// Compile-time type selection: conditional<B, T, F>::type is T when B is true (primary
+// template) and F when B is false (partial specialization below).
+template <bool B, class T, class F>
+struct conditional
+{
+    using type = T;
+};
+
+template <class T, class F>
+struct conditional<false, T, F>
+{
+    using type = F;
+};
+
+// Reinterprets the object representation of `fr` as a value of type `To`.
+// Both types must have the same size (checked by the static_assert).
+// GCC (when not clang) goes through a reinterpret_cast wrapped in CONST_FOLD; every other
+// compiler uses __builtin_bit_cast.
+template <typename To, typename From>
+inline constexpr To bit_cast(From fr) noexcept
+{
+    static_assert(sizeof(To) == sizeof(From));
+#if defined(__GNUC__) and !defined(__clang__)
+    To x = CONST_FOLD(*reinterpret_cast<To*>(&fr));
+#else
+    To x = __builtin_bit_cast(To, fr);
+#endif
+    return x;
+}
+} // namespace detail
+
+namespace fp8 {
+namespace impl {
+// #ifdef __HIP_PLATFORM_HCC__
+// __device__ inline int clz(uint32_t x) { return __clz(x); }
+// #else
+// __host__ inline int clz(uint32_t x) { return __builtin_clz(x); }
+// #endif
+
+// Convert a 32-bit or 16-bit floating point value into an 8-bit float bit pattern.
+//
+// Template parameters:
+//   wm, we            - mantissa and exponent widths of the f8 format, wm + we must be 7
+//   T                 - source type; its bits are decoded with the fp32 layout when
+//                       sizeof(T) == 4 and with the fp16 layout otherwise
+//   negative_zero_nan - when true the f8 exponent bias is one larger, inf/NaN inputs are
+//                       returned as 0x80 and negative zero is returned as 0
+//   clip              - when true, results above the largest exponent saturate to the
+//                       largest encodable magnitude; otherwise `signed_inf` is returned
+// Arguments:
+//   _x    - value to convert
+//   stoch - when true, `rng` supplies the bits added before truncation (stochastic
+//           rounding); otherwise ties are resolved towards the even mantissa
+//   rng   - random bits, only read when `stoch` is true
+// Returns the f8 encoding with the sign in bit 7.
+template <int wm, int we, typename T, bool negative_zero_nan, bool clip>
+MIGRAPHX_HIP_HOST_DEVICE constexpr uint8_t cast_to_f8(T _x, bool stoch, uint32_t rng)
+{
+
+    static_assert(wm + we == 7, "wm+we==7");
+
+    const int mfmt = (sizeof(T) == 4) ? 23 : 10;
+    typename migraphx::detail::conditional<sizeof(T) == 2, uint16_t, uint32_t>::type x;
+
+    if constexpr(sizeof(T) == 4)
+        x = migraphx::detail::bit_cast<uint32_t>(_x);
+    else
+        x = migraphx::detail::bit_cast<uint16_t>(_x);
+
+    uint32_t head, mantissa;
+    int exponent, bias;
+    uint32_t sign;
+
+    if constexpr(sizeof(T) == 4)
+    {
+        head     = x & 0xFF800000;
+        mantissa = x & 0x7FFFFF;
+        exponent = (head >> 23) & 0xFF;
+        sign     = head >> 31;
+        bias     = 127;
+    }
+    else
+    {
+        head     = x & 0xFC00;
+        mantissa = x & 0x3FF;
+        exponent = (head >> 10) & 0x1F;
+        sign     = head >> 15;
+        bias     = 15;
+    }
+
+    uint32_t signed_inf = (sign << 7) + (((1 << we) - 1) << wm);
+
+    // Deal with inf and NaNs
+    if(negative_zero_nan)
+    {
+        if(sizeof(T) == 4)
+        {
+            if((x & 0x7F800000) == 0x7F800000)
+                return 0x80;
+        }
+        else
+        {
+            // if(__hisinf(x) || __hisnan(x))
+            if((x & 0x7C00) == 0x7C00)
+                return 0x80;
+        }
+    }
+    else
+    {
+        if(sizeof(T) == 4)
+        {
+            if((x & 0x7F800000) == 0x7F800000)
+                return signed_inf + (mantissa != 0 ? 1 : 0);
+        }
+        else
+        {
+            if((x & 0x7C00) == 0x7C00)
+                return signed_inf + (mantissa != 0 ? 1 : 0);
+        }
+    }
+    // handle positive zero
+    if(x == 0)
+        return 0;
+    // handle negative zero
+    if((sizeof(T) == 4 and x == 0x80000000) or (sizeof(T) == 2 and x == 0x8000))
+    {
+        if(negative_zero_nan)
+        {
+            return 0;
+        }
+        else
+        {
+            return 0x80;
+        }
+    }
+
+    // First need to check if it is normal or denorm as there is a difference of implicit 1
+    // Then need to adjust the exponent to align with the F8 exponent, in the meanwhile, shift
+    // The mantissa. Then for stochastic rounding, add rng to mantissa and truncate. And for
+    // RNE, no need to add rng. Then probably need to check whether there is carry and adjust
+    // exponent and mantissa again
+
+    // For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent bits
+    const int f8_bias                  = (1 << (we - 1)) - 1 + (negative_zero_nan ? 1 : 0);
+    const int f8_denormal_act_exponent = 1 - f8_bias; // actual exponent of f8 denormal
+    // act_exponent is the actual exponent of fp32/fp16 (after subtracting bias)
+    // f8_exponent is the converted f8 exponent with bias encoding
+    // exponent_diff is the diff between fp32/fp16 exponent and f8 exponent,
+    // the difference needs to be adjusted and mantissa shifted
+    int act_exponent, f8_exponent, exponent_diff;
+
+    if(exponent == 0)
+    { // fp32/fp16 is in denormal.
+        /* fp32 denormal is below 2^-127 so it is usually not a concern here, we mostly concern fp16
+here. In this case, f8 is usually in denormal. But there could be exceptions. fp16 denormal has
+exponent bias 15 while bf8 with NANOO has exponent bias 16. It means that there are some numbers in
+fp16 denormal but they are bf8 (NANOO) normals - smallest bf8 (NANOO) normal is 2^-15. fp16 numbers
+where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8 (NANOO) normal. In
+this case, the fp16 mantissa should be shift left by 1  */
+        act_exponent  = exponent - bias + 1;
+        exponent_diff = f8_denormal_act_exponent -
+                        act_exponent; // actual exponent is exponent-bias+1 as it is denormal
+    }
+    else
+    { // fp32/fp16 is normal with implicit 1
+        act_exponent = exponent - bias;
+        if(act_exponent <= f8_denormal_act_exponent)
+        {
+            /* This is the case where fp32/fp16 is normal but it is in f8 denormal range.
+   For example fp8 nanoo mode, denormal exponent is -7, but if the fp32/fp16
+   actual exponent is -7, it is actually larger due to the implicit 1,
+   Therefore it needs to be adjust to -6 and mantissa shift right by 1.
+   So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */
+            exponent_diff = f8_denormal_act_exponent - act_exponent;
+        }
+        else
+        {          // both fp32/fp16 and f8 are in normal range
+            exponent_diff =
+                0; // exponent_diff=0 does not mean there is no difference for this case,
+            // act_exponent could be larger. Just that it does not need shift mantissa
+        }
+        mantissa += (1 << mfmt); // Add the implicit 1 into mantissa
+    }
+
+    // exponent_diff grows without bound for very small inputs (it exceeds 100 for small fp32
+    // values), and shifting a 32-bit value by 32 or more is undefined behaviour. Clamp every
+    // shift count to 31: the mantissa occupies at most mfmt + 1 bits, so a clamped count can
+    // never report a midpoint and a clamped right shift still clears the whole mantissa.
+    const int max_shift  = 31;
+    const int tail_bits  = mfmt - wm + exponent_diff;
+    const int tail_shift = tail_bits < max_shift ? tail_bits : max_shift;
+    const int half_shift = (tail_bits - 1) < max_shift ? (tail_bits - 1) : max_shift;
+    bool midpoint        = (mantissa & ((1u << tail_shift) - 1)) == (1u << half_shift);
+    /* This part is a bit tricky. The judgment of whether it is a tie needs to be done before we
+ shift right as shift right could rip off some residual part and make something not midpoint look
+ like midpoint. For example, the fp16 number 0x1002 (0 00100 0000000010), it is larger than
+ midpoint, but after shift right by 4 bits, it would look like midpoint.
+*/
+
+    if(exponent_diff > 0)
+        mantissa >>= (exponent_diff < max_shift ? exponent_diff : max_shift);
+    else if(exponent_diff == -1)
+        mantissa <<= -exponent_diff;
+    bool implicit_one = mantissa & (1 << mfmt);
+    // if there is no implicit 1, it means the f8 is denormal and need to adjust to denorm exponent
+    f8_exponent =
+        (act_exponent + exponent_diff) /*actual f8 exponent*/ + f8_bias - (implicit_one ? 0 : 1);
+
+    // Now we have the exponent and mantissa adjusted
+    uint32_t drop_mask = (1 << (mfmt - wm)) - 1;
+    bool odd =
+        mantissa & (1 << (mfmt - wm)); // if the least significant bit that is not truncated is 1
+    mantissa += (stoch ? rng : (midpoint ? (odd ? mantissa : mantissa - 1) : mantissa)) & drop_mask;
+
+    // Now we deal with overflow
+    if(f8_exponent == 0)
+    {
+        if((1 << mfmt) & mantissa)
+        {
+            f8_exponent = 1; // denormal overflow to become normal, promote exponent
+        }
+    }
+    else
+    {
+        if((1 << (mfmt + 1)) & mantissa)
+        {
+            mantissa >>= 1;
+            f8_exponent++;
+        }
+    }
+
+    mantissa >>= (mfmt - wm);
+
+    // above range: quantize to maximum possible float of the same sign
+    const int max_exp = (1 << we) - (negative_zero_nan ? 1 : 2);
+    if(f8_exponent > max_exp)
+    {
+        if(clip)
+        {
+            mantissa    = (1 << wm) - 1;
+            f8_exponent = max_exp;
+        }
+        else
+        {
+            return signed_inf;
+        }
+    }
+
+    // a result that rounded down to zero keeps its sign only when negative zero is encodable
+    if(f8_exponent == 0 && mantissa == 0)
+        return negative_zero_nan ? 0 : (sign << 7);
+    mantissa &= (1 << wm) - 1;
+    return (sign << 7) | (f8_exponent << wm) | mantissa;
+}
+
+// Decode an 8-bit float bit pattern `x` (wm mantissa bits, we exponent bits) into T.
+// With negative_zero_nan the pattern 0x80 decodes to NaN and the exponent bias is one larger;
+// otherwise 0x80 is negative zero and an all-ones exponent decodes to inf (zero mantissa)
+// or NaN (non-zero mantissa).
+template <int wm, int we, typename T, bool negative_zero_nan>
+MIGRAPHX_HIP_HOST_DEVICE constexpr T cast_from_f8(uint8_t x)
+{
+    // exponent and mantissa widths of the fp32 layout the result is assembled in
+    constexpr int weo = 8;
+    constexpr int wmo = 23;
+
+    const uint32_t inf_bits     = 0x7F800000;
+    const uint32_t neg_inf_bits = 0xFF800000;
+    const uint32_t nan_bits     = 0x7F800001;
+    const uint32_t neg0_bits    = 0x80000000;
+    // TODO: need to change T for half but right now it would never  called with half
+    const T pos_inf  = migraphx::detail::bit_cast<float>(inf_bits);
+    const T neg_inf  = migraphx::detail::bit_cast<float>(neg_inf_bits);
+    const T nan_val  = migraphx::detail::bit_cast<float>(nan_bits);
+    const T neg_zero = migraphx::detail::bit_cast<float>(neg0_bits);
+
+    if(x == 0)
+        return 0;
+
+    const uint32_t sign = x >> 7;
+    uint32_t mantissa   = x & ((1 << wm) - 1);
+    int exponent        = (x & 0x7F) >> wm;
+
+    // 0x80 is NaN in negative_zero_nan mode and negative zero otherwise
+    if(x == 0x80)
+        return negative_zero_nan ? nan_val : neg_zero;
+    // an all-ones exponent is only special when negative_zero_nan is off
+    if(not negative_zero_nan and exponent == ((1 << we) - 1))
+        return (mantissa == 0) ? (sign ? neg_inf : pos_inf) : nan_val;
+
+    using bits_type =
+        typename migraphx::detail::conditional<sizeof(T) == 2, uint16_t, uint32_t>::type;
+
+    const int exp_low_cutoff = (1 << (weo - 1)) - (1 << (we - 1)) + 1 - (negative_zero_nan ? 1 : 0);
+
+    // subnormal input
+    if(exponent == 0)
+    {
+        // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above
+        const int sh = 1 + __builtin_clz(mantissa) - (32 - wm);
+        mantissa <<= sh;
+        exponent += 1 - sh;
+        mantissa &= ((1 << wm) - 1);
+    }
+    // rebias the exponent and left-align the mantissa in the wider field
+    exponent += exp_low_cutoff - 1;
+    mantissa <<= wmo - wm;
+
+    // subnormal output (occurs when T=half, we=5, negative_zero_nan=true)
+    if(exponent <= 0)
+    {
+        mantissa |= 1 << wmo;
+        mantissa >>= 1 - exponent;
+        exponent = 0;
+    }
+
+    const bits_type retval = (sizeof(T) == 2) ? ((sign << 15) | (exponent << 10) | mantissa)
+                                              : ((sign << 31) | (exponent << 23) | mantissa);
+    return migraphx::detail::bit_cast<T>(retval);
+}
+} // namespace impl
+} // namespace fp8
+} // namespace migraphx
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif
+#endif // MIGRAPHX_GUARD_KERNELS_FP8_IMPL_HPP
--- a/src/targets/gpu/kernels/include/migraphx/kernels/hip.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/hip.hpp
@@ -24,7 +24,7 @@
 #ifndef MIGRAPHX_GUARD_KERNELS_HIP_HPP
 #define MIGRAPHX_GUARD_KERNELS_HIP_HPP

-#ifndef MIGRAPHX_USE_HIPRTC
+#ifndef MIGRAPHX_JIT_USE_HIPRTC
 #include <hip/hip_runtime.h>
 #include <hip/hip_fp16.h>
 #include <hip/math_functions.h>

--- a/src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
@@ -34,6 +34,9 @@ namespace migraphx {

 namespace math {
 constexpr float as_float(migraphx::half x) { return x; }
+
+constexpr float as_float(migraphx::fp8::fp8e4m3fnuz x) { return x; }
+
 template <class T>
 constexpr T as_float(T x)
 {
@@ -57,14 +60,14 @@ constexpr T as_float(T x)
 // NOLINTNEXTLINE
 #define MIGRAPHX_DEVICE_MATH_FOR(type, name, fname)                    \
    template <class... Ts, MIGRAPHX_REQUIRES(not is_any_vec<Ts...>())> \
-    auto __device__ name(type x, Ts... xs)->type                       \
+    auto __device__ name(type x, Ts... xs) -> type                     \
    {                                                                  \
        return fname(x, xs...);                                        \
    }

 // NOLINTNEXTLINE
 #define MIGRAPHX_DEVICE_MATH_BINARY_FOR(type, name, fname) \
-    inline auto __device__ name(type x, type y)->type { return fname(x, y); }
+    inline auto __device__ name(type x, type y) -> type { return fname(x, y); }

 // NOLINTNEXTLINE
 #define MIGRAPHX_DEVICE_MATH_HALF(name, fname)                         \
@@ -72,6 +75,20 @@ constexpr T as_float(T x)
    auto __device__ name(migraphx::half x, Ts... xs)                   \
        MIGRAPHX_RETURNS(fname(math::as_float(x), math::as_float(xs)...))

+// NOLINTNEXTLINE
+#define MIGRAPHX_DEVICE_MATH_FP8(name, fname)                                      \
+    template <class... Ts, MIGRAPHX_REQUIRES(not is_any_vec<Ts...>())>             \
+    auto __device__ name(migraphx::fp8::fp8e4m3fnuz x, Ts... xs) MIGRAPHX_RETURNS( \
+        migraphx::fp8::fp8e4m3fnuz(fname(math::as_float(x), math::as_float(xs)...)))
+
+// NOLINTNEXTLINE
+#define MIGRAPHX_DEVICE_MATH_BINARY_FOR_FP8(name, fname)                                    \
+    inline auto __device__ name(migraphx::fp8::fp8e4m3fnuz x, migraphx::fp8::fp8e4m3fnuz y) \
+        -> migraphx::fp8::fp8e4m3fnuz                                                       \
+    {                                                                                       \
+        return migraphx::fp8::fp8e4m3fnuz(fname(math::as_float(x), math::as_float(y)));     \
+    }
+
 // Template with two overloads for math functions, one for half2 type and one for more generic
 // <half, N> vectorization where N is 4 or another even number.

@@ -162,6 +179,33 @@ MIGRAPHX_DEVICE_MATH_HALF(tan, ::tan)
 MIGRAPHX_DEVICE_MATH_HALF(tanh, ::tanh)
 MIGRAPHX_DEVICE_MATH_HALF(fmod, ::fmod)

+// use float to compute fp8 overload
+MIGRAPHX_DEVICE_MATH_FP8(abs, ::abs)
+MIGRAPHX_DEVICE_MATH_FP8(acos, ::acos)
+MIGRAPHX_DEVICE_MATH_FP8(acosh, ::acosh)
+MIGRAPHX_DEVICE_MATH_FP8(asin, ::asin)
+MIGRAPHX_DEVICE_MATH_FP8(asinh, ::asinh)
+MIGRAPHX_DEVICE_MATH_FP8(atan, ::atan)
+MIGRAPHX_DEVICE_MATH_FP8(atanh, ::atanh)
+MIGRAPHX_DEVICE_MATH_FP8(ceil, ::ceil)
+MIGRAPHX_DEVICE_MATH_FP8(cos, ::cos)
+MIGRAPHX_DEVICE_MATH_FP8(cosh, ::cosh)
+MIGRAPHX_DEVICE_MATH_FP8(erf, ::erf)
+MIGRAPHX_DEVICE_MATH_FP8(exp, ::exp)
+MIGRAPHX_DEVICE_MATH_FP8(floor, ::floor)
+MIGRAPHX_DEVICE_MATH_FP8(isnan, ::isnan)
+MIGRAPHX_DEVICE_MATH_FP8(log, ::log)
+MIGRAPHX_DEVICE_MATH_FP8(pow, ::pow)
+MIGRAPHX_DEVICE_MATH_FP8(remainder, ::remainder)
+MIGRAPHX_DEVICE_MATH_FP8(round, ::round)
+MIGRAPHX_DEVICE_MATH_FP8(rsqrt, ::rsqrt)
+MIGRAPHX_DEVICE_MATH_FP8(sin, ::sin)
+MIGRAPHX_DEVICE_MATH_FP8(sinh, ::sinh)
+MIGRAPHX_DEVICE_MATH_FP8(sqrt, ::sqrt)
+MIGRAPHX_DEVICE_MATH_FP8(tan, ::tan)
+MIGRAPHX_DEVICE_MATH_FP8(tanh, ::tanh)
+MIGRAPHX_DEVICE_MATH_FP8(fmod, ::fmod)
+
 // Map math functions to hip half2 functions
 // The half2 type is defined in include/hip/amd_detail/hip_fp16_gcc.h and is 2 16-bit floats
 // packed into a 32-bit number.  See include/hip/amd_detail/hip_fp16_math_fwd.h for the HIP names
@@ -195,6 +239,9 @@ MIGRAPHX_DEVICE_MATH_BINARY_FOR(double, min, ::min)
 MIGRAPHX_DEVICE_MATH_BINARY_FOR(migraphx::half, max, ::__hmax)
 MIGRAPHX_DEVICE_MATH_BINARY_FOR(migraphx::half, min, ::__hmin)

+MIGRAPHX_DEVICE_MATH_BINARY_FOR_FP8(max, ::max)
+MIGRAPHX_DEVICE_MATH_BINARY_FOR_FP8(min, ::min)
+
 template <class T, MIGRAPHX_REQUIRES(not is_any_vec<T>())>
 constexpr auto max(const T& a, const T& b)
 {

--- a/src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp
@@ -26,6 +26,7 @@

 #include <migraphx/kernels/types.hpp>
 #include <migraphx/kernels/integral_constant.hpp>
+#include <migraphx/kernels/float8.hpp>

 namespace migraphx {

@@ -230,7 +231,8 @@ constexpr unsigned long int_max(unsigned long n)

 template <class T,
          MIGRAPHX_REQUIRES(is_integral<T>{} or is_floating_point<T>{} or
-                            is_same<T, migraphx::half>{})>
+                            is_same<T, migraphx::half>{} or
+                            is_same<T, migraphx::fp8::fp8e4m3fnuz>{})>
 constexpr T numeric_max()
 {
    if constexpr(is_integral<T>{})
@@ -246,6 +248,8 @@ constexpr T numeric_max()
        return __FLT_MAX__;
    else if constexpr(is_same<T, migraphx::half>{})
        return __FLT16_MAX__;
+    else if constexpr(is_same<T, migraphx::fp8::fp8e4m3fnuz>{})
+        return migraphx::fp8::F8_Max<T>();
    else
        return 0;
 }
@@ -260,6 +264,8 @@ constexpr T numeric_lowest()
        else
            return -numeric_max<T>() - 1;
    }
+    else if constexpr(is_same<T, migraphx::fp8::fp8e4m3fnuz>{})
+        return migraphx::fp8::F8_Lowest<T>();
    else
    {
        return -numeric_max<T>();

--- a/src/targets/gpu/kernels/include/migraphx/kernels/types.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/types.hpp
@@ -23,12 +23,11 @@
 */
 #ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPES_HPP
 #define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPES_HPP
-
 #include <migraphx/kernels/hip.hpp>

 namespace migraphx {

-#if defined(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS) and defined(MIGRAPHX_USE_HIPRTC)
+#if defined(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS) and defined(MIGRAPHX_JIT_USE_HIPRTC)
 using int8_t   = signed char;
 using uint8_t  = unsigned char;
 using int16_t  = signed short;
@@ -37,7 +36,7 @@ using int32_t  = signed int;
 using uint32_t = unsigned int;
 using int64_t  = signed long long;
 using uint64_t = unsigned long long;
-#elif defined(MIGRAPHX_USE_HIPRTC)
+#elif defined(MIGRAPHX_JIT_USE_HIPRTC)
 using int8_t   = __hip_int8_t;
 using uint8_t  = __hip_uint8_t;
 using int16_t  = __hip_int16_t;
@@ -55,7 +54,7 @@ using int32_t  = std::int32_t;
 using uint32_t = std::uint32_t;
 using int64_t  = std::int64_t;
 using uint64_t = std::uint64_t;
-#endif // MIGRAPHX_USE_HIPRTC
+#endif // MIGRAPHX_JIT_USE_HIPRTC
 using index_int = uint32_t;
 using diff_int  = int32_t;


--- a/src/targets/gpu/kernels/include/migraphx/kernels/vectorize.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/vectorize.hpp
@@ -24,6 +24,7 @@
 #ifndef MIGRAPHX_GUARD_KERNELS_VECTORIZE_HPP
 #define MIGRAPHX_GUARD_KERNELS_VECTORIZE_HPP

+#include <migraphx/kernels/type_traits.hpp>
 #include <migraphx/kernels/tensor_view.hpp>
 #include <migraphx/kernels/vec.hpp>


--- a/src/targets/gpu/target.cpp
+++ b/src/targets/gpu/target.cpp
@@ -98,6 +98,7 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
    ctx.set_exhaustive_tune_flag(options.exhaustive_tune);
    std::set<shape::type_t> unsupported_types(shape::types().begin(), shape::types().end());
    unsupported_types.erase(shape::type_t::float_type);
+    unsupported_types.erase(shape::type_t::fp8e4m3fnuz_type);
    unsupported_types.erase(shape::type_t::half_type);
    unsupported_types.erase(shape::type_t::bool_type);
    unsupported_types.erase(shape::type_t::int8_type);

--- a/test/gpu/jit.cpp
+++ b/test/gpu/jit.cpp
@@ -144,7 +144,7 @@ extern "C" {
 __global__ void kernel(${type}* p) 
 {
    auto x = *p;
-    *p = migraphx::implicit_conversion(migraphx::${invoke});
+    *p = implicit_conversion(migraphx::${invoke});

 }
 }
@@ -348,18 +348,18 @@ TEST_CASE(compile_math)
    auto vec_sizes = {2, 4, 6};
    for(auto&& t : migraphx::shape::types())
    {
-        if(contains({migraphx::shape::bool_type,
-                     migraphx::shape::fp8e4m3fnuz_type,
-                     migraphx::shape::tuple_type},
-                    t))
+        if(contains({migraphx::shape::bool_type, migraphx::shape::tuple_type}, t))
            continue;
        auto name = migraphx::shape::cpp_type(t);
        if(t == migraphx::shape::half_type)
            name.insert(0, "migraphx::");
        data_types.push_back(name);
-        migraphx::transform(vec_sizes, std::back_inserter(data_types), [&](auto i) {
-            return "migraphx::vec<" + name + ", " + std::to_string(i) + ">";
-        });
+        if(t != migraphx::shape::fp8e4m3fnuz_type)
+        {
+            migraphx::transform(vec_sizes, std::back_inserter(data_types), [&](auto i) {
+                return "migraphx::vec<" + name + ", " + std::to_string(i) + ">";
+            });
+        }
    }
    migraphx::shape input{migraphx::shape::float_type, {5, 2}};
    migraphx::gpu::hip_compile_options options;
@@ -399,10 +399,7 @@ TEST_CASE(assert_type_min_max)
    migraphx::gpu::hip_compile_options options;
    for(auto&& t : migraphx::shape::types())
    {
-        if(contains({migraphx::shape::bool_type,
-                     migraphx::shape::fp8e4m3fnuz_type,
-                     migraphx::shape::tuple_type},
-                    t))
+        if(contains({migraphx::shape::bool_type, migraphx::shape::tuple_type}, t))
            continue;
        auto name = migraphx::shape::cpp_type(t);
        if(t == migraphx::shape::half_type)
@@ -429,7 +426,6 @@ TEST_CASE(assert_type_min_max)
                min = std::to_string(as.min());
                max = std::to_string(as.max());
            }
-
            auto src = migraphx::interpolate_string(assert_template,
                                                    {{"type", name}, {"max", max}, {"min", min}});
            migraphx::shape input{migraphx::shape::float_type, {5, 2}};

--- a/test/verify/test_abs.cpp
+++ b/test/verify/test_abs.cpp
@@ -27,14 +27,19 @@
 #include <migraphx/generate.hpp>
 #include <migraphx/make_op.hpp>

-struct test_abs : verify_program<test_abs>
+template <migraphx::shape::type_t DType>
+struct test_abs : verify_program<test_abs<DType>>
 {
    migraphx::program create_program() const
    {
        migraphx::program p;
        auto* mm = p.get_main_module();
-        auto x = mm->add_parameter("x", migraphx::shape{migraphx::shape::float_type, {4, 3, 3, 3}});
+        auto x   = mm->add_parameter("x", migraphx::shape{DType, {4, 3, 3, 3}});
        mm->add_instruction(migraphx::make_op("abs"), x);
        return p;
    }
 };
+
+template struct test_abs<migraphx::shape::fp8e4m3fnuz_type>;
+template struct test_abs<migraphx::shape::half_type>;
+template struct test_abs<migraphx::shape::float_type>;
--- a/test/verify/test_acos.cpp
+++ b/test/verify/test_acos.cpp
@@ -27,15 +27,20 @@
 #include <migraphx/generate.hpp>
 #include <migraphx/make_op.hpp>

-struct test_acos : verify_program<test_acos>
+template <migraphx::shape::type_t DType>
+struct test_acos : verify_program<test_acos<DType>>
 {
    migraphx::program create_program() const
    {
        migraphx::program p;
        auto* mm = p.get_main_module();
-        migraphx::shape s{migraphx::shape::float_type, {16}};
+        migraphx::shape s{DType, {16}};
        auto x = mm->add_parameter("x", s);
        mm->add_instruction(migraphx::make_op("acos"), x);
        return p;
    }
 };
+
+template struct test_acos<migraphx::shape::fp8e4m3fnuz_type>;
+template struct test_acos<migraphx::shape::half_type>;
+template struct test_acos<migraphx::shape::float_type>;
--- a/test/verify/test_add.cpp
+++ b/test/verify/test_add.cpp
@@ -27,16 +27,21 @@
 #include <migraphx/generate.hpp>
 #include <migraphx/make_op.hpp>

-struct test_add : verify_program<test_add>
+template <migraphx::shape::type_t DType>
+struct test_add : verify_program<test_add<DType>>
 {
    migraphx::program create_program() const
    {
        migraphx::program p;
        auto* mm = p.get_main_module();
-        migraphx::shape s{migraphx::shape::float_type, {3}};
+        migraphx::shape s{DType, {8}};
        auto x = mm->add_parameter("x", s);
        auto y = mm->add_parameter("y", s);
        mm->add_instruction(migraphx::make_op("add"), x, y);
        return p;
    }
 };
+
+template struct test_add<migraphx::shape::fp8e4m3fnuz_type>;
+template struct test_add<migraphx::shape::half_type>;
+template struct test_add<migraphx::shape::float_type>;
--- a/test/verify/test_literal_limits.cpp
+++ b/test/verify/test_literal_limits.cpp
@@ -35,7 +35,11 @@ struct test_literal_limits : verify_program<test_literal_limits<Q, T>>
        migraphx::program p;
        auto* mm          = p.get_main_module();
        auto input_s      = migraphx::shape(Q, {3, 1});
-        auto infinity_val = std::numeric_limits<T>::infinity();
+        auto infinity_val = std::numeric_limits<T>::max();
+        if constexpr(std::numeric_limits<T>::has_infinity)
+        {
+            infinity_val = std::numeric_limits<T>::infinity();
+        }
        std::vector<T> s_data{
            infinity_val, static_cast<T>(-infinity_val), std::numeric_limits<T>::quiet_NaN()};

@@ -52,3 +56,4 @@ template struct test_literal_limits<migraphx::shape::double_type, double>;
 template struct test_literal_limits<migraphx::shape::half_type, migraphx::half>;
 template struct test_literal_limits<migraphx::shape::int32_type, int32_t>;
 template struct test_literal_limits<migraphx::shape::int8_type, int8_t>;
+template struct test_literal_limits<migraphx::shape::fp8e4m3fnuz_type, migraphx::fp8::fp8e4m3fnuz>;