changes for the FP8 ref implementation

df7f8a35 · Umang Yadav · 35e5298e · df7f8a35 · df7f8a35 · df7f8a35
Commit df7f8a35 authored Nov 09, 2023 by Umang Yadav
13 changed files
--- a/src/api/include/migraphx/migraphx.h
+++ b/src/api/include/migraphx/migraphx.h
@@ -44,7 +44,8 @@
    m(int32_type, int32_t) \
    m(int64_type, int64_t) \
    m(uint32_type, uint32_t) \
-    m(uint64_type, uint64_t)
+    m(uint64_type, uint64_t) \
+    m(fp8e4m3fnuz_type, migraphx_fp8::fp8e4m3fnuz)
 // clang-format on

 #ifdef __cplusplus

--- a/src/include/migraphx/half.hpp
+++ b/src/include/migraphx/half.hpp
@@ -27,6 +27,7 @@

 #include <half/half.hpp>
 #include <migraphx/config.hpp>
+#include <migraphx/migraphx_float8.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -67,6 +68,18 @@ struct common_type<T, migraphx::half> : std::common_type<float, T> // NOLINT
 {
 };

+template <>
+struct common_type<migraphx_fp8::fp8e4m3fnuz, migraphx::half>
+{
+    using type = float;
+};
+
+template <>
+struct common_type<migraphx::half, migraphx_fp8::fp8e4m3fnuz>
+{
+    using type = float;
+};
+
 template <>
 struct common_type<migraphx::half, migraphx::half>
 {

--- a/src/include/migraphx/migraphx_f8_impl.hpp
+++ b/src/include/migraphx/migraphx_f8_impl.hpp
+/* ************************************************************************
+ * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+ * ies of the Software, and to permit persons to whom the Software is furnished
+ * to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+ * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+ * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ************************************************************************ */
+
+#ifndef MIGRAPHX_FP8_IMPL_HPP
+#define MIGRAPHX_FP8_IMPL_HPP
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-identifier"
+#endif
+
+#define CONST_FOLD(x) (__builtin_constant_p(x) ? (x) : (x))
+namespace migraphx_f8_impl {
+namespace detail {
+template <bool B, class T, class F>
+struct conditional
+{
+    using type = T;
+};
+
+template <class T, class F>
+struct conditional<false, T, F>
+{
+    using type = F;
+};
+
+template <typename To, typename From>
+inline constexpr To bit_cast(From fr) noexcept
+{
+    static_assert(sizeof(To) == sizeof(From));
+#if defined(__GNUC__) and !defined(__clang__)
+    To x = CONST_FOLD(*reinterpret_cast<To*>(&fr));
+#else
+    To x = __builtin_bit_cast(To, fr);
+#endif
+    return x;
+}
+} // namespace detail
+
+template <int wm, int we, typename T, bool negative_zero_nan, bool clip>
+constexpr uint8_t cast_to_f8(T _x, bool stoch, uint32_t rng)
+{
+
+    static_assert(wm + we == 7, "wm+we==7");
+
+    const int mfmt = (sizeof(T) == 4) ? 23 : 10;
+    typename detail::conditional<sizeof(T) == 2, uint16_t, uint32_t>::type x;
+
+    if constexpr(sizeof(T) == 4)
+        x = detail::bit_cast<uint32_t>(_x);
+    else
+        x = detail::bit_cast<uint16_t>(_x);
+
+    uint32_t head, mantissa;
+    int exponent, bias;
+    uint32_t sign;
+
+    if constexpr(sizeof(T) == 4)
+    {
+        head     = x & 0xFF800000;
+        mantissa = x & 0x7FFFFF;
+        exponent = (head >> 23) & 0xFF;
+        sign     = head >> 31;
+        bias     = 127;
+    }
+    else
+    {
+        head     = x & 0xFC00;
+        mantissa = x & 0x3FF;
+        exponent = (head >> 10) & 0x1F;
+        sign     = head >> 15;
+        bias     = 15;
+    }
+
+    uint32_t signed_inf = (sign << 7) + (((1 << we) - 1) << wm);
+
+    // Deal with inf and NaNs
+    if(negative_zero_nan)
+    {
+        if(sizeof(T) == 4)
+        {
+            if((x & 0x7F800000) == 0x7F800000)
+                return 0x80;
+        }
+        else
+        {
+            // if(__hisinf(x) || __hisnan(x))
+            if((x & 0x7C00) == 0x7C00)
+                return 0x80;
+        }
+    }
+    else
+    {
+        if(sizeof(T) == 4)
+        {
+            if((x & 0x7F800000) == 0x7F800000)
+                return signed_inf + (mantissa != 0 ? 1 : 0);
+        }
+        else
+        {
+            if((x & 0x7C00) == 0x7C00)
+                return signed_inf + (mantissa != 0 ? 1 : 0);
+        }
+    }
+    // handle positive zero
+    if(x == 0)
+        return 0;
+    // handle negative zero
+    if((sizeof(T) == 4 and x == 0x80000000) or (sizeof(T) == 2 and x == 0x8000))
+    {
+        if(negative_zero_nan)
+        {
+            return 0;
+        }
+        else
+        {
+            return 0x80;
+        }
+    }
+
+    // First need to check if it is normal or denorm as there is a difference of implict 1
+    // Then need to adjust the exponent to align with the F8 exponent, in the meanwhile, shift
+    // The mantissa. Then for stochastic rounding, add rng to mantissa and truncate. And for
+    // RNE, no need to add rng. Then probably need to check whether there is carry and adjust
+    // exponent and mantissa again
+
+    // For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent bits
+    const int f8_bias                  = (1 << (we - 1)) - 1 + (negative_zero_nan ? 1 : 0);
+    const int f8_denormal_act_exponent = 1 - f8_bias; // actual exponent of f8 denormal
+    // act_exponent is the actual exponent of fp32/fp16 (after subtracting bias)
+    // f8_exponent is the converted f8 exponent with bias encoding
+    // exponent_diff is the diff between fp32/fp16 exponent and f8 exponent,
+    // the difference needs to be adjusted and mantissa shifted
+    int act_exponent, f8_exponent, exponent_diff;
+
+    if(exponent == 0)
+    { // fp32/fp16 is in denormal.
+        /* fp32 denormal is below 2^-127 so it is usually not a concern here, we mostly concern fp16
+here. In this case, f8 is usually in denormal. But there could be exceptions. fp16 denormal has
+exponent bias 15 while bf8 with NANOO has exponent bias 16. It means that there are some numbers in
+fp16 denormal but they are bf8 (NANOO) normals - smallest bf8 (NANOO) normal is 2^-15. fp16 numbers
+where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8 (NANOO) normal. In
+this case, the fp16 mantissa should be shift left by 1  */
+        act_exponent  = exponent - bias + 1;
+        exponent_diff = f8_denormal_act_exponent -
+                        act_exponent; // actual exponent is exponent-bias+1 as it is denormal
+    }
+    else
+    { // fp32/fp16 is normal with implicit 1
+        act_exponent = exponent - bias;
+        if(act_exponent <= f8_denormal_act_exponent)
+        {
+            /* This is the case where fp32/fp16 is normal but it is in f8 denormal range.
+   For example fp8 nanoo mode, denormal exponent is -7, but if the fp32/fp16
+   actual exponent is -7, it is actually larger due to the implict 1,
+   Therefore it needs to be adjust to -6 and mantissa shift right by 1.
+   So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */
+            exponent_diff = f8_denormal_act_exponent - act_exponent;
+        }
+        else
+        {          // both fp32/fp16 and f8 are in normal range
+            exponent_diff =
+                0; // exponent_diff=0 does not mean there is no difference for this case,
+            // act_exponent could be larger. Just that it does not need shift mantissa
+        }
+        mantissa += (1 << mfmt); // Add the implicit 1 into mantissa
+    }
+
+    bool midpoint = (mantissa & ((1 << (mfmt - wm + exponent_diff)) - 1)) ==
+                    (1 << (mfmt - wm + exponent_diff - 1));
+    /* This part is a bit tricky. The judgment of whether it is a tie needs to be done before we
+ shift right as shift right could rip off some residual part and make something not midpoint look
+ like midpoint. For example, the fp16 number 0x1002 (0 00100 0000000010), it is larger than
+ midpoint, but after shift right by 4 bits, it would look like midpoint.
+*/
+
+    if(exponent_diff > 0)
+        mantissa >>= exponent_diff;
+    else if(exponent_diff == -1)
+        mantissa <<= -exponent_diff;
+    bool implicit_one = mantissa & (1 << mfmt);
+    // if there is no implict 1, it  means the f8 is denormal and need to adjust to denorm exponent
+    f8_exponent =
+        (act_exponent + exponent_diff) /*actual f8 exponent*/ + f8_bias - (implicit_one ? 0 : 1);
+
+    // Now we have the exponent and mantissa adjusted
+    uint32_t drop_mask = (1 << (mfmt - wm)) - 1;
+    bool odd =
+        mantissa & (1 << (mfmt - wm)); // if the least significant bit that is not truncated is 1
+    mantissa += (stoch ? rng : (midpoint ? (odd ? mantissa : mantissa - 1) : mantissa)) & drop_mask;
+
+    // Now we deal with overflow
+    if(f8_exponent == 0)
+    {
+        if((1 << mfmt) & mantissa)
+        {
+            f8_exponent = 1; // denormal overflow to become normal, promote exponent
+        }
+    }
+    else
+    {
+        if((1 << (mfmt + 1)) & mantissa)
+        {
+            mantissa >>= 1;
+            f8_exponent++;
+        }
+    }
+
+    mantissa >>= (mfmt - wm);
+
+    // above range: quantize to maximum possible float of the same sign
+    const int max_exp = (1 << we) - (negative_zero_nan ? 1 : 2);
+    if(f8_exponent > max_exp)
+    {
+        if(clip)
+        {
+            mantissa    = (1 << wm) - 1;
+            f8_exponent = max_exp;
+        }
+        else
+        {
+            return signed_inf;
+        }
+    }
+
+    if(f8_exponent == 0 && mantissa == 0)
+        return negative_zero_nan ? 0 : (sign << 7);
+    mantissa &= (1 << wm) - 1;
+    return (sign << 7) | (f8_exponent << wm) | mantissa;
+}
+
+template <int wm, int we, typename T, bool negative_zero_nan>
+constexpr T cast_from_f8(uint8_t x)
+{
+    constexpr int weo = 8;
+    constexpr int wmo = 23;
+
+    T fInf, fNegInf, fNaN, fNeg0;
+    uint32_t ifInf    = 0x7F800000;
+    uint32_t ifNegInf = 0xFF800000;
+    uint32_t ifNaN    = 0x7F800001;
+    uint32_t ifNeg0   = 0x80000000;
+    // TODO: need to change T for half but right now it would never  called with half
+    fInf    = detail::bit_cast<float>(ifInf);
+    fNegInf = detail::bit_cast<float>(ifNegInf);
+    fNaN    = detail::bit_cast<float>(ifNaN);
+    fNeg0   = detail::bit_cast<float>(ifNeg0);
+
+    if(x == 0)
+        return 0;
+
+    uint32_t sign     = x >> 7;
+    uint32_t mantissa = x & ((1 << wm) - 1);
+    int exponent      = (x & 0x7F) >> wm;
+    if(negative_zero_nan)
+    {
+        if(x == 0x80)
+            return fNaN;
+    }
+    else
+    {
+        if(x == 0x80)
+            return fNeg0;
+        if(exponent == ((1 << we) - 1))
+            return (mantissa == 0) ? (sign ? fNegInf : fInf) : fNaN;
+    }
+    typename detail::conditional<sizeof(T) == 2, uint16_t, uint32_t>::type retval;
+
+    const int exp_low_cutoff = (1 << (weo - 1)) - (1 << (we - 1)) + 1 - (negative_zero_nan ? 1 : 0);
+
+    // subnormal input
+    if(exponent == 0)
+    {
+        // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above
+        int sh = 1 + __builtin_clz(mantissa) - (32 - wm);
+        mantissa <<= sh;
+        exponent += 1 - sh;
+        mantissa &= ((1 << wm) - 1);
+    }
+    exponent += exp_low_cutoff - 1;
+    mantissa <<= wmo - wm;
+
+    // subnormal output (occurs when T=half, we=5, negative_zero_nan=true)
+    if(exponent <= 0)
+    {
+        mantissa |= 1 << wmo;
+        mantissa >>= 1 - exponent;
+        exponent = 0;
+    }
+
+    if(sizeof(T) == 2)
+        retval = (sign << 15) | (exponent << 10) | mantissa;
+    else
+        retval = (sign << 31) | (exponent << 23) | mantissa;
+    return detail::bit_cast<T>(retval);
+}
+
+} // namespace migraphx_f8_impl
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif
+#endif // MIGRAPHX_FP8_IMPL_HPP
--- a/src/include/migraphx/migraphx_float8.hpp
+++ b/src/include/migraphx/migraphx_float8.hpp
+/* ************************************************************************
+ * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+ * ies of the Software, and to permit persons to whom the Software is furnished
+ * to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+ * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+ * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ************************************************************************ */
+
+#ifndef MIGRAPHX_GUARD_RTGLIB_FLOAT8_HPP
+#define MIGRAPHX_GUARD_RTGLIB_FLOAT8_HPP
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wold-style-cast"
+#pragma clang diagnostic ignored "-Wfloat-equal"
+#pragma clang diagnostic ignored "-Wmacro-redefined"
+#pragma clang diagnostic ignored "-Wc++20-extensions"
+#endif // __clang__
+
+#ifndef MIGRAPHX_FP8_FNUZ
+#define MIGRAPHX_FP8_FNUZ true
+#endif // MIGRAPHX_FP8_FNUZ
+
+// We are clipping in down conversion by default
+#define MIGRAPHX_F8_DOWNCAST_CLIPPING 1
+#include <cmath>
+#include <cstdint>
+#include <climits>
+#include <cstring>
+#include <iosfwd>
+#include <limits>
+#include <sstream>
+#include <iostream>
+#include <string>
+#include <utility>
+
+namespace migraphx_f8_impl {
+
+template <int wm, int we, typename T, bool negative_zero_nan, bool clip>
+constexpr uint8_t cast_to_f8(T _x, bool stoch = false, uint32_t rng = 0);
+
+template <int wm, int we, typename T, bool negative_zero_nan>
+constexpr T cast_from_f8(uint8_t x);
+
+} // namespace migraphx_f8_impl
+
+#include <migraphx/migraphx_f8_impl.hpp>
+
+namespace migraphx_fp8 {
+
+enum class migraphx_f8_rounding_mode
+{
+    standard, // standard rounding is doing RNE -- round to nearest even
+    stochastic
+};
+
+enum class f8_type
+{
+    bf8 = 0, // s1e5m2
+    fp8 = 1  // s1e4m3
+};
+
+template <typename T>
+class numeric_limits;
+
+template <migraphx_fp8::f8_type T = migraphx_fp8::f8_type::fp8>
+struct float8
+{
+    uint8_t data;
+    // default constructor
+    constexpr float8() = default;
+    // default copy constructor
+    constexpr float8(const float8<T>& y) = default;
+    struct from_bits_t
+    {
+    };
+    static constexpr from_bits_t from_bits() { return from_bits_t(); }
+
+    explicit constexpr float8(uint8_t bits, from_bits_t) : data(bits) {}
+
+    explicit constexpr float8(float v,
+                              migraphx_fp8::migraphx_f8_rounding_mode rm =
+                                  migraphx_fp8::migraphx_f8_rounding_mode::standard,
+                              uint32_t rng = 0)
+    {
+        if constexpr(T == migraphx_fp8::f8_type::fp8)
+        {
+#ifdef MIGRAPHX_F8_DOWNCAST_CLIPPING
+            data = migraphx_f8_impl::
+                cast_to_f8<3, 4, float, MIGRAPHX_FP8_FNUZ /*negative_zero_nan*/, true /*clip*/>(
+                    v, (rm == migraphx_fp8::migraphx_f8_rounding_mode::stochastic), rng);
+#else  // MIGRAPHX_F8_DOWNCAST_CLIPPING
+            data = migraphx_f8_impl::
+                cast_to_f8<3, 4, float, MIGRAPHX_FP8_FNUZ /*negative_zero_nan*/, false /*clip*/>(
+                    v, (rm == migraphx_fp8::migraphx_f8_rounding_mode::stochastic), rng);
+#endif // MIGRAPHX_F8_DOWNCAST_CLIPPING
+        }
+        else
+        {
+#ifdef MIGRAPHX_F8_DOWNCAST_CLIPPING
+            data = migraphx_f8_impl::
+                cast_to_f8<2, 5, float, MIGRAPHX_FP8_FNUZ /*negative_zero_nan*/, true /*clip*/>(
+                    v, (rm == migraphx_fp8::migraphx_f8_rounding_mode::stochastic), rng);
+#else  // MIGRAPHX_F8_DOWNCAST_CLIPPING
+            data = migraphx_f8_impl::
+                cast_to_f8<2, 5, float, MIGRAPHX_FP8_FNUZ /*negative_zero_nan*/, false /*clip*/>(
+                    v, (rm == migraphx_fp8::migraphx_f8_rounding_mode::stochastic), rng);
+#endif // rocblas_F8_downcast_clipping}
+        }
+    }
+
+    inline constexpr operator float() const
+    {
+        if constexpr(T == migraphx_fp8::f8_type::fp8)
+        {
+            return migraphx_f8_impl::
+                cast_from_f8<3, 4, float, MIGRAPHX_FP8_FNUZ /*negative_zero_nan*/>(data);
+        } // else
+        return migraphx_f8_impl::cast_from_f8<2, 5, float, MIGRAPHX_FP8_FNUZ /*negative_zero_nan*/>(
+            data);
+    }
+
+    inline constexpr bool is_zero() const
+    {
+        if constexpr(MIGRAPHX_FP8_FNUZ)
+        {
+            return data == 0x00;
+        }
+        else
+        {
+            return (data == 0x00) || (data == 0x80);
+        }
+    }
+
+    inline constexpr bool is_nan() const
+    {
+        if constexpr(MIGRAPHX_FP8_FNUZ)
+        {
+            return data == 0x80;
+        }
+        else
+        {
+            if(T == migraphx_fp8::f8_type::bf8)
+            {
+                return (data == 0x7d) || (data == 0x7e) || (data == 0x7f) || (data == 0xfd) ||
+                       (data == 0xfe) || (data == 0xff);
+            }
+            else
+            {
+                return (data == 0x79) || (data == 0x7a) || (data == 0x7b) || (data == 0x7c) ||
+                       (data == 0x7d) || (data == 0x7e) || (data == 0x7f) || (data == 0xf9) ||
+                       (data == 0xfa) || (data == 0xfb) || (data == 0xfc) || (data == 0xfd) ||
+                       (data == 0xfe) || (data == 0xff);
+            }
+        }
+    }
+
+    inline constexpr bool is_inf() const
+    {
+        if constexpr(MIGRAPHX_FP8_FNUZ)
+        {
+            return data == 0x80;
+        }
+        else
+        {
+            if(T == migraphx_fp8::f8_type::bf8)
+            {
+                return (data == 0x7c) || (data == 0xfc);
+            }
+            else
+            {
+                return (data == 0x78) || (data == 0xf8);
+            }
+        }
+    }
+
+#define MIGRAPHX_FP8_UNARY_OP(unary_op, binary_op)                                    \
+    constexpr float8& operator unary_op(const float8& rhs)                            \
+    {                                                                                 \
+        const auto tmp = static_cast<float>(*this) binary_op static_cast<float>(rhs); \
+        *this          = static_cast<float8>(tmp);                                    \
+        return *this;                                                                 \
+    }                                                                                 \
+    constexpr float8& operator unary_op(const float& rhs)                             \
+    {                                                                                 \
+        const auto tmp = static_cast<float>(*this) binary_op static_cast<float>(rhs); \
+        *this          = static_cast<float8>(tmp);                                    \
+        return *this;                                                                 \
+    }
+
+    MIGRAPHX_FP8_UNARY_OP(*=, *)
+    MIGRAPHX_FP8_UNARY_OP(-=, -)
+    MIGRAPHX_FP8_UNARY_OP(+=, +)
+    MIGRAPHX_FP8_UNARY_OP(/=, /)
+
+    inline constexpr float8& operator=(const float8& rhs) = default;
+    inline constexpr float8& operator=(float8&& rhs)      = default;
+
+    inline constexpr float8& operator=(float rhs)
+    {
+        *this = static_cast<float8>(rhs);
+        return *this;
+    }
+
+    inline constexpr bool operator==(const float8& rhs) const
+    {
+        if((rhs.is_zero() && this->is_zero()) ||
+           (fabs(rhs - *this) < migraphx_fp8::numeric_limits<float8<T>>::epsilon()))
+            return true;
+        else if(rhs.is_nan() || rhs.is_inf() || this->is_nan() || this->is_inf())
+            return false;
+
+        return false;
+    }
+
+    inline constexpr bool operator<(const float8& rhs) const
+    {
+        const auto we   = static_cast<float>(*this);
+        const auto them = static_cast<float>(rhs);
+        return we < them;
+    }
+
+    inline constexpr bool operator>(const float8& rhs) const
+    {
+        const auto we   = static_cast<float>(*this);
+        const auto them = static_cast<float>(rhs);
+        return we > them;
+    }
+};
+
+// Special operator overloading
+template <migraphx_fp8::f8_type T>
+inline std::ostream& operator<<(std::ostream& os, const migraphx_fp8::float8<T>& rhs)
+{
+    return os << static_cast<float>(rhs);
+}
+
+// NOLINTNEXTLINE
+#define MIGRAPHX_FP8_BINARY_OP(binary_op, U)                                  \
+    template <migraphx_fp8::f8_type T>                                        \
+    inline constexpr U operator binary_op(const migraphx_fp8::float8<T>& lhs, \
+                                          const migraphx_fp8::float8<T>& rhs) \
+    {                                                                         \
+        return U(static_cast<float>(lhs) binary_op static_cast<float>(rhs));  \
+    }
+
+// TODO: these should return floats
+MIGRAPHX_FP8_BINARY_OP(*, migraphx_fp8::float8<T>)
+MIGRAPHX_FP8_BINARY_OP(-, migraphx_fp8::float8<T>)
+MIGRAPHX_FP8_BINARY_OP(/, migraphx_fp8::float8<T>)
+MIGRAPHX_FP8_BINARY_OP(+, migraphx_fp8::float8<T>)
+// TODO: Comparison ops shouldn't convert to float, maybe need to take care of rounding effects.
+MIGRAPHX_FP8_BINARY_OP(==, bool)
+MIGRAPHX_FP8_BINARY_OP(>=, bool)
+MIGRAPHX_FP8_BINARY_OP(<=, bool)
+MIGRAPHX_FP8_BINARY_OP(>, bool)
+MIGRAPHX_FP8_BINARY_OP(<, bool)
+MIGRAPHX_FP8_BINARY_OP(!=, bool)
+
+template <migraphx_fp8::f8_type T>
+inline migraphx_fp8::float8<T> fabs(migraphx_fp8::float8<T> v)
+{
+    v.data = v.data & 0x7f;
+    return v;
+}
+
+template <class T>
+constexpr T F8_Max()
+{
+    return T{0x7F, T::from_bits()};
+}
+
+template <class T>
+constexpr T F8_Lowest()
+{
+    return T{0xFF, T::from_bits()};
+}
+
+using fp8e4m3fnuz = float8<migraphx_fp8::f8_type::fp8>;
+
+template <>
+class numeric_limits<migraphx_fp8::float8<migraphx_fp8::f8_type::fp8>>
+{
+    public:
+    // TODO :figure out epsilon in Hex to make it constexpr
+    static constexpr migraphx_fp8::float8<migraphx_fp8::f8_type::fp8> epsilon()
+    {
+        return migraphx_fp8::float8<migraphx_fp8::f8_type::fp8>(
+            0x28, migraphx_fp8::float8<>::from_bits());
+    }
+
+    static constexpr migraphx_fp8::float8<migraphx_fp8::f8_type::fp8> quiet_NaN()
+    {
+        return migraphx_fp8::float8<migraphx_fp8::f8_type::fp8>(
+            MIGRAPHX_FP8_FNUZ ? 0x80 : 0x7F, migraphx_fp8::float8<>::from_bits());
+    }
+
+    static constexpr migraphx_fp8::float8<migraphx_fp8::f8_type::fp8> max()
+    {
+        return migraphx_fp8::F8_Max<migraphx_fp8::float8<migraphx_fp8::f8_type::fp8>>();
+    }
+
+    // TODO figure out Hex value
+    static migraphx_fp8::float8<migraphx_fp8::f8_type::fp8> min()
+    {
+        return static_cast<migraphx_fp8::float8<migraphx_fp8::f8_type::fp8>>(-1.0f) *
+               migraphx_fp8::F8_Max<migraphx_fp8::float8<migraphx_fp8::f8_type::fp8>>();
+    }
+
+    static constexpr migraphx_fp8::float8<migraphx_fp8::f8_type::fp8> lowest()
+    {
+        return migraphx_fp8::F8_Lowest<migraphx_fp8::float8<migraphx_fp8::f8_type::fp8>>();
+    }
+
+    static constexpr migraphx_fp8::float8<migraphx_fp8::f8_type::fp8> infinity()
+    {
+        return migraphx_fp8::float8<migraphx_fp8::f8_type::fp8>(
+            MIGRAPHX_FP8_FNUZ ? 0x80 : 0x7F, migraphx_fp8::float8<>::from_bits());
+    }
+};
+
+template <>
+class numeric_limits<migraphx_fp8::float8<migraphx_fp8::f8_type::bf8>>
+{
+    public:
+    static constexpr migraphx_fp8::float8<migraphx_fp8::f8_type::bf8> epsilon()
+    {
+        return migraphx_fp8::float8<migraphx_fp8::f8_type::bf8>(
+            0x34, migraphx_fp8::float8<migraphx_fp8::f8_type::bf8>::from_bits());
+    }
+
+    static constexpr migraphx_fp8::float8<migraphx_fp8::f8_type::bf8> quiet_NaN()
+    {
+        return migraphx_fp8::float8<migraphx_fp8::f8_type::bf8>(
+            MIGRAPHX_FP8_FNUZ ? 0x80 : 0x7d,
+            migraphx_fp8::float8<migraphx_fp8::f8_type::bf8>::from_bits());
+    }
+
+    static constexpr migraphx_fp8::float8<migraphx_fp8::f8_type::bf8> max()
+    {
+        return static_cast<migraphx_fp8::float8<migraphx_fp8::f8_type::bf8>>(
+            migraphx_fp8::F8_Max<migraphx_fp8::float8<migraphx_fp8::f8_type::bf8>>());
+    }
+    // TODO figure  out constexpr value
+    static migraphx_fp8::float8<migraphx_fp8::f8_type::bf8> min()
+    {
+        return static_cast<migraphx_fp8::float8<migraphx_fp8::f8_type::bf8>>(float(-1.0f)) *
+               migraphx_fp8::F8_Max<migraphx_fp8::float8<migraphx_fp8::f8_type::bf8>>();
+    }
+    static constexpr migraphx_fp8::float8<migraphx_fp8::f8_type::bf8> lowest()
+    {
+        return migraphx_fp8::F8_Lowest<migraphx_fp8::float8<migraphx_fp8::f8_type::bf8>>();
+    }
+
+    static constexpr migraphx_fp8::float8<migraphx_fp8::f8_type::bf8> infinity()
+    {
+        return migraphx_fp8::float8<migraphx_fp8::f8_type::bf8>(
+            MIGRAPHX_FP8_FNUZ ? 0x80 : 0x7c,
+            migraphx_fp8::float8<migraphx_fp8::f8_type::bf8>::from_bits());
+    }
+};
+} // namespace migraphx_fp8
+// define numeric limits for the new data type
+namespace std {
+inline bool isfinite(migraphx_fp8::float8<migraphx_fp8::f8_type::fp8> x) // NOLINT
+{
+    return x.is_inf();
+}
+
+inline bool isfinite(migraphx_fp8::float8<migraphx_fp8::f8_type::bf8> x) // NOLINT
+{
+    return x.is_inf();
+}
+
+inline bool isnan(migraphx_fp8::float8<migraphx_fp8::f8_type::fp8> x) // NOLINT
+{
+    return x.is_nan();
+}
+
+inline bool isnan(migraphx_fp8::float8<migraphx_fp8::f8_type::bf8> x) // NOLINT
+{
+    return x.is_nan();
+}
+
+template <>
+class numeric_limits<migraphx_fp8::float8<migraphx_fp8::f8_type::fp8>>
+    : public migraphx_fp8::numeric_limits<migraphx_fp8::float8<migraphx_fp8::f8_type::fp8>>
+{
+};
+
+template <>
+class numeric_limits<migraphx_fp8::float8<migraphx_fp8::f8_type::bf8>>
+    : public migraphx_fp8::numeric_limits<migraphx_fp8::float8<migraphx_fp8::f8_type::bf8>>
+{
+};
+
+template <class T>
+struct common_type<migraphx_fp8::fp8e4m3fnuz, T> : std::common_type<float, T> // NOLINT
+{
+};
+
+template <class T>
+struct common_type<T, migraphx_fp8::fp8e4m3fnuz> : std::common_type<float, T> // NOLINT
+{
+};
+
+template <>
+struct common_type<migraphx_fp8::fp8e4m3fnuz, migraphx_fp8::fp8e4m3fnuz>
+{
+    using type = float;
+};
+
+} // namespace std
+// =================================================================================================
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif
+#endif // MIGRAPHX_GUARD_RTGLIB_FLOAT8_HPP
--- a/src/include/migraphx/shape.hpp
+++ b/src/include/migraphx/shape.hpp
@@ -34,6 +34,7 @@
 #include <migraphx/functional.hpp>
 #include <migraphx/errors.hpp>
 #include <migraphx/half.hpp>
+#include <migraphx/migraphx_float8.hpp>
 #include <migraphx/serialize.hpp>
 #include <migraphx/config.hpp>

@@ -60,7 +61,8 @@ struct MIGRAPHX_EXPORT shape
    m(int32_type, int32_t) \
    m(int64_type, int64_t) \
    m(uint32_type, uint32_t) \
-    m(uint64_type, uint64_t)
+    m(uint64_type, uint64_t) \
+    m(fp8e4m3fnuz_type, migraphx_fp8::fp8e4m3fnuz)
    // clang-format on

 #define MIGRAPHX_SHAPE_GENERATE_ENUM_TYPES(x, t) x,

--- a/src/include/migraphx/type_traits.hpp
+++ b/src/include/migraphx/type_traits.hpp
@@ -28,25 +28,45 @@
 #include <type_traits>
 #include <migraphx/half.hpp>
 #include <migraphx/config.hpp>
+#include <migraphx/migraphx_float8.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

+#define MIGRAPHX_DETAIL_DEFINE_TRAIT(trait) \
+    template <class X>                      \
+    struct trait : std::trait<X>            \
+    {                                       \
+    };
+
 #define MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(trait, T) \
-    template <class X>                             \
-    struct trait : std::trait<X>                   \
-    {                                              \
-    };                                             \
-                                                   \
    template <>                                    \
    struct trait<T> : std::true_type               \
    {                                              \
    };

+MIGRAPHX_DETAIL_DEFINE_TRAIT(is_floating_point);
+MIGRAPHX_DETAIL_DEFINE_TRAIT(is_arithmetic);
+MIGRAPHX_DETAIL_DEFINE_TRAIT(is_signed);
+
+template <class T, class U>
+struct is_same : std::is_same<T, U>
+{
+};
+
+template <bool B, class T, class U>
+struct conditional : std::conditional<B, T, U>
+{
+};
+
 MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_floating_point, half)
 MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_signed, half)
 MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_arithmetic, half)

+MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_floating_point, migraphx_fp8::fp8e4m3fnuz)
+MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_signed, migraphx_fp8::fp8e4m3fnuz)
+MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_arithmetic, migraphx_fp8::fp8e4m3fnuz)
+
 template <class T>
 using accumulator_type =
    std::conditional_t<is_floating_point<T>{},

--- a/src/py/migraphx_py.cpp
+++ b/src/py/migraphx_py.cpp
@@ -40,7 +40,7 @@
 #include <migraphx/json.hpp>
 #include <migraphx/make_op.hpp>
 #include <migraphx/op/common.hpp>
-
+#include <migraphx/migraphx_float8.hpp>
 #ifdef HAVE_GPU
 #include <migraphx/gpu/hip.hpp>
 #endif
@@ -144,6 +144,17 @@ struct npy_format_descriptor<half>
    static constexpr auto name() { return _("half"); }
 };

+template <>
+struct npy_format_descriptor<migraphx_fp8::fp8e4m3fnuz>
+{
+    static std::string format()
+    {
+        // following: https://docs.python.org/3/library/struct.html#format-characters
+        return "z";
+    }
+    static constexpr auto name() { return _("fp8e4m3fnuz"); }
+};
+
 } // namespace detail
 } // namespace pybind11


--- a/src/targets/gpu/device/include/migraphx/gpu/device/types.hpp
+++ b/src/targets/gpu/device/include/migraphx/gpu/device/types.hpp
@@ -146,20 +146,20 @@ __device__ __host__ T to_hip_type(T x)
 // Hip doens't support __fp16
 inline __device__ __host__ float to_hip_type(gpu_half x) { return x; }

-#define MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(trait, T) \
-    template <class X>                             \
-    struct trait : std::trait<X>                   \
-    {                                              \
-    };                                             \
-                                                   \
-    template <>                                    \
-    struct trait<T> : std::true_type               \
-    {                                              \
+#define MIGRAPHX_DEVICE_DETAIL_EXTEND_TRAIT_FOR(trait, T) \
+    template <class X>                                    \
+    struct trait : std::trait<X>                          \
+    {                                                     \
+    };                                                    \
+                                                          \
+    template <>                                           \
+    struct trait<T> : std::true_type                      \
+    {                                                     \
    };

-MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_floating_point, __fp16)
-MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_signed, __fp16)
-MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_arithmetic, __fp16)
+MIGRAPHX_DEVICE_DETAIL_EXTEND_TRAIT_FOR(is_floating_point, __fp16)
+MIGRAPHX_DEVICE_DETAIL_EXTEND_TRAIT_FOR(is_signed, __fp16)
+MIGRAPHX_DEVICE_DETAIL_EXTEND_TRAIT_FOR(is_arithmetic, __fp16)

 } // namespace device
 } // namespace gpu

--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -150,6 +150,7 @@ function(test_headers PREFIX)
        list(REMOVE_ITEM HEADERS
              ${CMAKE_SOURCE_DIR}/src/targets/gpu/include/migraphx/gpu/ck.hpp)
    endif()
+    list(REMOVE_ITEM HEADERS ${CMAKE_SOURCE_DIR}/src/include/migraphx/migraphx_f8_impl.hpp)
    foreach(HEADER ${HEADERS})
        file(RELATIVE_PATH HEADER_REL ${CMAKE_SOURCE_DIR} ${HEADER})
        string(MAKE_C_IDENTIFIER ${HEADER_REL} TEST_NAME)

--- a/test/float_equal.cpp
+++ b/test/float_equal.cpp
@@ -22,6 +22,7 @@
 * THE SOFTWARE.
 */
 #include <migraphx/float_equal.hpp>
+#include <migraphx/migraphx_float8.hpp>
 #include <migraphx/half.hpp>
 #include "test.hpp"

@@ -53,7 +54,7 @@ auto test_float_equal(T x, U y)
 template <class T, class U>
 void test_equality()
 {
-    auto x1 = T(0.1);
+    auto x1 = T(0.125);
    auto x2 = U(0.0);
    auto x3 = U(1.0);
    EXPECT(test_float_equal(x1, x1));
@@ -71,8 +72,12 @@ void test_equality()
 TEST_CASE_REGISTER(test_equality<double, float>);
 TEST_CASE_REGISTER(test_equality<double, int>);
 TEST_CASE_REGISTER(test_equality<double, migraphx::half>);
+TEST_CASE_REGISTER(test_equality<double, migraphx_fp8::fp8e4m3fnuz>);
 TEST_CASE_REGISTER(test_equality<float, int>);
+TEST_CASE_REGISTER(test_equality<float, migraphx_fp8::fp8e4m3fnuz>);
 TEST_CASE_REGISTER(test_equality<migraphx::half, int>);
+TEST_CASE_REGISTER(test_equality<migraphx::half, migraphx_fp8::fp8e4m3fnuz>);
+TEST_CASE_REGISTER(test_equality<migraphx_fp8::fp8e4m3fnuz, int>);

 template <class T, class U>
 void test_limits()
@@ -110,8 +115,13 @@ void test_limits()
 TEST_CASE_REGISTER(test_limits<double, float>);
 TEST_CASE_REGISTER(test_limits<double, int>);
 TEST_CASE_REGISTER(test_limits<double, migraphx::half>);
+TEST_CASE_REGISTER(test_limits<double, migraphx_fp8::fp8e4m3fnuz>);
 TEST_CASE_REGISTER(test_limits<float, int>);
+TEST_CASE_REGISTER(test_limits<float, migraphx_fp8::fp8e4m3fnuz>);
 TEST_CASE_REGISTER(test_limits<int, migraphx::half>);
+TEST_CASE_REGISTER(test_limits<int, migraphx_fp8::fp8e4m3fnuz>);
+TEST_CASE_REGISTER(test_limits<migraphx_fp8::fp8e4m3fnuz, migraphx::half>);
+
 #ifndef _WIN32
 // On Windows, types int and long have the same min and max values.
 TEST_CASE_REGISTER(test_limits<long, int>);

--- a/test/fp8e4m3fnuz.cpp
+++ b/test/fp8e4m3fnuz.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <cmath>
+#include <migraphx/float_equal.hpp>
+#include <migraphx/migraphx_float8.hpp>
+#include <migraphx/half.hpp>
+#include <migraphx/ranges.hpp>
+#include "test.hpp"
+
+#include <limits>
+
+float fp8e4m3fnuz_to_fp32_value(uint8_t input)
+{
+    constexpr std::array<float, 256> e4m3fnuz_lut = {
+        0.0f,           0.0009765625f,  0.001953125f,
+        0.0029296875f,  0.00390625f,    0.0048828125f,
+        0.005859375f,   0.0068359375f,  0.0078125f,
+        0.0087890625f,  0.009765625f,   0.0107421875f,
+        0.01171875f,    0.0126953125f,  0.013671875f,
+        0.0146484375f,  0.015625f,      0.017578125f,
+        0.01953125f,    0.021484375f,   0.0234375f,
+        0.025390625f,   0.02734375f,    0.029296875f,
+        0.03125f,       0.03515625f,    0.0390625f,
+        0.04296875f,    0.046875f,      0.05078125f,
+        0.0546875f,     0.05859375f,    0.0625f,
+        0.0703125f,     0.078125f,      0.0859375f,
+        0.09375f,       0.1015625f,     0.109375f,
+        0.1171875f,     0.125f,         0.140625f,
+        0.15625f,       0.171875f,      0.1875f,
+        0.203125f,      0.21875f,       0.234375f,
+        0.25f,          0.28125f,       0.3125f,
+        0.34375f,       0.375f,         0.40625f,
+        0.4375f,        0.46875f,       0.5f,
+        0.5625f,        0.625f,         0.6875f,
+        0.75f,          0.8125f,        0.875f,
+        0.9375f,        1.0f,           1.125f,
+        1.25f,          1.375f,         1.5f,
+        1.625f,         1.75f,          1.875f,
+        2.0f,           2.25f,          2.5f,
+        2.75f,          3.0f,           3.25f,
+        3.5f,           3.75f,          4.0f,
+        4.5f,           5.0f,           5.5f,
+        6.0f,           6.5f,           7.0f,
+        7.5f,           8.0f,           9.0f,
+        10.0f,          11.0f,          12.0f,
+        13.0f,          14.0f,          15.0f,
+        16.0f,          18.0f,          20.0f,
+        22.0f,          24.0f,          26.0f,
+        28.0f,          30.0f,          32.0f,
+        36.0f,          40.0f,          44.0f,
+        48.0f,          52.0f,          56.0f,
+        60.0f,          64.0f,          72.0f,
+        80.0f,          88.0f,          96.0f,
+        104.0f,         112.0f,         120.0f,
+        128.0f,         144.0f,         160.0f,
+        176.0f,         192.0f,         208.0f,
+        224.0f,         240.0f,         std::numeric_limits<float>::quiet_NaN(),
+        -0.0009765625f, -0.001953125f,  -0.0029296875f,
+        -0.00390625f,   -0.0048828125f, -0.005859375f,
+        -0.0068359375f, -0.0078125f,    -0.0087890625f,
+        -0.009765625f,  -0.0107421875f, -0.01171875f,
+        -0.0126953125f, -0.013671875f,  -0.0146484375f,
+        -0.015625f,     -0.017578125f,  -0.01953125f,
+        -0.021484375f,  -0.0234375f,    -0.025390625f,
+        -0.02734375f,   -0.029296875f,  -0.03125f,
+        -0.03515625f,   -0.0390625f,    -0.04296875f,
+        -0.046875f,     -0.05078125f,   -0.0546875f,
+        -0.05859375f,   -0.0625f,       -0.0703125f,
+        -0.078125f,     -0.0859375f,    -0.09375f,
+        -0.1015625f,    -0.109375f,     -0.1171875f,
+        -0.125f,        -0.140625f,     -0.15625f,
+        -0.171875f,     -0.1875f,       -0.203125f,
+        -0.21875f,      -0.234375f,     -0.25f,
+        -0.28125f,      -0.3125f,       -0.34375f,
+        -0.375f,        -0.40625f,      -0.4375f,
+        -0.46875f,      -0.5f,          -0.5625f,
+        -0.625f,        -0.6875f,       -0.75f,
+        -0.8125f,       -0.875f,        -0.9375f,
+        -1.0f,          -1.125f,        -1.25f,
+        -1.375f,        -1.5f,          -1.625f,
+        -1.75f,         -1.875f,        -2.0f,
+        -2.25f,         -2.5f,          -2.75f,
+        -3.0f,          -3.25f,         -3.5f,
+        -3.75f,         -4.0f,          -4.5f,
+        -5.0f,          -5.5f,          -6.0f,
+        -6.5f,          -7.0f,          -7.5f,
+        -8.0f,          -9.0f,          -10.0f,
+        -11.0f,         -12.0f,         -13.0f,
+        -14.0f,         -15.0f,         -16.0f,
+        -18.0f,         -20.0f,         -22.0f,
+        -24.0f,         -26.0f,         -28.0f,
+        -30.0f,         -32.0f,         -36.0f,
+        -40.0f,         -44.0f,         -48.0f,
+        -52.0f,         -56.0f,         -60.0f,
+        -64.0f,         -72.0f,         -80.0f,
+        -88.0f,         -96.0f,         -104.0f,
+        -112.0f,        -120.0f,        -128.0f,
+        -144.0f,        -160.0f,        -176.0f,
+        -192.0f,        -208.0f,        -224.0f,
+        -240.0f,
+    };
+
+    return e4m3fnuz_lut[input];
+}
+
+TEST_CASE(test_fp8_cast_to_float)
+{
+    std::vector<uint8_t> bit_vals(256);
+    std::iota(bit_vals.begin(), bit_vals.end(), 0);
+    EXPECT(bool{std::all_of(bit_vals.begin(), bit_vals.end(), [](uint8_t bit_val) {
+        migraphx_fp8::fp8e4m3fnuz fp8_val(bit_val, migraphx_fp8::fp8e4m3fnuz::from_bits());
+        if(std::isnan(float(fp8_val)) and std::isnan(fp8e4m3fnuz_to_fp32_value(bit_val)))
+        {
+            return true;
+        }
+        return migraphx::float_equal(float(fp8_val), fp8e4m3fnuz_to_fp32_value(bit_val));
+    })});
+}
+
+TEST_CASE(test_positive_zero)
+{
+    float zero = 0.0;
+    migraphx_fp8::fp8e4m3fnuz fp8_zero(zero);
+    EXPECT(fp8_zero.is_zero());
+    EXPECT(migraphx::float_equal(zero, float(fp8_zero)));
+}
+
+TEST_CASE(test_negative_zero)
+{
+    float nzero = -0.0;
+    float pzero = 0.0;
+    migraphx_fp8::fp8e4m3fnuz fp8_nzero(nzero);
+    EXPECT(fp8_nzero.is_zero());
+    //  negative zero gets converted to positive zero
+    EXPECT(migraphx::float_equal(pzero, float(fp8_nzero)));
+}
+
+TEST_CASE(test_nan_1)
+{
+    float fnan = std::numeric_limits<float>::quiet_NaN();
+    migraphx_fp8::fp8e4m3fnuz fp8_nan(fnan);
+    EXPECT(fp8_nan.is_nan());
+    EXPECT(std::isnan(fp8_nan));
+}
+
+TEST_CASE(test_nan_2)
+{
+    auto fnan = std::numeric_limits<migraphx_fp8::fp8e4m3fnuz>::quiet_NaN();
+    migraphx_fp8::fp8e4m3fnuz fp8_nan(fnan.data, migraphx_fp8::fp8e4m3fnuz::from_bits());
+    EXPECT(fp8_nan.is_nan());
+    EXPECT(std::isnan(fp8_nan));
+    EXPECT(std::isnan(float(fp8_nan)));
+}
+
+TEST_CASE(test_infinity_1)
+{
+    float finf = std::numeric_limits<float>::infinity();
+    // no inf in fp8e4m3fnuz
+    migraphx_fp8::fp8e4m3fnuz fp8_nan(finf);
+    EXPECT(fp8_nan.is_nan());
+    EXPECT(std::isnan(float(fp8_nan)));
+}
+
+TEST_CASE(test_infinity_2)
+{
+    // no inf in fp8e4m3fnuz, it gets converted to NaNs
+    migraphx_fp8::fp8e4m3fnuz fp8_nan(std::numeric_limits<migraphx_fp8::fp8e4m3fnuz>::infinity());
+    EXPECT(fp8_nan.is_nan());
+    EXPECT(std::isnan(float(fp8_nan)));
+}
+
+TEST_CASE(test_infinity_3)
+{
+    // neg inf
+    float finf = -1.0 * std::numeric_limits<float>::infinity();
+    // no inf in fp8e4m3fnuz
+    migraphx_fp8::fp8e4m3fnuz fp8_nan(finf);
+    EXPECT(fp8_nan.is_nan());
+    EXPECT(std::isnan(float(fp8_nan)));
+}
+
+TEST_CASE(test_numeric_max_1)
+{
+    float fmax = std::numeric_limits<float>::max();
+    migraphx_fp8::fp8e4m3fnuz fp8_max(fmax);
+    EXPECT(fp8_max == std::numeric_limits<migraphx_fp8::fp8e4m3fnuz>::max());
+}
+
+TEST_CASE(test_numeric_max_2)
+{
+    // gets clipped to max
+    float fmax = 2 * std::numeric_limits<migraphx_fp8::fp8e4m3fnuz>::max();
+    migraphx_fp8::fp8e4m3fnuz fp8_max(fmax);
+    EXPECT(fp8_max == std::numeric_limits<migraphx_fp8::fp8e4m3fnuz>::max());
+}
+
+TEST_CASE(test_numeric_lowest_1)
+{
+    float flowest = std::numeric_limits<float>::lowest();
+    migraphx_fp8::fp8e4m3fnuz fp8_lowest(flowest);
+    EXPECT(fp8_lowest == std::numeric_limits<migraphx_fp8::fp8e4m3fnuz>::lowest());
+}
+
+TEST_CASE(test_numeric_lowest_2)
+{
+    // gets clipped to lowest
+    float fmin = 2.0 * std::numeric_limits<migraphx_fp8::fp8e4m3fnuz>::lowest();
+    migraphx_fp8::fp8e4m3fnuz fp8_lowest(fmin);
+    EXPECT(fp8_lowest == std::numeric_limits<migraphx_fp8::fp8e4m3fnuz>::lowest());
+}
+
+int main(int argc, const char* argv[]) { test::run(argc, argv); }
--- a/test/gpu/jit.cpp
+++ b/test/gpu/jit.cpp
@@ -237,12 +237,12 @@ TEST_CASE(code_object_hip)

    std::vector<migraphx::shape> expected_inputs = {input, input};
    auto co                                      = migraphx::make_op("gpu::code_object",
-                                {{"code_object", migraphx::value::binary{binaries.front()}},
-                                 {"symbol_name", "add_2"},
-                                 {"global", input.elements()},
-                                 {"local", 1024},
-                                 {"expected_inputs", migraphx::to_value(expected_inputs)},
-                                 {"output", migraphx::to_value(input)}});
+                                                                     {{"code_object", migraphx::value::binary{binaries.front()}},
+                                                                      {"symbol_name", "add_2"},
+                                                                      {"global", input.elements()},
+                                                                      {"local", 1024},
+                                                                      {"expected_inputs", migraphx::to_value(expected_inputs)},
+                                                                      {"output", migraphx::to_value(input)}});

    migraphx::program p;
    auto* mm            = p.get_main_module();
@@ -348,7 +348,10 @@ TEST_CASE(compile_math)
    auto vec_sizes = {2, 4, 6};
    for(auto&& t : migraphx::shape::types())
    {
-        if(contains({migraphx::shape::bool_type, migraphx::shape::tuple_type}, t))
+        if(contains({migraphx::shape::bool_type,
+                     migraphx::shape::fp8e4m3fnuz_type,
+                     migraphx::shape::tuple_type},
+                    t))
            continue;
        auto name = migraphx::shape::cpp_type(t);
        if(t == migraphx::shape::half_type)
@@ -396,7 +399,10 @@ TEST_CASE(assert_type_min_max)
    migraphx::gpu::hip_compile_options options;
    for(auto&& t : migraphx::shape::types())
    {
-        if(contains({migraphx::shape::bool_type, migraphx::shape::tuple_type}, t))
+        if(contains({migraphx::shape::bool_type,
+                     migraphx::shape::fp8e4m3fnuz_type,
+                     migraphx::shape::tuple_type},
+                    t))
            continue;
        auto name = migraphx::shape::cpp_type(t);
        if(t == migraphx::shape::half_type)

--- a/tools/api/migraphx.h
+++ b/tools/api/migraphx.h
@@ -44,7 +44,8 @@
    m(int32_type, int32_t) \
    m(int64_type, int64_t) \
    m(uint32_type, uint32_t) \
-    m(uint64_type, uint64_t)
+    m(uint64_type, uint64_t) \
+    m(fp8e4m3fnuz_type, migraphx_fp8::fp8e4m3fnuz)
 // clang-format on

 #ifdef __cplusplus
@@ -70,7 +71,9 @@ typedef enum
 } migraphx_shape_datatype_t;
 #undef MIGRAPHX_SHAPE_GENERATE_ENUM_TYPES

-<% generate_c_header() %>
+<%
+    generate_c_header()
+%>

 #ifdef __cplusplus
 }