port gpu changes

a9dd42f7 · Umang Yadav · 9e6d866d · a9dd42f7 · a9dd42f7 · a9dd42f7
Commit a9dd42f7 authored Nov 17, 2023 by Umang Yadav
17 changed files
--- a/src/targets/gpu/compile_gen.cpp
+++ b/src/targets/gpu/compile_gen.cpp
@@ -54,6 +54,11 @@ vectorize vectorize::elements(std::size_t axis,
                              const std::vector<shape>& inputs,
                              const std::vector<std::size_t>& sizes)
 {
+    // disable vectorization for fp8 types
+    if(std::any_of(inputs.begin(), inputs.end(), [&](auto ishape) {
+           return ishape.type() == migraphx::shape::fp8e4m3fnuz_type;
+       }))
+        return {1, axis};
    if(std::all_of(
           inputs.begin(), inputs.end(), [&](const auto& s) { return s.lens()[axis] == 1; }))
        return {1, axis};
@@ -86,6 +91,11 @@ vectorize vectorize::elements(std::size_t axis,
 vectorize vectorize::elements(context& ctx, std::size_t axis, const std::vector<shape>& inputs)
 {
+    // disable vectorization for fp8 types
+    if(std::any_of(inputs.begin(), inputs.end(), [&](auto ishape) {
+           return ishape.type() == migraphx::shape::fp8e4m3fnuz_type;
+       }))
+        return {1, axis};
    if(inputs.empty())
        return {1, axis};
    std::size_t n = std::max_element(inputs.begin(),
@@ -305,7 +315,7 @@ std::string generate_reduce(const module& m, const std::string& name)
            std::transform(
                params.begin(), params.end(), params.begin(), [](auto s) { return "auto " + s; });
            return interpolate_string(inner_template,
-                                      {{"inner", inner_name},
+                                             {{"inner", inner_name},
                                       {"params", join_strings(params, ", ")},
                                       {"args", join_strings(args, ", ")},
                                       {"call", call_function}});

--- a/src/targets/gpu/compile_hip.cpp
+++ b/src/targets/gpu/compile_hip.cpp
@@ -199,7 +199,7 @@ std::vector<std::vector<char>> compile_hip_src_with_hiprtc(std::vector<hiprtc_sr
 {
    hiprtc_program prog(std::move(srcs));
    auto options = split_string(params, ' ');
-    options.push_back("-DMIGRAPHX_USE_HIPRTC=1");
+    options.push_back("-DMIGRAPHX_JIT_USE_HIPRTC=1");
    // remove following three compilation flags for HIPRTC once fixes from hipRTC are available in
    if(enabled(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS{}))
    {
@@ -251,21 +251,10 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
            std::cout << std::string(src.content) << std::endl;
        }
    }
-    auto fname = fs::path{"migraphx-hiprtc-driver"};
-#ifdef _WIN32
-    fname.replace_extension(".exe");
-#endif
    auto p      = dynamic_loader::path(&compile_hip_src_with_hiprtc);
-    auto driver = p.parent_path() / fname;
+    auto driver = p.parent_path().parent_path() / "bin" / "migraphx-hiprtc-driver";
-    bool found = fs::exists(driver);
-    if(not found)
-    {
-        driver = p.parent_path().parent_path() / "bin" / fname;
-        found  = fs::exists(driver);
-    }
-    if(found)
+    if(fs::exists(driver))
    {
        value v;
        v["srcs"]   = to_value(hsrcs);

--- a/src/targets/gpu/compile_hip_code_object.cpp
+++ b/src/targets/gpu/compile_hip_code_object.cpp
@@ -197,6 +197,7 @@ operation compile_hip_code_object(const std::string& content, hip_compile_option
    options.params += " -DMIGRAPHX_NGLOBAL=" + std::to_string(options.global);
    options.params += " -DMIGRAPHX_NLOCAL=" + std::to_string(options.local);
+    options.params += " -D__HIP_NO_F8_CONVERSIONS__=1";
    options.params += " " + join_strings(compiler_warnings(), " ");
    options.params += " -ftemplate-backtrace-limit=0";
    options.params += " -Werror";

--- a/src/targets/gpu/kernels/include/migraphx/kernels/bit_cast.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/bit_cast.hpp
+/* ************************************************************************
+ * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+ * ies of the Software, and to permit persons to whom the Software is furnished
+ * to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+ * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+ * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ************************************************************************ */
+#ifndef MIGRAPHX_GUARD_KERNELS_BITCAST_HPP
+#define MIGRAPHX_GUARD_KERNELS_BITCAST_HPP
+namespace migraphx {
+template <typename To, typename From> // reinterpret the bits of a From value as a To value
+inline constexpr To bit_cast(From fr) noexcept
+{
+    static_assert(sizeof(To) == sizeof(From)); // both types must have exactly the same size
+    return __builtin_bit_cast(To, fr);         // delegates to the compiler builtin
+}
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_KERNELS_BITCAST_HPP
--- a/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp
--- a/src/targets/gpu/kernels/include/migraphx/kernels/float8_impl.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/float8_impl.hpp
+/* ************************************************************************
+ * Copyright (C) 2016-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
+ * ies of the Software, and to permit persons to whom the Software is furnished
+ * to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
+ * PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
+ * CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ************************************************************************ */
+#ifndef MIGRAPHX_GUARD_KERNELS_FP8_IMPL_HPP
+#define MIGRAPHX_GUARD_KERNELS_FP8_IMPL_HPP
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-identifier"
+#endif
+#define CONST_FOLD(x) (__builtin_constant_p(x) ? (x) : (x)) // both arms yield x; used on the non-clang GCC path of detail::bit_cast below, presumably to coax constant folding - TODO confirm
+namespace migraphx {
+namespace detail {
+template <bool B, class T, class F> // compile-time type selector: conditional<B, T, F>::type
+struct conditional                  // primary template, used when B is true
+{
+    using type = T;
+};
+template <class T, class F>
+struct conditional<false, T, F> // partial specialization selected when B is false
+{
+    using type = F;
+};
+template <typename To, typename From> // reinterpret the bits of fr as a To; NOTE(review): duplicates migraphx::bit_cast from kernels/bit_cast.hpp added in this same commit
+inline constexpr To bit_cast(From fr) noexcept
+{
+    static_assert(sizeof(To) == sizeof(From)); // both types must have exactly the same size
+#if defined(__GNUC__) and !defined(__clang__) // GCC but not clang
+    To x = CONST_FOLD(*reinterpret_cast<To*>(&fr)); // NOTE(review): pointer-cast type punning; confirm this is acceptable under strict aliasing
+#else
+    To x = __builtin_bit_cast(To, fr); // every other compiler goes through the builtin
+#endif
+    return x;
+}
+} // namespace detail
+namespace fp8 {
+namespace impl {
+// #ifdef __HIP_PLATFORM_HCC__
+// __device__ inline int clz(uint32_t x) { return __clz(x); }
+// #else
+// __host__ inline int clz(uint32_t x) { return __builtin_clz(x); }
+// #endif
+template <int wm, int we, typename T, bool negative_zero_nan, bool clip> // wm/we: f8 mantissa/exponent bit widths
+MIGRAPHX_HIP_HOST_DEVICE constexpr uint8_t cast_to_f8(T _x, bool stoch, uint32_t rng) // encodes _x as an f8 byte
+{
+    static_assert(wm + we == 7, "wm+we==7"); // the remaining bit of the byte is the sign
+    const int mfmt = (sizeof(T) == 4) ? 23 : 10; // mantissa width of the source format
+    typename migraphx::detail::conditional<sizeof(T) == 2, uint16_t, uint32_t>::type x; // raw bits of _x
+    if constexpr(sizeof(T) == 4)
+        x = migraphx::detail::bit_cast<uint32_t>(_x);
+    else
+        x = migraphx::detail::bit_cast<uint16_t>(_x);
+    uint32_t head, mantissa;
+    int exponent, bias;
+    uint32_t sign;
+    if constexpr(sizeof(T) == 4)
+    {
+        head     = x & 0xFF800000; // sign + exponent field of a 32-bit input
+        mantissa = x & 0x7FFFFF;
+        exponent = (head >> 23) & 0xFF;
+        sign     = head >> 31;
+        bias     = 127;
+    }
+    else
+    {
+        head     = x & 0xFC00; // sign + exponent field of a 16-bit input
+        mantissa = x & 0x3FF;
+        exponent = (head >> 10) & 0x1F;
+        sign     = head >> 15;
+        bias     = 15;
+    }
+    uint32_t signed_inf = (sign << 7) + (((1 << we) - 1) << wm); // sign bit plus all-ones exponent
+    // Deal with inf and NaNs
+    if(negative_zero_nan)
+    {
+        if(sizeof(T) == 4)
+        {
+            if((x & 0x7F800000) == 0x7F800000)
+                return 0x80; // inf and NaN inputs both map to 0x80 in this mode
+        }
+        else
+        {
+            // if(__hisinf(x) || __hisnan(x))
+            if((x & 0x7C00) == 0x7C00)
+                return 0x80; // inf and NaN inputs both map to 0x80 in this mode
+        }
+    }
+    else
+    {
+        if(sizeof(T) == 4)
+        {
+            if((x & 0x7F800000) == 0x7F800000)
+                return signed_inf + (mantissa != 0 ? 1 : 0); // NaN sets the lowest mantissa bit, inf leaves it clear
+        }
+        else
+        {
+            if((x & 0x7C00) == 0x7C00)
+                return signed_inf + (mantissa != 0 ? 1 : 0); // NaN sets the lowest mantissa bit, inf leaves it clear
+        }
+    }
+    // handle positive zero
+    if(x == 0)
+        return 0;
+    // handle negative zero
+    if((sizeof(T) == 4 and x == 0x80000000) or (sizeof(T) == 2 and x == 0x8000))
+    {
+        if(negative_zero_nan)
+        {
+            return 0; // 0x80 is taken by NaN in this mode, so -0 becomes +0
+        }
+        else
+        {
+            return 0x80;
+        }
+    }
+    // First need to check if it is normal or denorm as there is a difference of implicit 1
+    // Then need to adjust the exponent to align with the F8 exponent, in the meanwhile, shift
+    // The mantissa. Then for stochastic rounding, add rng to mantissa and truncate. And for
+    // RNE, no need to add rng. Then probably need to check whether there is carry and adjust
+    // exponent and mantissa again
+    // For IEEE bias mode, the bias is 2^(k-1) -1 where k is the width of exponent bits
+    const int f8_bias                  = (1 << (we - 1)) - 1 + (negative_zero_nan ? 1 : 0);
+    const int f8_denormal_act_exponent = 1 - f8_bias; // actual exponent of f8 denormal
+    // act_exponent is the actual exponent of fp32/fp16 (after subtracting bias)
+    // f8_exponent is the converted f8 exponent with bias encoding
+    // exponent_diff is the diff between fp32/fp16 exponent and f8 exponent,
+    // the difference needs to be adjusted and mantissa shifted
+    int act_exponent, f8_exponent, exponent_diff;
+    if(exponent == 0)
+    { // fp32/fp16 is in denormal.
+        /* fp32 denormal is below 2^-127 so it is usually not a concern here, we are mostly concerned with fp16
+here. In this case, f8 is usually in denormal. But there could be exceptions. fp16 denormal has
+exponent bias 15 while bf8 with NANOO has exponent bias 16. It means that there are some numbers in
+fp16 denormal but they are bf8 (NANOO) normals - smallest bf8 (NANOO) normal is 2^-15. fp16 numbers
+where exponent==0 (actual exponent -14) and highest bit of mantissa is 1 are bf8 (NANOO) normal. In
+this case, the fp16 mantissa should be shifted left by 1  */
+        act_exponent  = exponent - bias + 1;
+        exponent_diff = f8_denormal_act_exponent -
+                        act_exponent; // actual exponent is exponent-bias+1 as it is denormal
+    }
+    else
+    { // fp32/fp16 is normal with implicit 1
+        act_exponent = exponent - bias;
+        if(act_exponent <= f8_denormal_act_exponent)
+        {
+            /* This is the case where fp32/fp16 is normal but it is in f8 denormal range.
+   For example fp8 nanoo mode, denormal exponent is -7, but if the fp32/fp16
+   actual exponent is -7, it is actually larger due to the implicit 1,
+   Therefore it needs to be adjusted to -6 and the mantissa shifted right by 1.
+   So for fp32/fp16, exponent -8 is the cut point to convert to fp8 nanoo */
+            exponent_diff = f8_denormal_act_exponent - act_exponent;
+        }
+        else
+        {          // both fp32/fp16 and f8 are in normal range
+            exponent_diff =
+                0; // exponent_diff=0 does not mean there is no difference for this case,
+            // act_exponent could be larger. Just that it does not need shift mantissa
+        }
+        mantissa += (1 << mfmt); // Add the implicit 1 into mantissa
+    }
+    bool midpoint = (mantissa & ((1 << (mfmt - wm + exponent_diff)) - 1)) ==
+                    (1 << (mfmt - wm + exponent_diff - 1)); // NOTE(review): exponent_diff is not bounded above, so these shift counts (and the >>= below) can reach 32 or more for very small inputs - confirm/guard
+    /* This part is a bit tricky. The judgment of whether it is a tie needs to be done before we
+ shift right as shift right could rip off some residual part and make something not midpoint look
+ like midpoint. For example, the fp16 number 0x1002 (0 00100 0000000010), it is larger than
+ midpoint, but after shift right by 4 bits, it would look like midpoint.
+*/
+    if(exponent_diff > 0)
+        mantissa >>= exponent_diff;
+    else if(exponent_diff == -1)
+        mantissa <<= -exponent_diff; // the fp16-denormal-but-f8-normal case described in the denormal branch above
+    bool implicit_one = mantissa & (1 << mfmt);
+    // if there is no implicit 1, it means the f8 is denormal and need to adjust to denorm exponent
+    f8_exponent =
+        (act_exponent + exponent_diff) /*actual f8 exponent*/ + f8_bias - (implicit_one ? 0 : 1);
+    // Now we have the exponent and mantissa adjusted. The rounding below adds the to-be-dropped bits (or rng when stoch) so that a carry into the kept bits rounds up; exact ties go to even.
+    uint32_t drop_mask = (1 << (mfmt - wm)) - 1;
+    bool odd =
+        mantissa & (1 << (mfmt - wm)); // if the least significant bit that is not truncated is 1
+    mantissa += (stoch ? rng : (midpoint ? (odd ? mantissa : mantissa - 1) : mantissa)) & drop_mask;
+    // Now we deal with overflow
+    if(f8_exponent == 0)
+    {
+        if((1 << mfmt) & mantissa)
+        {
+            f8_exponent = 1; // denormal overflow to become normal, promote exponent
+        }
+    }
+    else
+    {
+        if((1 << (mfmt + 1)) & mantissa)
+        {
+            mantissa >>= 1; // rounding carried past the implicit 1: renormalize
+            f8_exponent++;
+        }
+    }
+    mantissa >>= (mfmt - wm); // drop the bits that do not fit in the f8 mantissa
+    // above range: quantize to maximum possible float of the same sign
+    const int max_exp = (1 << we) - (negative_zero_nan ? 1 : 2); // the all-ones exponent is only usable when it is not reserved for inf/NaN
+    if(f8_exponent > max_exp)
+    {
+        if(clip)
+        {
+            mantissa    = (1 << wm) - 1; // saturate to the largest mantissa/exponent pair
+            f8_exponent = max_exp;
+        }
+        else
+        {
+            return signed_inf;
+        }
+    }
+    if(f8_exponent == 0 && mantissa == 0)
+        return negative_zero_nan ? 0 : (sign << 7); // result rounded to zero; keep the sign only when -0 is representable
+    mantissa &= (1 << wm) - 1;
+    return (sign << 7) | (f8_exponent << wm) | mantissa;
+}
+template <int wm, int we, typename T, bool negative_zero_nan> // wm/we: f8 mantissa/exponent bit widths
+MIGRAPHX_HIP_HOST_DEVICE constexpr T cast_from_f8(uint8_t x) // decodes the f8 byte x into a T
+{
+    constexpr int weo = 8;  // output exponent width; fixed at the 32-bit float layout regardless of T (see TODO below)
+    constexpr int wmo = 23; // output mantissa width; same caveat
+    T fInf, fNegInf, fNaN, fNeg0;
+    uint32_t ifInf    = 0x7F800000;
+    uint32_t ifNegInf = 0xFF800000;
+    uint32_t ifNaN    = 0x7F800001;
+    uint32_t ifNeg0   = 0x80000000;
+    // TODO: need to change T for half but right now it would never be called with half
+    fInf    = migraphx::detail::bit_cast<float>(ifInf);
+    fNegInf = migraphx::detail::bit_cast<float>(ifNegInf);
+    fNaN    = migraphx::detail::bit_cast<float>(ifNaN);
+    fNeg0   = migraphx::detail::bit_cast<float>(ifNeg0);
+    if(x == 0)
+        return 0;
+    uint32_t sign     = x >> 7;
+    uint32_t mantissa = x & ((1 << wm) - 1);
+    int exponent      = (x & 0x7F) >> wm;
+    if(negative_zero_nan)
+    {
+        if(x == 0x80)
+            return fNaN; // 0x80 is the NaN encoding in this mode
+    }
+    else
+    {
+        if(x == 0x80)
+            return fNeg0;
+        if(exponent == ((1 << we) - 1))
+            return (mantissa == 0) ? (sign ? fNegInf : fInf) : fNaN; // all-ones exponent: inf when the mantissa is zero, NaN otherwise
+    }
+    typename migraphx::detail::conditional<sizeof(T) == 2, uint16_t, uint32_t>::type retval;
+    const int exp_low_cutoff = (1 << (weo - 1)) - (1 << (we - 1)) + 1 - (negative_zero_nan ? 1 : 0);
+    // subnormal input
+    if(exponent == 0)
+    {
+        // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above
+        int sh = 1 + __builtin_clz(mantissa) - (32 - wm); // shift that moves the leading 1 up to bit wm (the implicit-one position)
+        mantissa <<= sh;
+        exponent += 1 - sh;
+        mantissa &= ((1 << wm) - 1); // drop the now-implicit leading 1
+    }
+    exponent += exp_low_cutoff - 1;
+    mantissa <<= wmo - wm;
+    // subnormal output (occurs when T=half, we=5, negative_zero_nan=true)
+    if(exponent <= 0)
+    {
+        mantissa |= 1 << wmo;
+        mantissa >>= 1 - exponent;
+        exponent = 0;
+    }
+    if(sizeof(T) == 2)
+        retval = (sign << 15) | (exponent << 10) | mantissa; // NOTE(review): mantissa was shifted for wmo=23 above, not 10; consistent with the TODO that half is not really supported yet - confirm
+    else
+        retval = (sign << 31) | (exponent << 23) | mantissa;
+    return migraphx::detail::bit_cast<T>(retval);
+}
+} // namespace impl
+} // namespace fp8
+} // namespace migraphx
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif
+#endif // MIGRAPHX_GUARD_KERNELS_FP8_IMPL_HPP
--- a/src/targets/gpu/kernels/include/migraphx/kernels/hip.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/hip.hpp
@@ -24,7 +24,7 @@
 #ifndef MIGRAPHX_GUARD_KERNELS_HIP_HPP
 #define MIGRAPHX_GUARD_KERNELS_HIP_HPP
-#ifndef MIGRAPHX_USE_HIPRTC
+#ifndef MIGRAPHX_JIT_USE_HIPRTC
 #include <hip/hip_runtime.h>
 #include <hip/hip_fp16.h>
 #include <hip/math_functions.h>

--- a/src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
@@ -34,6 +34,9 @@ namespace migraphx {
 namespace math {
 constexpr float as_float(migraphx::half x) { return x; }
+constexpr float as_float(migraphx::fp8::fp8e4m3fnuz x) { return x; }
 template <class T>
 constexpr T as_float(T x)
 {
@@ -57,14 +60,14 @@ constexpr T as_float(T x)
 // NOLINTNEXTLINE
 #define MIGRAPHX_DEVICE_MATH_FOR(type, name, fname)                    \
    template <class... Ts, MIGRAPHX_REQUIRES(not is_any_vec<Ts...>())> \
-    auto __device__ name(type x, Ts... xs)->type                       \
+    auto __device__ name(type x, Ts... xs) -> type                     \
    {                                                                  \
        return fname(x, xs...);                                        \
    }
 // NOLINTNEXTLINE
 #define MIGRAPHX_DEVICE_MATH_BINARY_FOR(type, name, fname) \
-    inline auto __device__ name(type x, type y)->type { return fname(x, y); }
+    inline auto __device__ name(type x, type y) -> type { return fname(x, y); }
 // NOLINTNEXTLINE
 #define MIGRAPHX_DEVICE_MATH_HALF(name, fname)                         \
@@ -72,6 +75,20 @@ constexpr T as_float(T x)
    auto __device__ name(migraphx::half x, Ts... xs)                   \
        MIGRAPHX_RETURNS(fname(math::as_float(x), math::as_float(xs)...))
+// NOLINTNEXTLINE
+#define MIGRAPHX_DEVICE_MATH_FP8(name, fname)                                      \
+    template <class... Ts, MIGRAPHX_REQUIRES(not is_any_vec<Ts...>())>             \
+    auto __device__ name(migraphx::fp8::fp8e4m3fnuz x, Ts... xs) MIGRAPHX_RETURNS( \
+        migraphx::fp8::fp8e4m3fnuz(fname(math::as_float(x), math::as_float(xs)...)))
+// NOLINTNEXTLINE
+#define MIGRAPHX_DEVICE_MATH_BINARY_FOR_FP8(name, fname)                                    \
+    inline auto __device__ name(migraphx::fp8::fp8e4m3fnuz x, migraphx::fp8::fp8e4m3fnuz y) \
+        -> migraphx::fp8::fp8e4m3fnuz                                                       \
+    {                                                                                       \
+        return migraphx::fp8::fp8e4m3fnuz(fname(math::as_float(x), math::as_float(y)));     \
+    }
 // Template with two overloads for math functions, one for half2 type and one for more generic
 // <half, N> vectorization where N is 4 or another even number.
@@ -162,6 +179,33 @@ MIGRAPHX_DEVICE_MATH_HALF(tan, ::tan)
 MIGRAPHX_DEVICE_MATH_HALF(tanh, ::tanh)
 MIGRAPHX_DEVICE_MATH_HALF(fmod, ::fmod)
+// use float to compute fp8 overload
+MIGRAPHX_DEVICE_MATH_FP8(abs, ::abs)
+MIGRAPHX_DEVICE_MATH_FP8(acos, ::acos)
+MIGRAPHX_DEVICE_MATH_FP8(acosh, ::acosh)
+MIGRAPHX_DEVICE_MATH_FP8(asin, ::asin)
+MIGRAPHX_DEVICE_MATH_FP8(asinh, ::asinh)
+MIGRAPHX_DEVICE_MATH_FP8(atan, ::atan)
+MIGRAPHX_DEVICE_MATH_FP8(atanh, ::atanh)
+MIGRAPHX_DEVICE_MATH_FP8(ceil, ::ceil)
+MIGRAPHX_DEVICE_MATH_FP8(cos, ::cos)
+MIGRAPHX_DEVICE_MATH_FP8(cosh, ::cosh)
+MIGRAPHX_DEVICE_MATH_FP8(erf, ::erf)
+MIGRAPHX_DEVICE_MATH_FP8(exp, ::exp)
+MIGRAPHX_DEVICE_MATH_FP8(floor, ::floor)
+MIGRAPHX_DEVICE_MATH_FP8(isnan, ::isnan)
+MIGRAPHX_DEVICE_MATH_FP8(log, ::log)
+MIGRAPHX_DEVICE_MATH_FP8(pow, ::pow)
+MIGRAPHX_DEVICE_MATH_FP8(remainder, ::remainder)
+MIGRAPHX_DEVICE_MATH_FP8(round, ::round)
+MIGRAPHX_DEVICE_MATH_FP8(rsqrt, ::rsqrt)
+MIGRAPHX_DEVICE_MATH_FP8(sin, ::sin)
+MIGRAPHX_DEVICE_MATH_FP8(sinh, ::sinh)
+MIGRAPHX_DEVICE_MATH_FP8(sqrt, ::sqrt)
+MIGRAPHX_DEVICE_MATH_FP8(tan, ::tan)
+MIGRAPHX_DEVICE_MATH_FP8(tanh, ::tanh)
+MIGRAPHX_DEVICE_MATH_FP8(fmod, ::fmod)
 // Map math functions to hip half2 functions
 // The half2 type is defined in include/hip/amd_detail/hip_fp16_gcc.h and is 2 16-bit floats
 // packed into a 32-bit number.  See include/hip/amd_detail/hip_fp16_math_fwd.h for the HIP names
@@ -195,6 +239,9 @@ MIGRAPHX_DEVICE_MATH_BINARY_FOR(double, min, ::min)
 MIGRAPHX_DEVICE_MATH_BINARY_FOR(migraphx::half, max, ::__hmax)
 MIGRAPHX_DEVICE_MATH_BINARY_FOR(migraphx::half, min, ::__hmin)
+MIGRAPHX_DEVICE_MATH_BINARY_FOR_FP8(max, ::max)
+MIGRAPHX_DEVICE_MATH_BINARY_FOR_FP8(min, ::min)
 template <class T, MIGRAPHX_REQUIRES(not is_any_vec<T>())>
 constexpr auto max(const T& a, const T& b)
 {

--- a/src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp
@@ -26,6 +26,7 @@
 #include <migraphx/kernels/types.hpp>
 #include <migraphx/kernels/integral_constant.hpp>
+#include <migraphx/kernels/float8.hpp>
 namespace migraphx {
@@ -230,7 +231,8 @@ constexpr unsigned long int_max(unsigned long n)
 template <class T,
          MIGRAPHX_REQUIRES(is_integral<T>{} or is_floating_point<T>{} or
-                            is_same<T, migraphx::half>{})>
+                            is_same<T, migraphx::half>{} or
+                            is_same<T, migraphx::fp8::fp8e4m3fnuz>{})>
 constexpr T numeric_max()
 {
    if constexpr(is_integral<T>{})
@@ -246,6 +248,8 @@ constexpr T numeric_max()
        return __FLT_MAX__;
    else if constexpr(is_same<T, migraphx::half>{})
        return __FLT16_MAX__;
+    else if constexpr(is_same<T, migraphx::fp8::fp8e4m3fnuz>{})
+        return migraphx::fp8::F8_Max<T>();
    else
        return 0;
 }
@@ -260,6 +264,8 @@ constexpr T numeric_lowest()
        else
            return -numeric_max<T>() - 1;
    }
+    else if constexpr(is_same<T, migraphx::fp8::fp8e4m3fnuz>{})
+        return migraphx::fp8::F8_Lowest<T>();
    else
    {
        return -numeric_max<T>();

--- a/src/targets/gpu/kernels/include/migraphx/kernels/types.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/types.hpp
@@ -23,12 +23,11 @@
 */
 #ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPES_HPP
 #define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPES_HPP
 #include <migraphx/kernels/hip.hpp>
 namespace migraphx {
-#if defined(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS) and defined(MIGRAPHX_USE_HIPRTC)
+#if defined(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS) and defined(MIGRAPHX_JIT_USE_HIPRTC)
 using int8_t   = signed char;
 using uint8_t  = unsigned char;
 using int16_t  = signed short;
@@ -37,7 +36,7 @@ using int32_t  = signed int;
 using uint32_t = unsigned int;
 using int64_t  = signed long long;
 using uint64_t = unsigned long long;
-#elif defined(MIGRAPHX_USE_HIPRTC)
+#elif defined(MIGRAPHX_JIT_USE_HIPRTC)
 using int8_t   = __hip_int8_t;
 using uint8_t  = __hip_uint8_t;
 using int16_t  = __hip_int16_t;
@@ -55,7 +54,7 @@ using int32_t  = std::int32_t;
 using uint32_t = std::uint32_t;
 using int64_t  = std::int64_t;
 using uint64_t = std::uint64_t;
-#endif // MIGRAPHX_USE_HIPRTC
+#endif // MIGRAPHX_JIT_USE_HIPRTC
 using index_int = uint32_t;
 using diff_int  = int32_t;

--- a/src/targets/gpu/kernels/include/migraphx/kernels/vectorize.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/vectorize.hpp
@@ -24,6 +24,7 @@
 #ifndef MIGRAPHX_GUARD_KERNELS_VECTORIZE_HPP
 #define MIGRAPHX_GUARD_KERNELS_VECTORIZE_HPP
+#include <migraphx/kernels/type_traits.hpp>
 #include <migraphx/kernels/tensor_view.hpp>
 #include <migraphx/kernels/vec.hpp>

--- a/src/targets/gpu/target.cpp
+++ b/src/targets/gpu/target.cpp
@@ -98,6 +98,7 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
    ctx.set_exhaustive_tune_flag(options.exhaustive_tune);
    std::set<shape::type_t> unsupported_types(shape::types().begin(), shape::types().end());
    unsupported_types.erase(shape::type_t::float_type);
+    unsupported_types.erase(shape::type_t::fp8e4m3fnuz_type);
    unsupported_types.erase(shape::type_t::half_type);
    unsupported_types.erase(shape::type_t::bool_type);
    unsupported_types.erase(shape::type_t::int8_type);

--- a/test/gpu/jit.cpp
+++ b/test/gpu/jit.cpp
@@ -144,7 +144,7 @@ extern "C" {
 __global__ void kernel(${type}* p) 
 {
    auto x = *p;
-    *p = migraphx::implicit_conversion(migraphx::${invoke});
+    *p = implicit_conversion(migraphx::${invoke});
 }
 }
@@ -348,18 +348,18 @@ TEST_CASE(compile_math)
    auto vec_sizes = {2, 4, 6};
    for(auto&& t : migraphx::shape::types())
    {
-        if(contains({migraphx::shape::bool_type,
+        if(contains({migraphx::shape::bool_type, migraphx::shape::tuple_type}, t))
-                     migraphx::shape::fp8e4m3fnuz_type,
-                     migraphx::shape::tuple_type},
-                    t))
            continue;
        auto name = migraphx::shape::cpp_type(t);
        if(t == migraphx::shape::half_type)
            name.insert(0, "migraphx::");
        data_types.push_back(name);
-        migraphx::transform(vec_sizes, std::back_inserter(data_types), [&](auto i) {
+        if(t != migraphx::shape::fp8e4m3fnuz_type)
-            return "migraphx::vec<" + name + ", " + std::to_string(i) + ">";
+        {
-        });
+            migraphx::transform(vec_sizes, std::back_inserter(data_types), [&](auto i) {
+                return "migraphx::vec<" + name + ", " + std::to_string(i) + ">";
+            });
+        }
    }
    migraphx::shape input{migraphx::shape::float_type, {5, 2}};
    migraphx::gpu::hip_compile_options options;
@@ -399,10 +399,7 @@ TEST_CASE(assert_type_min_max)
    migraphx::gpu::hip_compile_options options;
    for(auto&& t : migraphx::shape::types())
    {
-        if(contains({migraphx::shape::bool_type,
+        if(contains({migraphx::shape::bool_type, migraphx::shape::tuple_type}, t))
-                     migraphx::shape::fp8e4m3fnuz_type,
-                     migraphx::shape::tuple_type},
-                    t))
            continue;
        auto name = migraphx::shape::cpp_type(t);
        if(t == migraphx::shape::half_type)
@@ -429,7 +426,6 @@ TEST_CASE(assert_type_min_max)
                min = std::to_string(as.min());
                max = std::to_string(as.max());
            }
            auto src = migraphx::interpolate_string(assert_template,
                                                    {{"type", name}, {"max", max}, {"min", min}});
            migraphx::shape input{migraphx::shape::float_type, {5, 2}};

--- a/test/verify/test_abs.cpp
+++ b/test/verify/test_abs.cpp
@@ -27,14 +27,19 @@
 #include <migraphx/generate.hpp>
 #include <migraphx/make_op.hpp>
-struct test_abs : verify_program<test_abs>
+template <migraphx::shape::type_t DType> // element type of the input parameter "x"
+struct test_abs : verify_program<test_abs<DType>>
 {
    migraphx::program create_program() const
    {
        migraphx::program p;
        auto* mm = p.get_main_module();
-        auto x = mm->add_parameter("x", migraphx::shape{migraphx::shape::float_type, {4, 3, 3, 3}});
+        auto x   = mm->add_parameter("x", migraphx::shape{DType, {4, 3, 3, 3}});
        mm->add_instruction(migraphx::make_op("abs"), x);
        return p;
    }
 };
+template struct test_abs<migraphx::shape::fp8e4m3fnuz_type>; // explicit instantiation per element type (presumably what registers each variant with the verify harness - confirm)
+template struct test_abs<migraphx::shape::half_type>;
+template struct test_abs<migraphx::shape::float_type>;
--- a/test/verify/test_acos.cpp
+++ b/test/verify/test_acos.cpp
@@ -27,15 +27,20 @@
 #include <migraphx/generate.hpp>
 #include <migraphx/make_op.hpp>
-struct test_acos : verify_program<test_acos>
+template <migraphx::shape::type_t DType> // element type of the input parameter "x"
+struct test_acos : verify_program<test_acos<DType>>
 {
    migraphx::program create_program() const
    {
        migraphx::program p;
        auto* mm = p.get_main_module();
-        migraphx::shape s{migraphx::shape::float_type, {16}};
+        migraphx::shape s{DType, {16}};
        auto x = mm->add_parameter("x", s);
        mm->add_instruction(migraphx::make_op("acos"), x);
        return p;
    }
 };
+template struct test_acos<migraphx::shape::fp8e4m3fnuz_type>; // explicit instantiation per element type (presumably what registers each variant with the verify harness - confirm)
+template struct test_acos<migraphx::shape::half_type>;
+template struct test_acos<migraphx::shape::float_type>;
--- a/test/verify/test_add.cpp
+++ b/test/verify/test_add.cpp
@@ -27,16 +27,21 @@
 #include <migraphx/generate.hpp>
 #include <migraphx/make_op.hpp>
-struct test_add : verify_program<test_add>
+template <migraphx::shape::type_t DType> // element type shared by both inputs "x" and "y"
+struct test_add : verify_program<test_add<DType>>
 {
    migraphx::program create_program() const
    {
        migraphx::program p;
        auto* mm = p.get_main_module();
-        migraphx::shape s{migraphx::shape::float_type, {3}};
+        migraphx::shape s{DType, {8}}; // NOTE(review): length changed from 3 to 8 in this commit; the reason is not visible in this diff
        auto x = mm->add_parameter("x", s);
        auto y = mm->add_parameter("y", s);
        mm->add_instruction(migraphx::make_op("add"), x, y);
        return p;
    }
 };
+template struct test_add<migraphx::shape::fp8e4m3fnuz_type>; // explicit instantiation per element type (presumably what registers each variant with the verify harness - confirm)
+template struct test_add<migraphx::shape::half_type>;
+template struct test_add<migraphx::shape::float_type>;
--- a/test/verify/test_literal_limits.cpp
+++ b/test/verify/test_literal_limits.cpp
@@ -35,7 +35,11 @@ struct test_literal_limits : verify_program<test_literal_limits<Q, T>>
        migraphx::program p;
        auto* mm          = p.get_main_module();
        auto input_s      = migraphx::shape(Q, {3, 1});
-        auto infinity_val = std::numeric_limits<T>::infinity();
+        auto infinity_val = std::numeric_limits<T>::max();
+        if constexpr(std::numeric_limits<T>::has_infinity)
+        {
+            infinity_val = std::numeric_limits<T>::infinity();
+        }
        std::vector<T> s_data{
            infinity_val, static_cast<T>(-infinity_val), std::numeric_limits<T>::quiet_NaN()};
@@ -52,3 +56,4 @@ template struct test_literal_limits<migraphx::shape::double_type, double>;
 template struct test_literal_limits<migraphx::shape::half_type, migraphx::half>;
 template struct test_literal_limits<migraphx::shape::int32_type, int32_t>;
 template struct test_literal_limits<migraphx::shape::int8_type, int8_t>;
+template struct test_literal_limits<migraphx::shape::fp8e4m3fnuz_type, migraphx::fp8::fp8e4m3fnuz>;