gaoqiong / MIGraphX / Commits / a24ed87e

Unverified commit a24ed87e, authored Dec 05, 2023 by Chris Austen, committed by GitHub on Dec 05, 2023.

Merge branch 'develop' into optimize_jenkinsfile

Parents: 6481cd69, a09dc502
Changes: 391 — showing 20 changed files with 1284 additions and 260 deletions (+1284 −260).
- src/targets/gpu/include/migraphx/gpu/gemm_impl.hpp (+51 −15)
- src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp (+4 −0)
- src/targets/gpu/include/migraphx/gpu/miopen.hpp (+8 −16)
- src/targets/gpu/include/migraphx/gpu/rocblas.hpp (+2 −2)
- src/targets/gpu/int8_gemm_pack.cpp (+0 −60)
- src/targets/gpu/jit/scatter.hpp (+78 −0)
- src/targets/gpu/jit/scatternd.cpp (+8 −37)
- src/targets/gpu/kernels/include/migraphx/kernels/bit_cast.hpp (+37 −0)
- src/targets/gpu/kernels/include/migraphx/kernels/dpp.hpp (+21 −7)
- src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp (+564 −0)
- src/targets/gpu/kernels/include/migraphx/kernels/float8_impl.hpp (+331 −0)
- src/targets/gpu/kernels/include/migraphx/kernels/gathernd.hpp (+13 −13)
- src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp (+12 −9)
- src/targets/gpu/kernels/include/migraphx/kernels/math.hpp (+46 −3)
- src/targets/gpu/kernels/include/migraphx/kernels/pad.hpp (+3 −2)
- src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp (+42 −29)
- src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp (+18 −15)
- src/targets/gpu/kernels/include/migraphx/kernels/scatter_reduction_modes.hpp (+44 −24)
- src/targets/gpu/kernels/include/migraphx/kernels/scatternd.hpp (+1 −27)
- src/targets/gpu/kernels/include/migraphx/kernels/softmax.hpp (+1 −1)
src/targets/gpu/include/migraphx/gpu/gemm_impl.hpp (+51 −15)

The copyright year is bumped from 2015-2022 to 2015-2023. The two `gemm()` overloads, which took a `bool int8_x4_format` flag, are removed and replaced by `gemm_compute()`/`gemm_finalize()` overloads that take an `int32_t solution_idx`. The header also gains `<iterator>`, a tuning environment variable, and double-based duration aliases. The new version of the hunk:

```
@@ -24,28 +24,64 @@
#ifndef MIGRAPHX_GUARD_RTGLIB_GEMM_IMPL_HPP
#define MIGRAPHX_GUARD_RTGLIB_GEMM_IMPL_HPP

#include <iterator>
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/context.hpp>

// Set this environment variable to "true" to perform GEMM tuning even when the
// --exhaustive-tune option isn't set. Can be used to skip slow convolution tuning.
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_GEMM_TUNING);

using milliseconds = std::chrono::duration<double, std::milli>;
using microseconds = std::chrono::duration<double, std::micro>;

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

/**
 * @brief Templated implementations of the compute() and finalize() methods of the Gemm operator.
 * For each function there are overloads using either float or int32_t for the arguments
 * alpha and beta.
 *
 * @param ctx .
 * @param output_shape .
 * @param args .
 * @param alpha .
 * @param beta .
 * @param compute_fp32 .
 */
void gemm_compute(context& ctx,
                  const shape& output_shape,
                  const std::vector<argument>& args,
                  float alpha,
                  float beta,
                  bool compute_fp32,
                  int32_t solution_idx);
void gemm_compute(context& ctx,
                  const shape& output_shape,
                  const std::vector<argument>& args,
                  int32_t alpha,
                  int32_t beta,
                  bool compute_fp32,
                  int32_t solution_idx);

int32_t gemm_finalize(context& ctx,
                      const shape& output_shape,
                      const std::vector<shape>& input_shapes,
                      float alpha,
                      float beta,
                      bool compute_fp32);
int32_t gemm_finalize(context& ctx,
                      const shape& output_shape,
                      const std::vector<shape>& input_shapes,
                      int32_t alpha,
                      int32_t beta,
                      bool compute_fp32,
                      int32_t solution_idx);

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
...
```
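The `milliseconds`/`microseconds` aliases use a `double` representation, so durations convert to them implicitly and can report fractional timings from tuning runs. A minimal standalone sketch of the same pattern (plain C++, not MIGraphX code):

```cpp
#include <chrono>
#include <cstdio>

using milliseconds = std::chrono::duration<double, std::milli>;

int main()
{
    auto start = std::chrono::steady_clock::now();
    volatile double acc = 0;
    for(int i = 0; i < 1000000; i++)
        acc += i * 0.5; // stand-in for the work being timed
    auto stop = std::chrono::steady_clock::now();
    // No duration_cast needed: conversions to floating-point reps are implicit.
    milliseconds elapsed = stop - start;
    std::printf("elapsed: %.3f ms\n", elapsed.count());
    return 0;
}
```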
src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp (+4 −0)

A new `is_mlir_supported_type()` predicate is added next to the existing `is_ck_supported_type()`:

```
@@ -66,6 +66,10 @@ struct gemm_softmax_gemm
    static bool is_ck_supported_type(shape::type_t t) { return contains({shape::half_type}, t); }

    static bool is_mlir_supported_type(shape::type_t t)
    {
        return contains({shape::type_t::float_type, shape::half_type}, t);
    }
};
} // namespace gpu
...
```
src/targets/gpu/include/migraphx/gpu/miopen.hpp (+8 −16)

`make_tensor()` loses its `bool pack = false` parameter together with the `miopenInt8x4` packing branch, which used to round `lens[1]` up to a multiple of 4 via `lens[1] = ((lens[1] + 3) / 4) * 4` and recompute `strides[0] = strides[1] * lens[1]`. `make_pooling()` gains a guard that rejects non-unit dilations. The new versions of the hunks:

```
@@ -127,7 +127,7 @@ inline void set_tensor_descriptor(miopenTensorArgumentId_t name,
}
#endif

inline tensor_descriptor make_tensor(const migraphx::shape& os)
{
    auto s = os.normalize_standard();
    auto t = make_obj<tensor_descriptor>(&miopenCreateTensorDescriptor);
...
@@ -142,23 +142,9 @@
    else if(s.type() == shape::int32_type)
        d = miopenInt32;
    else if(s.type() == shape::int8_type)
        d = miopenInt8;
    else
    {
        MIGRAPHX_THROW("MAKE_TENSOR: unsupported type");
    }
    miopenSetTensorDescriptor(t.get(), d, s.lens().size(), lens.data(), strides.data());
    return t;
...
@@ -225,6 +211,12 @@ inline pooling_descriptor make_pooling(const migraphx::op::pooling& op)
        ss << op.mode;
        MIGRAPHX_THROW(ss.str());
    }
    if(not std::all_of(op.dilations.cbegin(), op.dilations.cend(), [](std::size_t d) {
           return d == 1;
       }))
    {
        MIGRAPHX_THROW("Unsupported dilations for pooling: [" + to_string_range(op.dilations) + "]");
    }
    auto p    = make_obj<pooling_descriptor>(&miopenCreatePoolingDescriptor);
    int kdims = op.kdims();
...
```
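The new guard is a plain `std::all_of` scan over the dilations. A standalone sketch of the same check, assuming only standard-library facilities:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<std::size_t> dilations = {1, 2};
    // Mirror the make_pooling guard: only all-ones dilations are supported.
    bool all_unit = std::all_of(dilations.cbegin(), dilations.cend(),
                                [](std::size_t d) { return d == 1; });
    std::printf(all_unit ? "supported\n" : "unsupported dilations\n");
    return 0;
}
```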
src/targets/gpu/include/migraphx/gpu/rocblas.hpp (+2 −2)

The copyright year is bumped from 2015-2022 to 2015-2023, and `get_int8_x4_format(context& ctx)` is replaced by `rocblas_fp8_available()`:

```
@@ -40,7 +40,7 @@ struct context;
MIGRAPHX_GPU_EXPORT bool get_compute_fp32_flag();

MIGRAPHX_GPU_EXPORT bool rocblas_fp8_available();

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
...
```
src/targets/gpu/int8_gemm_pack.cpp (+0 −60, deleted, 100644 → 0)

The whole file is removed along with the int8x4 format. The deleted content:

```cpp
/* MIT License header (Copyright (c) 2015-2022 Advanced Micro Devices, Inc.) */

#include <migraphx/gpu/int8_gemm_pack.hpp>
#include <migraphx/gpu/device/int8_gemm_pack.hpp>
#include <migraphx/gpu/context.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

shape hip_int8_gemm_pack_a::compute_shape(const std::vector<shape>& inputs) const
{
    check_shapes{{inputs.at(0)}, *this}.has(1).not_broadcasted().packed();
    return inputs.at(0);
}

argument hip_int8_gemm_pack_a::compute(context& ctx,
                                       const shape&,
                                       const std::vector<argument>& args) const
{
    device::int8_gemm_pack_a(ctx.get_stream().get(), args[1], args[0]);
    return args[1];
}

shape hip_int8_gemm_pack_b::compute_shape(const std::vector<shape>& inputs) const
{
    check_shapes{{inputs.at(0)}, *this}.has(1).not_broadcasted().packed();
    return inputs.at(0);
}

argument hip_int8_gemm_pack_b::compute(context& ctx,
                                       const shape&,
                                       const std::vector<argument>& args) const
{
    device::int8_gemm_pack_b(ctx.get_stream().get(), args[1], args[0]);
    return args[1];
}

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
```
src/targets/gpu/include/migraphx/gpu/gather.hpp → src/targets/gpu/jit/scatter.hpp (renamed, +78 −0)

The copyright year is bumped to 2015-2023 and the include guard changes from `MIGRAPHX_GUARD_RTGLIB_GATHER_HPP` to `MIGRAPHX_GUARD_JIT_SCATTER_HPP`. The old `hip_gather` operation struct (which wrapped `op::gather` with `reflect`, `name()` returning "gpu::gather", `compute_shape`, `compute`, and an `output_alias` returning `shapes.size() - 1`) is removed entirely. In its place the file defines a CRTP base for the JIT scatter compilers:

```cpp
#ifndef MIGRAPHX_GUARD_JIT_SCATTER_HPP
#define MIGRAPHX_GUARD_JIT_SCATTER_HPP

#include <migraphx/gpu/compiler.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/compile_hip.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

template <typename Derived>
struct scatter_compiler : compiler<Derived>
{
    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
    {
        const auto inputs = to_shapes(
            std::vector<instruction_ref>{ins->inputs().begin() + 1, ins->inputs().end()});
        hip_compile_options options;
        options.set_launch_params(op.to_value(),
                                  compute_global_for(ctx, inputs.at(1).elements()));
        options.inputs         = inputs;
        options.output         = inputs.back();
        options.kernel_name    = derived().get_kernel_name(op);
        options.virtual_inputs = inputs;
        // The compiler protests the inequality comparison in assign_mul when pertaining to
        // floating point, despite it making sense in the context. Thus the warning removal.
        options.params += "-Wno-float-equal";
        const auto src = derived().make_interpolated_string(op);
        return prepend_copy_data_to_output(compile_hip_code_object(src, options));
    }

    compiler_replace prepend_copy_data_to_output(const operation& co) const
    {
        return {co, [](module& m, instruction_ref ins, const operation& op) {
                    auto args   = ins->inputs();
                    args.back() = m.insert_instruction(
                        ins, make_op("hip::copy"), args.front(), args.back());
                    args.erase(args.begin());
                    return m.replace_instruction(ins, op, args);
                }};
    }

    std::string get_kernel_name(const operation& op) const { return op.name() + "_kernel"; }

    const Derived& derived() const { return static_cast<const Derived&>(*this); }
};

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
```
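`scatter_compiler` uses the CRTP: `derived()` downcasts to the concrete compiler, so `get_kernel_name()` and `make_interpolated_string()` are resolved statically and can be shadowed per derived type without virtual dispatch. A minimal standalone illustration of the pattern (names here are illustrative, not MIGraphX API):

```cpp
#include <cstdio>
#include <string>

// The base resolves customization points through the derived type at compile time.
template <typename Derived>
struct compiler_base
{
    std::string build() const { return "kernel: " + derived().kernel_name(); }
    // Default, shadowed by Derived when it provides its own.
    std::string kernel_name() const { return "generic_kernel"; }
    const Derived& derived() const { return static_cast<const Derived&>(*this); }
};

struct scatternd_like : compiler_base<scatternd_like>
{
    std::string kernel_name() const { return "scatternd_kernel"; }
};

int main()
{
    scatternd_like c;
    std::printf("%s\n", c.build().c_str()); // prints "kernel: scatternd_kernel"
    return 0;
}
```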
src/targets/gpu/jit/scatternd.cpp (+8 −37)

The five `<migraphx/gpu/...>` includes (compiler.hpp, make_op.hpp, context.hpp, compile_hip_code_object.hpp, compile_hip.hpp) are replaced by a single `#include "scatter.hpp"`. `scatternd_compiler` now derives from the new `scatter_compiler` CRTP base, which absorbs the old `compile_op()` (option setup and `compile_hip_code_object` call), `compile()` (which asserted `starts_with(op.name(), "scatternd_")` and took the reduction via `substr(10)`), and `insert()` (the `hip::copy` prepend lambda). The compiler also learns two new reduction kernels, `scatternd_min` and `scatternd_max`:

```
@@ -21,11 +21,7 @@
 * THE SOFTWARE.
 */
#include "scatter.hpp"

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
...
@@ -55,46 +51,21 @@ MIGRAPHX_GLOBAL void scatternd_kernel(void* in_indices, void* in_updates, void*
)__migraphx__";

struct scatternd_compiler : scatter_compiler<scatternd_compiler>
{
    std::vector<std::string> names() const
    {
        return {
            "scatternd_none", "scatternd_add", "scatternd_mul", "scatternd_min", "scatternd_max"};
    }

    std::string make_interpolated_string(const operation& op) const
    {
        const auto reduction = op.name().substr(std::char_traits<char>::length("scatternd_"));
        return interpolate_string(scatternd_kernel, {{"reduction", "assign_" + reduction}});
    }

    std::string get_kernel_name(const operation&) const { return "scatternd_kernel"; }
};

} // namespace gpu
...
```
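Deriving the reduction name with `std::char_traits<char>::length("scatternd_")` replaces the old magic `substr(10)` with a self-documenting constant. A standalone sketch of the same string manipulation:

```cpp
#include <cstdio>
#include <string>

int main()
{
    std::string name = "scatternd_min";
    // char_traits<char>::length is a constexpr-friendly strlen; no magic "10".
    auto reduction = name.substr(std::char_traits<char>::length("scatternd_"));
    std::printf("assign_%s\n", reduction.c_str()); // prints "assign_min"
    return 0;
}
```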
src/targets/gpu/kernels/include/migraphx/kernels/bit_cast.hpp (new file, +37 −0)

```cpp
/* MIT License header (Copyright (C) 2016-2023 Advanced Micro Devices, Inc.) */

#ifndef MIGRAPHX_GUARD_KERNELS_BITCAST_HPP
#define MIGRAPHX_GUARD_KERNELS_BITCAST_HPP

#include <migraphx/kernels/type_traits.hpp>

namespace migraphx {

template <typename To,
          typename From,
          MIGRAPHX_REQUIRES(is_trivially_copyable<To>{} and is_trivially_copyable<From>{})>
inline constexpr To bit_cast(From fr) noexcept
{
    static_assert(sizeof(To) == sizeof(From));
    return __builtin_bit_cast(To, fr);
}

} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_BITCAST_HPP
```
src/targets/gpu/kernels/include/migraphx/kernels/dpp.hpp (+21 −7)

The register-wise loop from `dpp_mov` is factored out into a generic `dpp_op` that applies an arbitrary callable to each 32-bit register; `dpp_mov` and a new `dpp_swizzle` are then defined in terms of it:

```
@@ -49,12 +49,8 @@ constexpr unsigned int dpp_row_bcast(unsigned int x)
    return y;
}

template <class T, class F>
__device__ T dpp_op(T& x, F f)
{
    static const index_int n = sizeof(T) < 4 ? 1 : sizeof(T) / 4;
    union type
...
@@ -68,10 +64,28 @@ __device__ T dpp_mov(T& x)
    input.data = x;
    for(index_int i = 0; i < n; i++)
    {
        output.reg[i] = f(input.reg[i]);
    }
    return output.data;
}

template <unsigned int DppCtrl,
          unsigned int RowMask  = 0xf,
          unsigned int BankMask = 0xf,
          bool BoundCtrl        = false,
          class T>
__device__ T dpp_mov(T& x)
{
    return dpp_op(
        x, [](auto i) { return __hip_move_dpp(i, DppCtrl, RowMask, BankMask, BoundCtrl); });
}

template <unsigned int Mask, class T>
__device__ T dpp_swizzle(T& x)
{
    return dpp_op(x, [](auto i) { return __hip_ds_swizzle(i, Mask); });
}

#endif // MIGRAPHX_HAS_DPP
} // namespace migraphx
...
```
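`dpp_op` applies `f` to each 32-bit register backing `T` through a union, so wider types such as `double` are moved register by register. A host-side sketch of the same split-and-map idea, using `memcpy` in place of the device union (illustrative only, not MIGraphX code):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Apply f to each 32-bit word backing x, as dpp_op does per register.
template <class T, class F>
T word_op(const T& x, F f)
{
    constexpr std::size_t n = sizeof(T) < 4 ? 1 : sizeof(T) / 4;
    std::uint32_t words[n];
    std::memcpy(words, &x, sizeof(T));
    for(std::size_t i = 0; i < n; i++)
        words[i] = f(words[i]);
    T out;
    std::memcpy(&out, words, sizeof(T));
    return out;
}

int main()
{
    double d = 1.0;
    // Identity per-word op: the value round-trips unchanged through both words.
    double r = word_op(d, [](std::uint32_t w) { return w; });
    std::printf("%f\n", r);
    return 0;
}
```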
src/targets/gpu/kernels/include/migraphx/kernels/float8.hpp (new file, +564 −0)

```cpp
/* MIT License header (Copyright (C) 2016-2023 Advanced Micro Devices, Inc.) */

#ifndef MIGRAPHX_GUARD_KERNELS_FLOAT8_HPP
#define MIGRAPHX_GUARD_KERNELS_FLOAT8_HPP

#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wfloat-equal"
#pragma clang diagnostic ignored "-Wc++20-extensions" // required for "asm" inside constexpr
#endif // __clang__

// We are clipping in down conversion by default
#define MIGRAPHX_F8_DOWNCAST_CLIPPING 1 // NOLINT

#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/type_traits.hpp>
#include <migraphx/kernels/float8_impl.hpp>

namespace migraphx {
namespace fp8 {

enum class rounding_mode
{
    standard, // standard rounding is doing RNE -- round to nearest even
    stochastic
};

enum class f8_type
{
    bf8 = 0, // s1e5m2
    fp8 = 1  // s1e4m3
};

template <typename T>
class numeric_limits;

template <migraphx::fp8::f8_type T = migraphx::fp8::f8_type::fp8, bool FNUZ = true>
struct float8
{
    uint8_t data;
    // default constructor
    __device__ constexpr float8() = default;
    // default copy constructor
    __device__ constexpr float8(const float8& y) = default;

    struct from_bits_t
    {
    };
    static constexpr __device__ from_bits_t from_bits() { return from_bits_t(); }

    __device__ explicit constexpr float8(uint8_t bits, from_bits_t) : data(bits) {}

#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
    // device specific optimized F8 down-conversion code
    template <bool stochastic_rounding = false>
    static __device__ uint8_t cast_to_f8_from_f32(float v, uint32_t rng = 0)
    {
        uint8_t i8data = 0x00;
        union
        {
            float fval;
            uint32_t i32val;
            uint8_t i8val[4]; // NOTE: not endian independent
        } val;

        uint32_t ival = 0;
        val.fval      = v;

#ifdef MIGRAPHX_F8_DOWNCAST_CLIPPING
        if constexpr(T == migraphx::fp8::f8_type::fp8)
        {
            if((val.i32val & 0x7F800000) != 0x7F800000) /// propagate NAN/INF, no clipping
                val.fval = __builtin_amdgcn_fmed3f(val.fval, 240.0, -240.0);
        }
        else
        {
            if((val.i32val & 0x7F800000) != 0x7F800000) // propagate NAN/INF, no clipping
                val.fval = __builtin_amdgcn_fmed3f(val.fval, 57344.0, -57344.0);
        }
#endif
        if(stochastic_rounding)
        {
            if constexpr(T == migraphx::fp8::f8_type::fp8)
            {
                ival = __builtin_amdgcn_cvt_sr_fp8_f32(val.fval, rng, ival, 0); // 0 pos
            }
            else
            {
                ival = __builtin_amdgcn_cvt_sr_bf8_f32(val.fval, rng, ival, 0); // 0 pos
            }
        }
        else // RNE CVT
        {
            if constexpr(T == migraphx::fp8::f8_type::fp8)
            {
                ival = __builtin_amdgcn_cvt_pk_fp8_f32(
                    val.fval, val.fval, ival, false); // false -> WORD0
            }
            else
            {
                ival = __builtin_amdgcn_cvt_pk_bf8_f32(
                    val.fval, val.fval, ival, false); // false -> WORD0
            }
        }
        val.i32val = ival;
        i8data     = val.i8val[0]; // little endian
        return i8data;
    }
#endif // __gfx940__

    // constructor from float
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
    // NOTE: ON-DEVICE... always optimal bias
    explicit constexpr __device__
    float8(const float v,
           migraphx::fp8::rounding_mode rm = migraphx::fp8::rounding_mode::standard,
           uint32_t rng                    = 0)
    {
        if(__builtin_is_constant_evaluated())
        {
            if constexpr(T == migraphx::fp8::f8_type::fp8)
            {
#ifdef MIGRAPHX_F8_DOWNCAST_CLIPPING
                data = migraphx::fp8::impl::
                    cast_to_f8<3, 4, float, FNUZ /*negative_zero_nan*/, true /*clip*/>(
                        v, (rm == migraphx::fp8::rounding_mode::stochastic), rng);
#else // MIGRAPHX_F8_DOWNCAST_CLIPPING
                data = migraphx::fp8::impl::
                    cast_to_f8<3, 4, float, FNUZ /*negative_zero_nan*/, false /*clip*/>(
                        v, (rm == migraphx::fp8::rounding_mode::stochastic), rng);
#endif // MIGRAPHX_F8_DOWNCAST_CLIPPING
            }
            else
            {
#ifdef MIGRAPHX_F8_DOWNCAST_CLIPPING
                data = migraphx::fp8::impl::
                    cast_to_f8<2, 5, float, FNUZ /*negative_zero_nan*/, true /*clip*/>(
                        v, (rm == migraphx::fp8::rounding_mode::stochastic), rng);
#else // MIGRAPHX_F8_DOWNCAST_CLIPPING
                data = migraphx::fp8::impl::
                    cast_to_f8<2, 5, float, FNUZ /*negative_zero_nan*/, false /*clip*/>(
                        v, (rm == migraphx::fp8::rounding_mode::stochastic), rng);
#endif // MIGRAPHX_FP8_DOWNCAST_CLIPPING
            }
        }
        else
        {
            // runtime branch, use cast_to_f8_from_f32 if want to avoid it
            if(rm == migraphx::fp8::rounding_mode::stochastic)
                data = cast_to_f8_from_f32<true>(v, rng);
            else
                data = cast_to_f8_from_f32<false>(v);
        }
    }
#else
    // DEVICE for non-gfx940 using s/w simulation
    explicit constexpr __device__
    float8(const float v,
           migraphx::fp8::rounding_mode rm = migraphx::fp8::rounding_mode::standard,
           uint32_t rng                    = 0)
    {
        if constexpr(T == migraphx::fp8::f8_type::fp8)
        {
#ifdef MIGRAPHX_F8_DOWNCAST_CLIPPING
            data = migraphx::fp8::impl::
                cast_to_f8<3, 4, float, FNUZ /*negative_zero_nan*/, true /*clip*/>(
                    v, (rm == migraphx::fp8::rounding_mode::stochastic), rng);
#else // MIGRAPHX_F8_DOWNCAST_CLIPPING
            data = migraphx::fp8::impl::
                cast_to_f8<3, 4, float, FNUZ /*negative_zero_nan*/, false /*clip*/>(
                    v, (rm == migraphx::fp8::rounding_mode::stochastic), rng);
#endif // MIGRAPHX_F8_DOWNCAST_CLIPPING
        }
        else
        {
#ifdef MIGRAPHX_F8_DOWNCAST_CLIPPING
            data = migraphx::fp8::impl::
                cast_to_f8<2, 5, float, FNUZ /*negative_zero_nan*/, true /*clip*/>(
                    v, (rm == migraphx::fp8::rounding_mode::stochastic), rng);
#else // MIGRAPHX_F8_DOWNCAST_CLIPPING
            data = migraphx::fp8::impl::
                cast_to_f8<2, 5, float, FNUZ /*negative_zero_nan*/, false /*clip*/>(
                    v, (rm == migraphx::fp8::rounding_mode::stochastic), rng);
#endif // MIGRAPHX_FP8_DOWNCAST_CLIPPING
        }
    }
#endif // __gfx940__

    // Constructor from half
    explicit constexpr __device__
    float8(const _Float16 v, rounding_mode rm = rounding_mode::standard, uint32_t rng = 0)
        : float8(static_cast<float>(v), rm, rng)
    {
    }
    // constructor from int
    explicit constexpr __device__
    float8(const int v, rounding_mode rm = rounding_mode::standard, uint32_t rng = 0)
        : float8(static_cast<float>(v), rm, rng)
    {
    }
    // constructor from uint
    explicit constexpr __device__
    float8(const uint32_t v, rounding_mode rm = rounding_mode::standard, uint32_t rng = 0)
        : float8(static_cast<float>(v), rm, rng)
    {
    }
    // constructor from double
    explicit constexpr __device__
    float8(const double v, rounding_mode rm = rounding_mode::standard, uint32_t rng = 0)
        : float8(static_cast<float>(v), rm, rng)
    {
    }
    // constructor from bool
    explicit constexpr __device__
    float8(const bool v, rounding_mode rm = rounding_mode::standard, uint32_t rng = 0)
        : float8(static_cast<float>(v), rm, rng)
    {
    }

    // convert to float
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) // NOLINT
    // upcast using device specific intrinsic
    inline constexpr __device__ operator float() const
    {
        if(__builtin_is_constant_evaluated())
        {
            if constexpr(T == migraphx::fp8::f8_type::fp8)
            {
                return migraphx::fp8::impl::
                    cast_from_f8<3, 4, float, FNUZ /*negative_zero_nan*/>(data);
            }
            // else
            return migraphx::fp8::impl::cast_from_f8<2, 5, float, FNUZ /*negative_zero_nan*/>(
                data);
        }
        else
        {
            float fval      = 0;
            uint32_t i32val = static_cast<uint32_t>(data);
            // upcast
            if constexpr(T == migraphx::fp8::f8_type::fp8)
            {
                __asm__ volatile("v_cvt_f32_fp8 %0, %1 src0_sel:BYTE_0"
                                 : "=v"(fval)
                                 : "v"(i32val));
            }
            else
            {
                __asm__ volatile("v_cvt_f32_bf8 %0, %1 src0_sel:BYTE_0"
                                 : "=v"(fval)
                                 : "v"(i32val));
            }
            return fval;
        }
    }
#else // non gfx940
    inline constexpr __device__ operator float() const
    {
        if constexpr(T == migraphx::fp8::f8_type::fp8)
        {
            return migraphx::fp8::impl::cast_from_f8<3, 4, float, FNUZ /*negative_zero_nan*/>(
                data);
        }
        // else
        return migraphx::fp8::impl::cast_from_f8<2, 5, float, FNUZ /*negative_zero_nan*/>(data);
    }
#endif

    inline constexpr explicit __device__ operator bool() const { return not is_zero(); }

    // check for zero
    inline __device__ constexpr bool is_zero() const
    {
        if constexpr(FNUZ)
        {
            return data == 0x00;
        }
        else
        {
            return (data == 0x00) || (data == 0x80);
        }
    }

    // check for nan
    inline __device__ constexpr bool is_nan() const
    {
        if constexpr(FNUZ)
        {
            return data == 0x80;
        }
        else
        {
            if(T == migraphx::fp8::f8_type::bf8)
            {
                return (data == 0x7D) or (data == 0x7E) or (data == 0x7F) or (data == 0xFD) or
                       (data == 0xFE) or (data == 0xFF);
            }
            else
            {
                return (data == 0x7F) or (data == 0xFF);
            }
        }
    }

    // check for inf
    inline __device__ constexpr bool is_inf() const
    {
        if constexpr(FNUZ)
        {
            return data == 0x80;
        }
        else
        {
            if(T == migraphx::fp8::f8_type::bf8)
            {
                return (data == 0x7C) or (data == 0xFC);
            }
            else
            {
                // no infinities in e4m3fn, represent them as NaNs
                return (data == 0x7F) or (data == 0xFF);
            }
        }
    }

// NOLINTNEXTLINE
#define MIGRAPHX_FP8_SHORT_UNARY_OP(unary_op, binary_op)                               \
    constexpr float8& __device__ operator unary_op(const float8& rhs)                 \
    {                                                                                  \
        const auto tmp = static_cast<float>(*this) binary_op static_cast<float>(rhs);  \
        *this          = static_cast<float8>(tmp);                                     \
        return *this;                                                                  \
    }                                                                                  \
    constexpr float8& __device__ operator unary_op(const float& rhs)                   \
    {                                                                                  \
        const auto tmp = static_cast<float>(*this) binary_op static_cast<float>(rhs);  \
        *this          = static_cast<float8>(tmp);                                     \
        return *this;                                                                  \
    }

    MIGRAPHX_FP8_SHORT_UNARY_OP(*=, *)
    MIGRAPHX_FP8_SHORT_UNARY_OP(-=, -)
    MIGRAPHX_FP8_SHORT_UNARY_OP(+=, +)
    MIGRAPHX_FP8_SHORT_UNARY_OP(/=, /)

    inline __device__ constexpr float8& operator=(const float8& rhs) = default;
    inline __device__ constexpr float8& operator=(float8&& rhs) noexcept = default;

    inline __device__ constexpr bool operator<(const float8& rhs) const
    {
        const auto we   = static_cast<float>(*this);
        const auto them = static_cast<float>(rhs);
        return we < them;
    }

    inline __device__ constexpr bool operator>(const float8& rhs) const
    {
        const auto we   = static_cast<float>(*this);
        const auto them = static_cast<float>(rhs);
        return we > them;
    }
};

// https://onnx.ai/onnx/technical/float8.html
using fp8e4m3fn   = float8<migraphx::fp8::f8_type::fp8, false>;
using fp8e5m2     = float8<migraphx::fp8::f8_type::bf8, false>;
using fp8e4m3fnuz = float8<migraphx::fp8::f8_type::fp8, true>;
using fp8e5m2fnuz = float8<migraphx::fp8::f8_type::bf8, true>;

// NOLINTNEXTLINE
#define MIGRAPHX_FP8_BINARY_OP(binary_op, T, U)                                  \
    inline constexpr U __device__ operator binary_op(const T& lhs, const T& rhs) \
    {                                                                            \
        return U(static_cast<float>(lhs) binary_op static_cast<float>(rhs));     \
    }

// NOLINTNEXTLINE
#define MIGRAPHX_FP8_OTHER_OPS(T)                                            \
    inline constexpr __device__ T fabs(T v)                                  \
    {                                                                        \
        /*NOLINTNEXTLINE*/                                                   \
        v.data = v.data & 0x7f;                                              \
        return v;                                                            \
    }                                                                        \
    inline __device__ constexpr bool operator==(const T& lhs, const T& rhs)  \
    {                                                                        \
        if(rhs.is_nan() or rhs.is_inf() or lhs.is_nan() or lhs.is_inf())     \
            return false;                                                    \
        else if((rhs.is_zero() and lhs.is_zero()) or (lhs.data == rhs.data)) \
            return true;                                                     \
        return false;                                                        \
    }

// NOLINTNEXTLINE
#define MIGRAPHX_FP8_GEN_OP_OVERLOADS(T) \
    MIGRAPHX_FP8_BINARY_OP(*, T, T)      \
    MIGRAPHX_FP8_BINARY_OP(-, T, T)      \
    MIGRAPHX_FP8_BINARY_OP(/, T, T)      \
    MIGRAPHX_FP8_BINARY_OP(+, T, T)      \
    MIGRAPHX_FP8_BINARY_OP(>=, T, bool)  \
    MIGRAPHX_FP8_BINARY_OP(<=, T, bool)  \
    MIGRAPHX_FP8_BINARY_OP(!=, T, bool)  \
    MIGRAPHX_FP8_OTHER_OPS(T)

MIGRAPHX_FP8_GEN_OP_OVERLOADS(fp8e5m2)
MIGRAPHX_FP8_GEN_OP_OVERLOADS(fp8e5m2fnuz)
MIGRAPHX_FP8_GEN_OP_OVERLOADS(fp8e4m3fn)
MIGRAPHX_FP8_GEN_OP_OVERLOADS(fp8e4m3fnuz)

template <>
class numeric_limits<fp8e4m3fnuz>
{
    public:
    static constexpr bool has_infinity = false;
    static constexpr __device__ fp8e4m3fnuz epsilon()
    {
        return fp8e4m3fnuz(0x28, fp8e4m3fnuz::from_bits());
    }
    // NOLINTNEXTLINE
    static constexpr __device__ fp8e4m3fnuz quiet_NaN()
    {
        return fp8e4m3fnuz(0x80, fp8e4m3fnuz::from_bits());
    }
    static constexpr __device__ fp8e4m3fnuz max()
    {
        return fp8e4m3fnuz(0x7F, fp8e4m3fnuz::from_bits());
    }
    // this is min value that is not DeNormalized(DeNorm). DeNorm min is 0x01
    static constexpr __device__ fp8e4m3fnuz min()
    {
        return fp8e4m3fnuz(0x08, fp8e4m3fnuz::from_bits());
    }
    static constexpr __device__ fp8e4m3fnuz lowest()
    {
        return fp8e4m3fnuz(0xFF, fp8e4m3fnuz::from_bits());
    }
};

template <>
class numeric_limits<fp8e4m3fn>
{
    public:
    static constexpr bool has_infinity = false;
    static constexpr __device__ fp8e4m3fn epsilon()
    {
        return fp8e4m3fn(0x20, fp8e4m3fn::from_bits());
    }
    // NOLINTNEXTLINE
    static constexpr __device__ fp8e4m3fn quiet_NaN()
    {
        return fp8e4m3fn(0x7F, fp8e4m3fn::from_bits());
    }
    static constexpr __device__ fp8e4m3fn max()
    {
        return fp8e4m3fn(0x7E, fp8e4m3fn::from_bits());
    }
    // this is min value that is not DeNormalized(DeNorm). DeNorm min is 0x01
    static constexpr __device__ fp8e4m3fn min()
    {
        return fp8e4m3fn(0x08, fp8e4m3fn::from_bits());
    }
    static constexpr __device__ fp8e4m3fn lowest()
    {
        return fp8e4m3fn(0xFE, fp8e4m3fn::from_bits());
    }
};

template <>
class numeric_limits<fp8e5m2fnuz>
{
    public:
    static constexpr bool has_infinity = false;
    static constexpr __device__ fp8e5m2fnuz epsilon()
    {
        return fp8e5m2fnuz(0x34, fp8e5m2fnuz::from_bits());
    }
    static constexpr __device__ fp8e5m2fnuz quiet_NaN() // NOLINT
    {
        return fp8e5m2fnuz(0x80, fp8e5m2fnuz::from_bits());
    }
    static constexpr __device__ fp8e5m2fnuz max()
    {
        return fp8e5m2fnuz(0x7F, fp8e5m2fnuz::from_bits());
    }
    // this is min value that is not DeNormalized(DeNorm). DeNorm min is 0x01.
    static constexpr __device__ fp8e5m2fnuz min()
    {
        return fp8e5m2fnuz(0x4, fp8e5m2fnuz::from_bits());
    }
    static constexpr __device__ fp8e5m2fnuz lowest()
    {
        return fp8e5m2fnuz(0xFF, fp8e5m2fnuz::from_bits());
    }
};

template <>
class numeric_limits<fp8e5m2>
{
    public:
    static constexpr bool has_infinity = true;
    static constexpr __device__ fp8e5m2 epsilon()
    {
        return fp8e5m2(0x34, fp8e5m2::from_bits());
    }
    // 7D, 7E, 7F are positive NaNs and FD, FE, FF are negative NaNs
    static constexpr __device__ fp8e5m2 quiet_NaN() // NOLINT
    {
        return fp8e5m2(0xFF, fp8e5m2::from_bits());
    }
    static constexpr __device__ fp8e5m2 max() { return fp8e5m2(0x7B, fp8e5m2::from_bits()); }
    // this is min value that is not DeNormalized(DeNorm). DeNorm min is 0x01.
    static constexpr __device__ fp8e5m2 min() { return fp8e5m2(0x4, fp8e5m2::from_bits()); }
    static constexpr __device__ fp8e5m2 lowest() { return fp8e5m2(0xFB, fp8e5m2::from_bits()); }
    // 7C and FC both are infinity
    static constexpr __device__ fp8e5m2 infinity() { return fp8e5m2(0x7C, fp8e5m2::from_bits()); }
};

} // namespace fp8

template <class T,
          MIGRAPHX_REQUIRES(is_same<T, fp8::fp8e4m3fnuz>{} or is_same<T, fp8::fp8e5m2fnuz>{} or
                            is_same<T, fp8::fp8e4m3fn>{} or is_same<T, fp8::fp8e5m2>{})>
constexpr T numeric_max(migraphx::fp8::f8_type unused = migraphx::fp8::f8_type::fp8)
{
    // unused parameter is added to make this numeric_max different overload definition
    // compared to numeric_max defined in type_traits.hpp
    (void)(unused);
    return fp8::numeric_limits<T>::max();
}

template <class T,
          MIGRAPHX_REQUIRES(is_same<T, fp8::fp8e4m3fnuz>{} or is_same<T, fp8::fp8e5m2fnuz>{} or
                            is_same<T, fp8::fp8e4m3fn>{} or is_same<T, fp8::fp8e5m2>{})>
constexpr T numeric_lowest(migraphx::fp8::f8_type unused = migraphx::fp8::f8_type::fp8)
{
    // unused parameter is added to make this numeric_lowest different overload definition
    // compared to numeric_lowest defined in type_traits.hpp
    (void)(unused);
    return fp8::numeric_limits<T>::lowest();
}

} // namespace migraphx
// =================================================================================================
#if defined(__clang__)
#pragma clang diagnostic pop
#endif // __clang__
#endif // MIGRAPHX_GUARD_KERNELS_FLOAT8_HPP
```
src/targets/gpu/kernels/include/migraphx/kernels/float8_impl.hpp (new file, +331 −0)

```cpp
/* MIT License header (Copyright (C) 2016-2023 Advanced Micro Devices, Inc.) */

#ifndef MIGRAPHX_GUARD_KERNELS_FP8_IMPL_HPP
#define MIGRAPHX_GUARD_KERNELS_FP8_IMPL_HPP

#include <migraphx/kernels/bit_cast.hpp>
#include <migraphx/kernels/type_traits.hpp>

#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreserved-identifier"
#endif

namespace migraphx {
namespace fp8 {
namespace impl {

// NOLINTBEGIN
template <int Wm, int We, typename T, bool NegativeZeroNan, bool Clip>
__device__ constexpr uint8_t cast_to_f8(T f_x, bool stoch = false, uint32_t rng = 0)
{
    constexpr bool is_float = true;
    // half is not supported for now
    constexpr bool is_half = false;
    static_assert(Wm + We == 7, "Wm+We==7");
    static_assert(is_float or is_half, "Only float can be cast to f8");

    const uint32_t mfmt = (sizeof(T) == 4) ? 23 : 10;
    typename migraphx::conditional_t<sizeof(T) == 2, uint16_t, uint32_t> x;
    if constexpr(sizeof(T) == 4)
        x = migraphx::bit_cast<uint32_t>(f_x);
    else
        x = migraphx::bit_cast<uint16_t>(f_x);

    uint32_t head     = 0;
    uint32_t mantissa = 0;
    int exponent      = 0;
    uint32_t bias     = 0;
    uint32_t sign     = 0;

    if constexpr(sizeof(T) == 4)
    {
        head     = x & 0xFF800000;
        mantissa = x & 0x7FFFFF;
        exponent = (head >> 23) & 0xFF;
        sign     = head >> 31;
        bias     = 127;
    }
    else
    {
        head     = x & 0xFC00;
        mantissa = x & 0x3FF;
        exponent = (head >> 10) & 0x1F;
        sign     = head >> 15;
        bias     = 15;
    }

    uint32_t signed_inf      = (sign << 7) + (((1 << We) - 1) << Wm);
    uint32_t signed_all_ones = (sign << 7) + ((((1 << We) - 1) << Wm) + ((1 << Wm) - 1));

    // Calculate maximum signed value FLT_MAX, FLT_MIN
    uint32_t signed_max = signed_all_ones;
    if(not NegativeZeroNan)
        signed_max = (Wm == 2) ? (signed_max - 4) : (signed_max - 1);

    // Deal with inf and NaNs
    if(NegativeZeroNan) // For the FNUZ cases, it is simple just return NaNs
    {
        if((sizeof(T) == 4 and ((x & 0x7F800000) == 0x7F800000)) or
           (sizeof(T) == 2 and ((x & 0x7C00) == 0x7C00)))
            return 0x80;
    }
    else
    {
        // calculate most common NaN mantissa for FP8, which is all Ones in binary
        uint32_t nan_mantissa = 1;
        for(auto i = 1; i < Wm; ++i)
        {
            nan_mantissa |= (nan_mantissa << 1);
        }
        if((sizeof(T) == 4 and ((x & 0x7F800000) == 0x7F800000)) or
           (sizeof(T) == 2 and ((x & 0x7C00) == 0x7C00)))
        {
            // infinity
            if(mantissa == 0)
            {
                if(sign == 0)
                    return (Wm == 2) ? 0x7B : 0x7E;
                else
                    return (Wm == 2) ? 0xFB : 0xFE;
            }
            else // NaNs
                return signed_inf + nan_mantissa;
        }
    }
    // handle positive zero
    if(x == 0)
        return 0;
    // handle negative zero
    else if((sizeof(T) == 4 and x == 0x80000000) or (sizeof(T) == 2 and x == 0x8000))
    {
        return NegativeZeroNan ? 0 : 0x80; // For FNUZ types neg zero is just positive zero
    }

    /* First need to check if it is normal or denorm as there is a difference of implicit 1.
       Then need to adjust the exponent to align with the F8 exponent; in the meanwhile, shift
       the mantissa. Then for stochastic rounding, add rng to mantissa and truncate. And for
       RNE, no need to add rng. Then probably need to check whether there is carry and adjust
       exponent and mantissa again */

    // For IEEE bias mode, the bias is 2^(k-1) - 1 where k is the width of exponent bits
    const int f8_bias                  = (1 << (We - 1u)) - 1 + (NegativeZeroNan ? 1 : 0);
    const int f8_denormal_act_exponent = 1 - f8_bias; // actual exponent of f8 denormal

    /* act_exponent is the actual exponent of fp32/fp16 (after subtracting bias)
       f8_exponent is the converted f8 exponent with bias encoding
       exponent_diff is the diff between fp32/fp16 exponent and f8 exponent,
       the difference needs to be adjusted and mantissa shifted */
    int act_exponent  = 0;
    int f8_exponent   = 0;
    int exponent_diff = 0;

    if(exponent == 0 and mantissa != 0)
    {
        // fp32/fp16 is in denormal.
        /* fp32 denormal is below 2^-127 so it is usually not a concern here, we mostly concern
           fp16 here. In this case, f8 is usually in denormal. But there could be exceptions.
           fp16 denormal has exponent bias 15 while bf8 with FNUZ has exponent bias 16. It means
           that there are some numbers in fp16 denormal but they are bf8 (FNUZ) normals -
           smallest bf8 (FNUZ) normal is 2^-15. fp16 numbers where exponent==0 (actual exponent
           -14) and highest bit of mantissa is 1 are bf8 (FNUZ) normal. In this case, the fp16
           mantissa should be shift left by 1 */
        act_exponent  = 1 - bias;
        exponent_diff = f8_denormal_act_exponent -
                        act_exponent; // actual exponent is exponent-bias+1 as it is denormal
    }
    else
    {
        // fp32/fp16 is normal with implicit 1
        act_exponent = exponent - bias;
        if(act_exponent <= f8_denormal_act_exponent)
        {
            /* This is the case where fp32/fp16 is normal but it is in f8 denormal range.
               For example fp8 FNUZ mode, denormal exponent is -7, but if the fp32/fp16
               actual exponent is -7, it is actually larger due to the implicit 1,
               Therefore it needs to be adjust to -6 and mantissa shift right by 1.
               So for fp32/fp16, exponent -8 is the cut point to convert to fp8 FNUZ */
            exponent_diff = f8_denormal_act_exponent - act_exponent;
        }
        else
        {
            // both fp32/fp16 and f8 are in normal range
            exponent_diff = 0;
            // exponent_diff=0 does not mean there is no difference for this case,
            // act_exponent could be larger. Just that it does not need shift mantissa
        }
        mantissa += (1 << mfmt); // Add the implicit 1 into mantissa
    }

    // need to know whether the number is right in the middle of two adjacent fp8 numbers. use
    // max value of 31 to avoid undefined behaviour
    bool midpoint = (mantissa & ((1u << (mfmt - Wm + exponent_diff)) - 1)) ==
                    (1u << (mfmt - Wm + exponent_diff - 1));
    /* This part is a bit tricky. The judgment of whether it is a tie needs to be done before we
       shift right as shift right could rip off some residual part and make something not
       midpoint look like midpoint. For example, the fp16 number 0x1002 (0 00100 0000000010), it
       is larger than midpoint, but after shift right by 4 bits, it would look like midpoint. */
    if(exponent_diff > 0)
        mantissa >>= exponent_diff;
    else if(exponent_diff == -1)
        mantissa <<= -exponent_diff;
    bool implicit_one = mantissa & (1 << mfmt);
    // if there is no implicit 1, it means the f8 is denormal and need to adjust to denorm
    // exponent
    f8_exponent =
        (act_exponent + exponent_diff) /*actual f8 exponent*/ + f8_bias - (implicit_one ? 0 : 1);

    // Now we have the exponent and mantissa adjusted
    uint32_t drop_mask = (1 << (mfmt - Wm)) - 1;
    bool odd =
        mantissa & (1 << (mfmt - Wm)); // if the least significant bit that is not truncated is 1
    /* This part is doing rounding by adding mantissa part that is going to get dropped.
       e.g. if the dropped part for less than 0.5 than it would round down.
       if the dropped part is more than 0.5 then it would round up by rolling carry to LSB of
       retained mantissa.
       For the mid point when bit pattern is like this for Odd: `xy1:10000000` for Odd and
       `xy0:10000000` for the Even. where `:` is delimiter for dropped v/s retained part.
       For the odd case: this will add xy1:10000000 + 000:10000000 which would roll over carry
       to LSB of retained part making it RNE.
       For the even case: this will add xy0:10000000 + 000:01111111 which would round down and
       keep number Even */
    mantissa +=
        (stoch ? rng : (midpoint ? (odd ? mantissa : mantissa - 1) : mantissa)) & drop_mask;

    // Now we deal with overflow
    if(f8_exponent == 0 and ((1 << mfmt) & mantissa))
    {
        f8_exponent = 1; // denormal overflow to become normal, promote exponent
    }
    else if((1 << (mfmt + 1)) & mantissa)
    {
        mantissa >>= 1;
        f8_exponent++;
    }

    mantissa >>= (mfmt - Wm);

    // above range: quantize to maximum possible float of the same sign
    // for e5m2 case, max_exp is 14, since exp = 15 is reserved for Infs and Nans
    const int max_exp = (1 << We) - ((NegativeZeroNan or Wm == 3) ? 1 : 2);
    if(f8_exponent > max_exp)
    {
        if(Clip)
            return signed_max;
        else
        {
            // https://onnx.ai/onnx/technical/float8.html#cast
            if(NegativeZeroNan)
                return 0x80;
            else
                return (Wm == 2) ? signed_inf : signed_all_ones;
        }
    }

    if(f8_exponent == 0 and mantissa == 0)
        return NegativeZeroNan ? 0 : (sign << 7);
    mantissa &= (1 << Wm) - 1;
    return (sign << 7) | (f8_exponent << Wm) | mantissa;
}
// NOLINTEND

template <int Wm, int We, typename T, bool NegativeZeroNan>
__device__ constexpr T cast_from_f8(uint8_t x)
{
    // half is not supported for now
    constexpr bool is_half  = false;
    constexpr bool is_float = true;
    static_assert(is_float or is_half, "Only float are supported");

    constexpr int weo = is_half ? 5 : 8;
    constexpr int wmo = is_half ? 10 : (is_float ? 23 : 7);

    // NOLINTNEXTLINE
    T f_inf, f_neg_inf, f_nan, f_neg0;
    if constexpr(is_float)
    {
        const uint32_t if_inf     = 0x7F800000;
        const uint32_t if_neg_inf = 0xFF800000;
        const uint32_t if_nan     = 0x7F800001;
        const uint32_t if_neg0    = 0x80000000;
        f_inf                     = migraphx::bit_cast<float>(if_inf);
        f_neg_inf                 = migraphx::bit_cast<float>(if_neg_inf);
        f_nan                     = migraphx::bit_cast<float>(if_nan);
        f_neg0                    = migraphx::bit_cast<float>(if_neg0);
    }

    if(x == 0)
        return 0;

    uint32_t sign     = x >> 7;              // NOLINT
    uint32_t mantissa = x & ((1 << Wm) - 1); // NOLINT
    int exponent      = (x & 0x7F) >> Wm;    // NOLINT
    if(NegativeZeroNan)
    {
        if(x == 0x80)
            return f_nan;
    }
    else
    {
        if(x == 0x80)
            return f_neg0;
        if(exponent == ((1 << We) - 1) and Wm == 2) // NOLINT
            return (mantissa == 0) ? (sign ? f_neg_inf : f_inf) : f_nan;
        else if(Wm == 3 and (x == 0x7F or x == 0xFF))
            return f_nan;
    }

    typename migraphx::conditional_t<sizeof(T) == 2, uint16_t, uint32_t> retval;
    const int exp_low_cutoff =
        (1 << (weo - 1)) - (1 << (We - 1)) + 1 - (NegativeZeroNan ? 1 : 0); // NOLINT

    // subnormal input
    if(exponent == 0)
    {
        // guaranteed mantissa!=0 since cases 0x0 and 0x80 are handled above
        int sh = 1 + __builtin_clz(mantissa) - (32 - Wm);
        mantissa <<= sh;             // NOLINT
        exponent += 1 - sh;
        mantissa &= ((1 << Wm) - 1); // NOLINT
    }
    exponent += exp_low_cutoff - 1;
    mantissa <<= wmo - Wm; // NOLINT

    // subnormal output (occurs when T=half, We=5, negative_zero_nan=true)
    if(exponent <= 0)
    {
        mantissa |= 1 << wmo;      // NOLINT
        mantissa >>= 1 - exponent; // NOLINT
        exponent = 0;
    }

    if(sizeof(T) == 2)
        retval = (sign << 15) | (exponent << 10) | mantissa; // NOLINT
    else
        retval = (sign << 31) | (exponent << 23) | mantissa; // NOLINT
    return migraphx::bit_cast<T>(retval);
}

} // namespace impl
} // namespace fp8
} // namespace migraphx

#if defined(__clang__)
#pragma clang diagnostic pop
#endif
#endif // MIGRAPHX_GUARD_KERNELS_FP8_IMPL_HPP
```
src/targets/gpu/kernels/include/migraphx/kernels/gathernd.hpp (+13 −13)

The only change in this hunk is dropping the `std::` qualifier from `size_t` throughout (the kernels provide their own `size_t`). The new version:

```
@@ -53,35 +53,35 @@ __device__ void gathernd(const T& data_t, const U& indices_t, const V& output_t,
    auto indices_shape_lens = indices_shape.lens;
    auto data_shape_lens    = data_shape.lens;
    auto num_slice_dims     = indices_shape_lens.back();
    size_t num_slices =
        accumulate(indices_shape_lens.begin(), indices_shape_lens.end() - 1, 1, op::product{});
    size_t slice_size = accumulate(data_shape_lens.begin() + num_slice_dims + batch_dims,
                                   data_shape_lens.end(),
                                   1,
                                   op::product{});
    const size_t num_batches =
        accumulate(data_shape_lens.begin(), data_shape_lens.begin() + batch_dims, 1, op::product{});
    const size_t data_batch_stride =
        accumulate(data_shape_lens.begin() + batch_dims, data_shape_lens.end(), 1, op::product{});
    const auto num_slices_per_batch = num_slices / num_batches;

    ind.global_stride(output_shape.elements(), [&](auto i) {
        const auto* indices_ptr = indices_t.data();
        const size_t j          = i / slice_size;
        const size_t batch_idx  = j / num_slices_per_batch;

        auto* slice_indices          = indices_ptr + (j * num_slice_dims);
        size_t relative_slice_offset = 0;
        for(size_t idx = 0; idx < num_slice_dims; ++idx)
        {
            int64_t index              = slice_indices[idx];
            const size_t input_dim_idx = batch_dims + idx;
            const auto input_dim       = data_shape_lens[input_dim_idx];
            MIGRAPHX_ASSERT(index >= -static_cast<int64_t>(input_dim) and
                            index < static_cast<int64_t>(input_dim));
            if(index < 0)
                index += input_dim;

            size_t size_from_slice_dims =
                accumulate(data_shape_lens.begin() + batch_dims + idx + 1,
                           data_shape_lens.begin() + batch_dims + num_slice_dims,
                           slice_size,
...
```
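The slice bookkeeping is all products over dimension ranges: each index tuple addresses one slice, and `slice_size` is the product of the trailing data dims the tuple does not address. A host-side sketch with `std::accumulate`, for an assumed data shape [2,3,4,5] and indices shape [2,3,2] with `batch_dims = 1`:

```cpp
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

int main()
{
    const std::vector<std::size_t> data_lens    = {2, 3, 4, 5};
    const std::vector<std::size_t> indices_lens = {2, 3, 2};
    const std::size_t batch_dims = 1;

    const std::size_t num_slice_dims = indices_lens.back(); // 2
    const auto prod = std::multiplies<std::size_t>{};
    // one slice per index tuple: product of all indices dims but the last
    std::size_t num_slices = std::accumulate(
        indices_lens.begin(), indices_lens.end() - 1, std::size_t{1}, prod); // 6
    // elements copied per slice: trailing data dims not addressed by the tuple
    std::size_t slice_size = std::accumulate(
        data_lens.begin() + num_slice_dims + batch_dims, data_lens.end(), std::size_t{1}, prod); // 5
    std::size_t num_batches = std::accumulate(
        data_lens.begin(), data_lens.begin() + batch_dims, std::size_t{1}, prod); // 2
    std::printf("slices=%zu slice_size=%zu batches=%zu per_batch=%zu\n",
                num_slices, slice_size, num_batches, num_slices / num_batches);
    return 0;
}
```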
src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp
View file @ a24ed87e
...
@@ -52,22 +52,25 @@ __device__ void generic_binary_layernorm(
     block::template run<reduce_output>([&](auto, auto r) {
         auto input = r.inner([&](auto x1, auto x2) { return op(x1, x2); })(input1, input2);
         using value_type = typename Input1::type;
+        using vec_value_type = vec_type<value_type>;
         constexpr auto relements = r.template elements<Input1>();
-        constexpr auto relements_r = vec_type<value_type>{1.0 / relements};
+        constexpr auto relements_r = vec_value_type{1.0 / relements};
         auto relements_rsqrt = sqrt(relements_r);
-        auto means = r.reduce(op::sum{}, make_array<vec_type<value_type>>(0, 0), [&](auto x) {
-            auto x_out = x * relements_r;
-            // dividing x by sqrt(relements) before squaring allows computing higher values
-            // before overflow in low precision
-            auto x2_sqrt = x * relements_rsqrt;
-            return make_array(x_out, x2_sqrt * x2_sqrt);
-        })(input);
+        auto means = r.reduce(op::sum{},
+                              make_array<vec_value_type>(vec_value_type{0}, vec_value_type{0}),
+                              [&](auto x) {
+                                  auto x_out = x * relements_r;
+                                  // dividing x by sqrt(relements) before squaring allows computing
+                                  // higher values before overflow in low precision
+                                  auto x2_sqrt = x * relements_rsqrt;
+                                  return make_array(x_out, x2_sqrt * x2_sqrt);
+                              })(input);
         auto mean_x = means[0];
         auto mean_x2 = means[1];
         auto variance = mean_x2 - (mean_x * mean_x);
-        value_type eps_val = eps; // implicit conversion for eps
+        value_type eps_val = implicit_conversion(eps);
         r.inner([&](auto& y, auto x, auto... xs) {
             auto m = x - mean_x;
...
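The comment in the hunk above carries the key numerical idea: summing (x / sqrt(n))^2 gives the same mean of squares as (sum of x^2) / n, but every partial sum stays n times smaller, which keeps fp16 accumulations below the ~65504 overflow threshold. A minimal host-side demonstration of the identity (an illustration, not the kernel):

#include <cassert>
#include <cmath>

int main()
{
    const int n     = 4;
    float xs[n]     = {200.0f, -100.0f, 50.0f, 150.0f};
    float rsqrt_n   = 1.0f / std::sqrt(static_cast<float>(n));

    float mean_x2 = 0.0f;
    for(float x : xs)
    {
        float scaled = x * rsqrt_n; // divide before squaring
        mean_x2 += scaled * scaled; // each term is x^2 / n, so partials stay small
    }
    // same value as (sum of squares) / n; but the naive partial sums
    // (40000 + 10000 + ...) would already exceed the fp16 maximum of 65504
    float reference = (200.f * 200.f + 100.f * 100.f + 50.f * 50.f + 150.f * 150.f) / n;
    assert(std::fabs(mean_x2 - reference) < 1e-2f);
}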
src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
View file @ a24ed87e
...
@@ -29,11 +29,15 @@
 #include <migraphx/kernels/functional.hpp>
 #include <migraphx/kernels/type_traits.hpp>
 #include <migraphx/kernels/hip.hpp>
+#include <migraphx/kernels/float8.hpp>

 namespace migraphx {
 namespace math {
 constexpr float as_float(migraphx::half x) { return x; }
+constexpr float as_float(migraphx::fp8::fp8e4m3fnuz x) { return x; }
 template <class T>
 constexpr T as_float(T x)
 {
...
@@ -57,14 +61,14 @@ constexpr T as_float(T x)
 // NOLINTNEXTLINE
 #define MIGRAPHX_DEVICE_MATH_FOR(type, name, fname)                        \
     template <class... Ts, MIGRAPHX_REQUIRES(not is_any_vec<Ts...>())>     \
-    auto __device__ name(type x, Ts... xs)->type                           \
+    auto __device__ name(type x, Ts... xs) -> type                         \
     {                                                                      \
         return fname(x, xs...);                                            \
     }

 // NOLINTNEXTLINE
 #define MIGRAPHX_DEVICE_MATH_BINARY_FOR(type, name, fname) \
-    inline auto __device__ name(type x, type y)->type { return fname(x, y); }
+    inline auto __device__ name(type x, type y) -> type { return fname(x, y); }

 // NOLINTNEXTLINE
 #define MIGRAPHX_DEVICE_MATH_HALF(name, fname) \
...
@@ -72,6 +76,12 @@ constexpr T as_float(T x)
     auto __device__ name(migraphx::half x, Ts... xs) \
         MIGRAPHX_RETURNS(fname(math::as_float(x), math::as_float(xs)...))

+// NOLINTNEXTLINE
+#define MIGRAPHX_DEVICE_MATH_FP8(name, fname)                                      \
+    template <class... Ts, MIGRAPHX_REQUIRES(not is_any_vec<Ts...>())>             \
+    auto __device__ name(migraphx::fp8::fp8e4m3fnuz x, Ts... xs) MIGRAPHX_RETURNS( \
+        migraphx::fp8::fp8e4m3fnuz(fname(math::as_float(x), math::as_float(xs)...)))
+
 // Template with two overloads for math functions, one for half2 type and one for more generic
 // <half, N> vectorization where N is 4 or another even number.
...
@@ -101,7 +111,9 @@ MIGRAPHX_DEVICE_MATH(erf, ::erf)
 MIGRAPHX_DEVICE_MATH(exp, ::exp)
 MIGRAPHX_DEVICE_MATH(floor, ::floor)
 MIGRAPHX_DEVICE_MATH(isnan, ::isnan)
+MIGRAPHX_DEVICE_MATH(isinf, ::isinf)
 MIGRAPHX_DEVICE_MATH(log, ::log)
+MIGRAPHX_DEVICE_MATH(nearbyint, ::nearbyint)
 MIGRAPHX_DEVICE_MATH(pow, ::pow)
 MIGRAPHX_DEVICE_MATH(remainder, ::remainder)
 MIGRAPHX_DEVICE_MATH(round, ::round)
...
@@ -135,6 +147,7 @@ MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, ceil, ::hceil)
 MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, cos, ::hcos)
 MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, exp, ::hexp)
 MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, floor, ::hfloor)
+MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, isinf, ::__hisinf)
 MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, isnan, ::__hisnan)
 MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, log, ::hlog)
 MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, rsqrt, ::hrsqrt)
...
@@ -150,6 +163,7 @@ MIGRAPHX_DEVICE_MATH_HALF(atan, ::atan)
 MIGRAPHX_DEVICE_MATH_HALF(atanh, ::atanh)
 MIGRAPHX_DEVICE_MATH_HALF(cosh, ::cosh)
 MIGRAPHX_DEVICE_MATH_HALF(erf, ::erf)
+MIGRAPHX_DEVICE_MATH_HALF(nearbyint, ::nearbyint)
 MIGRAPHX_DEVICE_MATH_HALF(pow, ::pow)
 MIGRAPHX_DEVICE_MATH_HALF(remainder, ::remainder)
 MIGRAPHX_DEVICE_MATH_HALF(round, ::round)
...
@@ -158,6 +172,33 @@ MIGRAPHX_DEVICE_MATH_HALF(tan, ::tan)
 MIGRAPHX_DEVICE_MATH_HALF(tanh, ::tanh)
 MIGRAPHX_DEVICE_MATH_HALF(fmod, ::fmod)

+// use float to compute fp8 overload
+MIGRAPHX_DEVICE_MATH_FP8(abs, ::abs)
+MIGRAPHX_DEVICE_MATH_FP8(acos, ::acos)
+MIGRAPHX_DEVICE_MATH_FP8(acosh, ::acosh)
+MIGRAPHX_DEVICE_MATH_FP8(asin, ::asin)
+MIGRAPHX_DEVICE_MATH_FP8(asinh, ::asinh)
+MIGRAPHX_DEVICE_MATH_FP8(atan, ::atan)
+MIGRAPHX_DEVICE_MATH_FP8(atanh, ::atanh)
+MIGRAPHX_DEVICE_MATH_FP8(ceil, ::ceil)
+MIGRAPHX_DEVICE_MATH_FP8(cos, ::cos)
+MIGRAPHX_DEVICE_MATH_FP8(cosh, ::cosh)
+MIGRAPHX_DEVICE_MATH_FP8(erf, ::erf)
+MIGRAPHX_DEVICE_MATH_FP8(exp, ::exp)
+MIGRAPHX_DEVICE_MATH_FP8(floor, ::floor)
+MIGRAPHX_DEVICE_MATH_FP8(isnan, ::isnan)
+MIGRAPHX_DEVICE_MATH_FP8(log, ::log)
+MIGRAPHX_DEVICE_MATH_FP8(pow, ::pow)
+MIGRAPHX_DEVICE_MATH_FP8(remainder, ::remainder)
+MIGRAPHX_DEVICE_MATH_FP8(round, ::round)
+MIGRAPHX_DEVICE_MATH_FP8(rsqrt, ::rsqrt)
+MIGRAPHX_DEVICE_MATH_FP8(sin, ::sin)
+MIGRAPHX_DEVICE_MATH_FP8(sinh, ::sinh)
+MIGRAPHX_DEVICE_MATH_FP8(sqrt, ::sqrt)
+MIGRAPHX_DEVICE_MATH_FP8(tan, ::tan)
+MIGRAPHX_DEVICE_MATH_FP8(tanh, ::tanh)
+MIGRAPHX_DEVICE_MATH_FP8(fmod, ::fmod)
+
 // Map math functions to hip half2 functions
 // The half2 type is defined in include/hip/amd_detail/hip_fp16_gcc.h and is 2 16-bit floats
 // packed into a 32-bit number. See include/hip/amd_detail/hip_fp16_math_fwd.h for the HIP names
...
@@ -229,10 +270,12 @@ MIGRAPHX_DEVICE_MATH_VEC(erf)
 MIGRAPHX_DEVICE_MATH_VEC(exp)
 MIGRAPHX_DEVICE_MATH_VEC(floor)
 MIGRAPHX_DEVICE_MATH_VEC(fmod)
+MIGRAPHX_DEVICE_MATH_VEC(isinf)
 MIGRAPHX_DEVICE_MATH_VEC(isnan)
 MIGRAPHX_DEVICE_MATH_VEC(log)
 MIGRAPHX_DEVICE_MATH_VEC(max)
 MIGRAPHX_DEVICE_MATH_VEC(min)
+MIGRAPHX_DEVICE_MATH_VEC(nearbyint)
 MIGRAPHX_DEVICE_MATH_VEC(pow)
 MIGRAPHX_DEVICE_MATH_VEC(remainder)
 MIGRAPHX_DEVICE_MATH_VEC(round)
...
@@ -247,7 +290,7 @@ MIGRAPHX_DEVICE_MATH_VEC(where)
 template <class T, class U>
 constexpr auto convert(U v)
 {
-    return vec_transform(v)([](auto x) -> T { return x; });
+    return vec_transform(v)([](auto x) -> T { return static_cast<T>(x); });
 }

 } // namespace migraphx
...
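The MIGRAPHX_DEVICE_MATH_FP8 macro added above follows the same promote-compute-demote pattern as the half overloads: widen the fp8 argument to float with as_float, call the float routine, then narrow the result back. A self-contained sketch of that pattern, where Fp8 is a stand-in type and not the real migraphx::fp8::fp8e4m3fnuz:

#include <cassert>
#include <cmath>

// Stand-in type: a real fp8 would store 8 bits; float keeps the sketch simple.
struct Fp8
{
    float bits;
    constexpr operator float() const { return bits; }
};

template <class F>
Fp8 fp8_math(F f, Fp8 x)
{
    float promoted = x;           // as_float: widen before computing
    float result   = f(promoted); // do the math at float precision
    return Fp8{result};           // narrow back to the storage type
}

int main()
{
    Fp8 x{4.0f};
    Fp8 y = fp8_math([](float v) { return std::sqrt(v); }, x);
    assert(static_cast<float>(y) == 2.0f);
}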
src/targets/gpu/kernels/include/migraphx/kernels/pad.hpp
View file @ a24ed87e
...
@@ -28,6 +28,7 @@
 #include <migraphx/kernels/index.hpp>
 #include <migraphx/kernels/algorithm.hpp>
 #include <migraphx/kernels/ranges.hpp>
+#include <migraphx/kernels/vec.hpp>

 namespace migraphx {
...
@@ -53,9 +54,9 @@ __device__ void pad(const index& idx,
         if(any_of(range_multi.begin(), range_multi.end(), [&](auto j) {
                return multi[j] < offsets[j] or input_idx[j] >= input_bounds[j];
            }))
-            output[multi] = pad_val;
+            output[multi] = implicit_conversion(pad_val);
         else
-            output[multi] = input[input_idx];
+            output[multi] = implicit_conversion(input[input_idx]);
     });
 }
...
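The predicate in that hunk decides, per output element, whether the position falls in the padded border (before the offset or past the shifted input bounds) or maps back into the input. A one-dimensional host sketch of the same decision, for illustration only:

#include <cassert>

int main()
{
    const int input[3] = {7, 8, 9};
    const int offset   = 2; // two pad elements in front
    const int in_len   = 3;
    const int out_len  = 7; // two pad elements behind as well
    const int pad_val  = 0;
    int out[7];
    for(int i = 0; i < out_len; ++i)
    {
        int input_idx = i - offset;
        // same test as the kernel: before the offset, or past the input bounds
        bool is_pad = i < offset or input_idx >= in_len;
        out[i]      = is_pad ? pad_val : input[input_idx];
    }
    assert(out[0] == 0 and out[2] == 7 and out[4] == 9 and out[6] == 0);
}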
src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
View file @ a24ed87e
...
@@ -45,7 +45,10 @@ __device__ void dpp_reduce(T& in, Op op)
     in  = op(in, out);
     out = dpp_mov<dpp_row_shr(8), 0xf, 0xc>(in);
     in  = op(in, out);
-#if __AMDGCN_WAVEFRONT_SIZE == 64
+#if __AMDGCN_WAVEFRONT_SIZE == 32
+    out = dpp_swizzle<0x1e0>(in);
+    in  = op(in, out);
+#else
     out = dpp_mov<dpp_row_bcast(15), 0xa>(in);
     in  = op(in, out);
     out = dpp_mov<dpp_row_bcast(31), 0xc>(in);
...
@@ -54,9 +57,11 @@ __device__ void dpp_reduce(T& in, Op op)
 }
 #if defined(MIGRAPHX_USE_CLANG_TIDY) || defined(CPPCHECK)
 // NOLINTNEXTLINE
-#define MIGRAPHX_DPP_REDUCE_ASM(x, ins) x = 1
+#define MIGRAPHX_DPP_REDUCE_ASM(x, ins, f) \
+    (void)f;                               \
+    x = 1
 #elif __AMDGCN_WAVEFRONT_SIZE == 64
-#define MIGRAPHX_DPP_REDUCE_ASM(x, ins)                                     \
+#define MIGRAPHX_DPP_REDUCE_ASM(x, ins, f)                                  \
     __asm__ volatile("s_nop 4\n" #ins " %0 %0 %0 row_shr:1\n"               \
                      "s_nop 1\n" #ins " %0 %0 %0 row_shr:2\n"               \
                      "s_nop 1\n" #ins " %0 %0 %0 row_shr:4 bank_mask:0xe\n" \
...
@@ -65,29 +70,42 @@ __device__ void dpp_reduce(T& in, Op op)
                      "s_nop 1\n" #ins " %0 %0 %0 row_bcast:31 row_mask:0xc\n" \
                      "s_nop 1\n"                                              \
                      : "=v"(x)                                                \
-                     : "0"(x))
+                     : "0"(x));                                               \
+    (void)f
 #else
-#define MIGRAPHX_DPP_REDUCE_ASM(x, ins)                                     \
+#define MIGRAPHX_DPP_REDUCE_ASM(x, ins, f)                                  \
     __asm__ volatile("s_nop 4\n" #ins " %0 %0 %0 row_shr:1\n"               \
                      "s_nop 1\n" #ins " %0 %0 %0 row_shr:2\n"               \
                      "s_nop 1\n" #ins " %0 %0 %0 row_shr:4 bank_mask:0xe\n" \
                      "s_nop 1\n" #ins " %0 %0 %0 row_shr:8 bank_mask:0xc\n" \
                      "s_nop 1\n"                                            \
                      : "=v"(x)                                              \
-                     : "0"(x))
+                     : "0"(x));                                             \
+    auto y = dpp_swizzle<0x1e0>(x);                                         \
+    x      = f(x, y)
 #endif

 // NOLINTNEXTLINE
-#define MIGRAPHX_DPP_REDUCE(op, prefix, sign)                                                       \
-    __device__ inline void dpp_reduce(double& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f64); }  \
-    __device__ inline void dpp_reduce(float& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f32); }   \
-    __device__ inline void dpp_reduce(half& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f16); }    \
-    __device__ inline void dpp_reduce(int32_t& x, op)                                               \
-    {                                                                                               \
-        MIGRAPHX_DPP_REDUCE_ASM(x, prefix##sign##32);                                               \
-    }                                                                                               \
-    __device__ inline void dpp_reduce(uint32_t& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_u32); }
+#define MIGRAPHX_DPP_REDUCE(op, prefix, sign)            \
+    __device__ inline void dpp_reduce(double& x, op f)   \
+    {                                                    \
+        MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f64, f);     \
+    }                                                    \
+    __device__ inline void dpp_reduce(float& x, op f)    \
+    {                                                    \
+        MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f32, f);     \
+    }                                                    \
+    __device__ inline void dpp_reduce(half& x, op f)     \
+    {                                                    \
+        MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f16, f);     \
+    }                                                    \
+    __device__ inline void dpp_reduce(int32_t& x, op f)  \
+    {                                                    \
+        MIGRAPHX_DPP_REDUCE_ASM(x, prefix##sign##32, f); \
+    }                                                    \
+    __device__ inline void dpp_reduce(uint32_t& x, op f) \
+    {                                                    \
+        MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_u32, f);     \
+    }

 // Note: when max and min are in int32_t, signed version of instruction needs to be used.
 MIGRAPHX_DPP_REDUCE(op::sum, v_add, _u)
...
@@ -99,14 +117,10 @@ template <class Op, class T, class Index, class F>
 __device__ auto block_reduce(index idx, Op op, T init, Index n, F f)
 {
     MIGRAPHX_ASSERT(idx.max_nlocal() == idx.nlocal());
-#if __AMDGCN_WAVEFRONT_SIZE == 32
-    constexpr index_int lanes_per_thread = 16;
-#else
-    constexpr index_int lanes_per_thread = 64;
-#endif
+    constexpr index_int lanes_per_thread = __AMDGCN_WAVEFRONT_SIZE;
     using type = decltype(index::invoke_loop(f, 0, _c<0>));
     __shared__ type buffer[idx.max_nlocal() / lanes_per_thread];
-    type x = init;
+    type x = type(init);
     idx.local_stride(n, [&](auto i, auto d) { x = op(x, index::invoke_loop(f, i, d)); });
     dpp_reduce(x, op);
...
@@ -117,7 +131,7 @@ __device__ auto block_reduce(index idx, Op op, T init, Index n, F f)
     }
     __syncthreads();
-    type y = init;
+    type y = type(init);
     for(index_int i = 0; i < idx.nlocal() / lanes_per_thread; i++)
     {
         y = op(y, buffer[i]);
...
@@ -244,9 +258,8 @@ struct reducer_base
     {
         auto&& derived = static_cast<const Derived&>(*this);
         auto t = derived.slice(x);
-        return make_storage_access<typename decltype(t)::type>([=](auto i, auto...) -> auto& {
-            return t[i];
-        });
+        return make_storage_access<typename decltype(t)::type>(
+            [=](auto i, auto...) -> auto& { return t[i]; });
     }
 }
...
@@ -393,7 +406,7 @@ struct block
     {
         using max_iterations = decltype(idx.max_local_stride_iterations(n));
         inner_storage<R, max_iterations{}, N> storage;
-        idx.local_stride(n, [&](auto j, auto d) { storage(j, d) = f(xs(j, d)...); });
+        idx.local_stride(n, [&](auto j, auto d) { storage(j, d) = R{f(xs(j, d)...)}; });
         return storage;
     }
 };
...
@@ -482,7 +495,7 @@ struct lane
     __device__ auto reduce_impl(Op op, T init, Read read, N n, U&& x, Us&&... xs) const
     {
         using type = remove_reference_t<decltype(x(0, _c<0>))>;
-        type r = init;
+        type r = type(init);
         for(index_int j = 0; j < n; j++)
         {
             r = op(r, read(x(j, _c<0>), xs(j, _c<0>)...));
...
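These hunks extend the DPP reduction to wavefront-size-32 targets: after the row-level shifts reduce each 16-lane row, a ds_swizzle exchange combines the two row partials, and the reduction functor f is now threaded through the macros so the final combine uses the right operation. For readers less familiar with DPP, a portable sketch of the same end result using lane shuffles; this is an illustration, not the MIGraphX implementation, which uses DPP row operations for better scheduling:

#include <hip/hip_runtime.h>

// Butterfly wave reduction: each step exchanges values with the lane
// 'offset' away and combines them, halving the distance until every
// lane holds the full reduction. Works for shuffle-supported types.
template <class T, class Op>
__device__ T wave_reduce(T x, Op op)
{
    for(unsigned offset = warpSize / 2; offset > 0; offset /= 2)
        x = op(x, __shfl_xor(x, offset)); // exchange with the partner lane
    return x;
}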
src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
View file @ a24ed87e
...
@@ -62,7 +62,7 @@ struct avg_pool
     template <class T>
     MIGRAPHX_DEVICE_CONSTEXPR T final(T x, index_int y)
     {
-        return (y == 0) ? 0.0 : (x / y);
+        return (y == 0) ? T{0.0} : T{x / y};
     }
 };
...
@@ -76,7 +76,7 @@ MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
     {
         if(xy[ii] < -1.0f or xy[ii] > dims[ii])
         {
-            return 0;
+            return implicit_conversion(0);
         }
         xy[ii] = migraphx::max(xy[ii], 0.0f);
...
@@ -92,15 +92,16 @@ MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
                             high[0] * dims[1] + low[1],
                             high[0] * dims[1] + high[1]};
     float ly = xy[0] - low[0];
     float lx = xy[1] - low[1];
     float hy = 1.0f - ly;
     float hx = 1.0f - lx;
-    array<typename Iterator::value_type, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx};
+    // do calculations in floating point and convert final result to required type
+    array<float, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx};
     auto v01 = pooling(data[locs[0]] * ws[0], data[locs[1]] * ws[1]);
     auto v23 = pooling(data[locs[2]] * ws[2], data[locs[3]] * ws[3]);
-    return pooling(v01, v23);
+    return implicit_conversion(pooling(v01, v23));
 }

 template <class Iterator, class Op>
...
@@ -113,8 +114,9 @@ MIGRAPHX_DEVICE_CONSTEXPR auto calc_pooling(const Iterator& data,
                                             float roi_offset,
                                             Op op)
 {
-    typename Iterator::value_type output_val = op.init();
+    using in_dtype = typename Iterator::value_type;
+    in_dtype output_val = in_dtype{op.init()};
     const int64_t count = bin_grid_size[0] * bin_grid_size[1];
     dfor(bin_grid_size[0], bin_grid_size[1])([&](auto iy, auto ix) {
         array<index_int, 2> id = {iy, ix};
         array<float, 2> locs =
...
@@ -148,7 +150,6 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
     const auto x    = x_t.begin();
     const auto rois = rois_t.begin();
     const auto ind  = ind_t.begin();
     // input shape
     auto x_lens      = x_t.get_shape().lens;
     auto channel_num = x_lens[1];
...
@@ -176,10 +177,12 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
         const auto offset_rois = rois + (n * roi_column_num);
         const int batch_ind    = ind[n];
-        array<float, 2> roi_starts = {offset_rois[1] * s.spatial_scale,
-                                      offset_rois[0] * s.spatial_scale};
-        array<float, 2> roi_ends   = {offset_rois[3] * s.spatial_scale,
-                                      offset_rois[2] * s.spatial_scale};
+        array<float, 2> roi_starts = {
+            static_cast<float>(offset_rois[1]) * static_cast<float>(s.spatial_scale),
+            static_cast<float>(offset_rois[0]) * static_cast<float>(s.spatial_scale)};
+        array<float, 2> roi_ends = {
+            static_cast<float>(offset_rois[3]) * static_cast<float>(s.spatial_scale),
+            static_cast<float>(offset_rois[2]) * static_cast<float>(s.spatial_scale)};

         array<float, 2> roi_size{};
         array<float, 2> bin_size{};
...
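The bilinear_interpolate change above keeps the four corner weights {hy*hx, hy*lx, ly*hx, ly*lx} in float regardless of the input type. Those weights are the standard bilinear coefficients and always sum to 1. A short worked example of the weight computation, for illustration:

#include <cassert>
#include <cmath>

int main()
{
    // sample point at (y, x) = (2.25, 3.5)
    float y  = 2.25f, x = 3.5f;
    float ly = y - std::floor(y); // 0.25: fractional distance from the low row
    float lx = x - std::floor(x); // 0.5:  fractional distance from the low column
    float hy = 1.0f - ly;         // 0.75
    float hx = 1.0f - lx;         // 0.5

    // corner weights, in the same order as the kernel: {0.375, 0.375, 0.125, 0.125}
    float ws[4] = {hy * hx, hy * lx, ly * hx, ly * lx};
    assert(std::fabs(ws[0] + ws[1] + ws[2] + ws[3] - 1.0f) < 1e-6f);
}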
src/targets/gpu/include/migraphx/gpu/int8_gemm_pack.hpp → src/targets/gpu/kernels/include/migraphx/kernels/scatter_reduction_modes.hpp
View file @ a24ed87e
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
...
@@ -21,43 +21,63 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-#ifndef MIGRAPHX_GUARD_RTGLIB_INT8_GEMM_PACK_HPP
-#define MIGRAPHX_GUARD_RTGLIB_INT8_GEMM_PACK_HPP
+#ifndef MIGRAPHX_GUARD_KERNELS_SCATTER_REDUCTION_MODES_HPP
+#define MIGRAPHX_GUARD_KERNELS_SCATTER_REDUCTION_MODES_HPP

-#include <migraphx/argument.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/kernels/types.hpp>
+#include <utility>

 namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-struct context;
-
-struct hip_int8_gemm_pack_a
-{
-    std::string name() const { return "gpu::int8_gemm_pack_a"; }
-    shape compute_shape(const std::vector<shape>& inputs) const;
-    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const;
-    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
-    {
-        return shapes.size() - 1;
-    }
-};
-
-struct hip_int8_gemm_pack_b
-{
-    std::string name() const { return "gpu::int8_gemm_pack_b"; }
-    shape compute_shape(const std::vector<shape>& inputs) const;
-    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const;
-    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
-    {
-        return shapes.size() - 1;
-    }
-};
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
+
+struct assign_none
+{
+    template <class T, class U>
+    MIGRAPHX_DEVICE_CONSTEXPR void operator()(T& x, U y) const
+    {
+        x = y;
+    }
+};
+
+struct assign_add
+{
+    template <class T, class U>
+    MIGRAPHX_DEVICE_CONSTEXPR void operator()(T& x, U y) const
+    {
+        atomicAdd(&x, y);
+    }
+};
+
+struct assign_mul
+{
+    template <class T, class U>
+    MIGRAPHX_DEVICE_CONSTEXPR void operator()(T& x, U y) const
+    {
+        T old = x;
+        T assumed;
+        do
+        {
+            assumed = old;
+            old     = atomicCAS(&x, assumed, assumed * y);
+        } while(assumed != old);
+    }
+};
+
+struct assign_max
+{
+    template <typename T, typename U>
+    MIGRAPHX_DEVICE_CONSTEXPR void operator()(T& x, U y) const
+    {
+        atomicMax(&x, y);
+    }
+};
+
+struct assign_min
+{
+    template <typename T, typename U>
+    MIGRAPHX_DEVICE_CONSTEXPR void operator()(T& x, U y) const
+    {
+        atomicMin(&x, y);
+    }
+};
+
 } // namespace migraphx

 #endif
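The assign_mul functor above uses a compare-and-swap retry loop because GPUs expose atomicAdd/atomicMax/atomicMin but no atomic multiply: the thread re-reads the value and retries until no other thread changed it between the read and the write. The same pattern, written against std::atomic so it runs on the host, as a minimal illustration:

#include <atomic>
#include <cassert>

// CAS retry loop: keep trying to replace 'old' with 'old * y' until the
// exchange succeeds; compare_exchange_weak refreshes 'old' on failure.
void atomic_mul(std::atomic<int>& x, int y)
{
    int old = x.load();
    while(not x.compare_exchange_weak(old, old * y))
    {
        // another thread intervened; retry with the refreshed value
    }
}

int main()
{
    std::atomic<int> v{3};
    atomic_mul(v, 7);
    assert(v.load() == 21);
}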
src/targets/gpu/kernels/include/migraphx/kernels/scatternd.hpp
View file @ a24ed87e
...
@@ -26,36 +26,10 @@
 #include <migraphx/kernels/index.hpp>
 #include <migraphx/kernels/algorithm.hpp>
+#include <migraphx/kernels/scatter_reduction_modes.hpp>

 namespace migraphx {

-struct assign_none
-{
-    template <class T, class U>
-    MIGRAPHX_DEVICE_CONSTEXPR void operator()(T& x, U y) const
-    {
-        x = y;
-    }
-};
-
-struct assign_add
-{
-    template <class T, class U>
-    MIGRAPHX_DEVICE_CONSTEXPR void operator()(T& x, U y) const
-    {
-        x += y;
-    }
-};
-
-struct assign_mul
-{
-    template <class T, class U>
-    MIGRAPHX_DEVICE_CONSTEXPR void operator()(T& x, U y) const
-    {
-        x *= y;
-    }
-};
-
 template <class T, class U, class V, class F>
 __device__ void scatternd(const T& indices_t, const U& updates_t, const V& output_t, F f)
 {
...
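With the assignment functors moved into the shared header, scatternd just applies the functor f at each computed output offset, and the functor decides whether the update overwrites, adds, multiplies, or takes a min/max. A simplified one-dimensional host sketch of that shape; the names here are illustrative, not the kernel's:

#include <cassert>
#include <cstddef>
#include <vector>

// Mimics assign_add from the header above, without atomics.
struct assign_add_like
{
    void operator()(int& x, int y) const { x += y; }
};

template <class F>
void scatter_1d(std::vector<int>& output,
                const std::vector<int>& indices,
                const std::vector<int>& updates,
                F f)
{
    for(std::size_t i = 0; i < indices.size(); ++i)
        f(output[indices[i]], updates[i]); // the functor decides how to combine
}

int main()
{
    std::vector<int> out = {0, 0, 0, 0};
    scatter_1d(out, {1, 3, 1}, {5, 7, 2}, assign_add_like{});
    assert(out[1] == 7 and out[3] == 7); // index 1 received both 5 and 2
}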
src/targets/gpu/kernels/include/migraphx/kernels/softmax.hpp
View file @ a24ed87e
...
@@ -43,7 +43,7 @@ __device__ void softmax(Input input1, Output output)
         auto exp_in = r.inner([&](auto x) { return migraphx::exp(x - c); })(input);
         auto batch_sum =
             r.reduce(op::sum{}, 0, [](auto x) { return migraphx::convert<float>(x); })(exp_in);
-        r.inner([&](auto& y, auto x) { y = x / batch_sum; })(output, exp_in);
+        r.inner([&](auto& y, auto x) { y = implicit_conversion(x / batch_sum); })(output, exp_in);
     });
 }
...
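The kernel above is the standard max-subtracted softmax: it exponentiates x - c (where c is the row maximum), sums in float, and now routes the final division through implicit_conversion so low-precision outputs convert cleanly. A minimal host-side sketch of the numerics, for illustration:

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Max-subtracted softmax: shifting by the row max keeps every exponent
// <= 0, so exp() never overflows even for large inputs.
std::vector<float> softmax(const std::vector<float>& in)
{
    float c = in[0];
    for(float x : in)
        c = std::max(c, x); // row maximum
    float sum = 0.0f;
    std::vector<float> out(in.size());
    for(std::size_t i = 0; i < in.size(); ++i)
    {
        out[i] = std::exp(in[i] - c); // every exponent is <= 0
        sum += out[i];                // accumulate in float, as the kernel does
    }
    for(float& y : out)
        y /= sum;
    return out;
}

int main()
{
    auto p = softmax({1000.0f, 1000.0f}); // naive exp(1000) would overflow
    assert(std::fabs(p[0] - 0.5f) < 1e-6f);
}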