Merge

9bff4331 · Paul · 214b313f · 94a7f6ee · 9bff4331 · 9bff4331
Commit 9bff4331 authored Mar 21, 2023 by Paul
20 changed files
--- a/src/targets/gpu/include/migraphx/gpu/hip.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/hip.hpp
@@ -98,6 +98,13 @@ struct hip_sync_stream
            return {};
        return args.front();
    }
+
+    std::ptrdiff_t output_alias(const std::vector<shape>& args) const
+    {
+        if(args.empty())
+            return -1;
+        return 0;
+    }
 };

 struct hip_copy_to_gpu

--- a/src/targets/gpu/include/migraphx/gpu/lowering.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/lowering.hpp
@@ -33,6 +33,13 @@ inline namespace MIGRAPHX_INLINE_NS {
 struct module;

 namespace gpu {
+
+/**
+ * Compiler pass that makes GPU-specific instruction changes.
+ * * Copies to and from the device if `offload_copy` is true.
+ * * Maps instructions to their GPU-specific counterparts.
+ * * Inserts `allocate` instructions before GPU operators.
+ */
 struct lowering
 {
    context* ctx;

--- a/src/targets/gpu/include/migraphx/gpu/miopen.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/miopen.hpp
@@ -75,12 +75,19 @@ using miopen_find_options = MIGRAPHX_MANAGE_PTR(miopenFindOptions_t, miopenDestr
 using miopen_problem      = MIGRAPHX_MANAGE_PTR(miopenProblem_t, miopenDestroyProblem);
 using miopen_solution     = MIGRAPHX_MANAGE_PTR(miopenSolution_t, miopenDestroySolution);

-inline miopen_solution find_solution(miopenHandle_t handle, miopenProblem_t problem)
+inline miopen_solution
+find_solution(miopenHandle_t handle, miopenProblem_t problem, bool tune = false)
 {
    miopenSolution_t solution;
-    size_t found = 0;
-    auto status  = miopenFindSolutions(handle, problem, nullptr, &solution, &found, 1);
-    auto result  = miopen_solution{solution};
+    size_t found           = 0;
+    miopen_find_options fo = nullptr;
+    if(tune)
+    {
+        fo = make_obj<miopen_find_options>(&miopenCreateFindOptions);
+        miopenSetFindOptionTuning(fo.get(), 1);
+    }
+    auto status = miopenFindSolutions(handle, problem, fo.get(), &solution, &found, 1);
+    auto result = miopen_solution{solution};
    if(status != miopenStatusSuccess or found == 0)
        MIGRAPHX_THROW("MIOpen miopenFindSolutions failed");
    return result;

--- a/src/targets/gpu/include/migraphx/gpu/name.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/name.hpp
@@ -56,7 +56,6 @@ struct oper
                return name.substr(pos_ns + 2);
            }
        }
-
        return "unknown_operator_name";
    }
 };

--- a/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/prefuse_ops.hpp
@@ -30,14 +30,14 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

-struct module;
+struct module_pass_manager;

 namespace gpu {

 struct prefuse_ops
 {
    std::string name() const { return "gpu::prefuse_ops"; }
-    void apply(module& m) const;
+    void apply(module_pass_manager& mpm) const;
 };

 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/target.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/target.hpp
@@ -37,7 +37,6 @@ struct target
    std::string name() const;
    std::vector<pass> get_passes(migraphx::context& gctx, const compile_options& options) const;
    migraphx::context get_context() const;
-
    argument copy_to(const argument& arg) const;
    argument copy_from(const argument& arg) const;
    argument allocate(const shape& s) const;

--- a/src/targets/gpu/jit/gather.cpp
+++ b/src/targets/gpu/jit/gather.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/gpu/compiler.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/gpu/context.hpp>
+
+#include <migraphx/gpu/compile_hip_code_object.hpp>
+#include <migraphx/gpu/compile_hip.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+// NOLINTNEXTLINE
+static const char* const gather_kernel = R"__migraphx__(
+#include <migraphx/kernels/gather.hpp>
+#include <migraphx/kernels/ops.hpp>
+#include <migraphx/kernels/integral_constant.hpp>
+#include <migraphx/kernels/generic_constant.hpp>
+#include <args.hpp>
+
+namespace migraphx {
+
+extern "C" {
+
+__global__ void gather_kernel(void* in_data, void* in_indices, void* output) 
+{
+    make_tensors()(in_data, in_indices, output)([](auto&&... xs) { 
+        gather<${axis}>(xs...); 
+    });
+}
+
+}
+
+} // namespace migraphx
+
+)__migraphx__";
+
+struct gather_compiler : compiler<gather_compiler>
+{
+    std::vector<std::string> names() const { return {"gather"}; }
+
+    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
+    {
+        hip_compile_options options;
+        const auto& out_s = inputs.back();
+        options.set_launch_params(v, compute_global_for(ctx, out_s.elements()));
+        options.inputs         = inputs;
+        options.output         = out_s;
+        options.kernel_name    = "gather_kernel";
+        options.virtual_inputs = inputs;
+
+        auto axis = v.at("axis").to<std::string>();
+
+        auto src = interpolate_string(gather_kernel, {{"axis", axis}});
+
+        return compile_hip_code_object(src, options);
+    }
+
+    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
+    {
+        return replace(compile_op(ctx, to_shapes(ins->inputs()), op.to_value()));
+    }
+};
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/jit/mlir.cpp
+++ b/src/targets/gpu/jit/mlir.cpp
@@ -24,7 +24,6 @@
 #include <migraphx/gpu/compiler.hpp>
 #include <migraphx/make_op.hpp>
 #include <migraphx/gpu/context.hpp>
-
 #include <migraphx/gpu/mlir.hpp>

 namespace migraphx {

--- a/src/targets/gpu/jit/reduce.cpp
+++ b/src/targets/gpu/jit/reduce.cpp
@@ -118,17 +118,17 @@ struct reduce_compiler : compiler<reduce_compiler>
        options.virtual_inputs = reduce_dims(inputs);
        auto faxis             = find_fast_axis({options.virtual_inputs.front()});
        vectorize vec{};
-        // Vectorize if the axis is a reduction axis
-        if(options.virtual_inputs.back().lens()[faxis] == 1)
-        {
-            vec = vectorize::elements(ctx, faxis, options.virtual_inputs);
-        }
-        auto relements = get_reduce_elements(options.virtual_inputs) / vec.size;
        auto nelements = options.virtual_inputs.back().elements();
        auto algo      = v.get("algo", get_reduce_algo(options.virtual_inputs));
        if(algo == "block")
        {
+            // Vectorize if the axis is a reduction axis
+            if(options.virtual_inputs.back().lens()[faxis] == 1)
+                vec = vectorize::elements(ctx, faxis, options.virtual_inputs);
+            auto relements  = get_reduce_elements(options.virtual_inputs) / vec.size;
            auto block_size = compute_block_size(relements, 256);
+            if(relements >= block_size * 256)
+                algo = "block_large";
            options.set_launch_params(
                v, compute_global_for(ctx, nelements * block_size, 256), block_size);
        }
@@ -156,16 +156,25 @@ struct reduce_compiler : compiler<reduce_compiler>

    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
    {
-        value v              = value::object{};
-        auto reduce_elements = get_reduce_elements(ins->inputs());
+        value v = value::object{};
        if(op.name() == "reduce_sum")
        {
            v["reduction"] = "op::sum{}";
        }
        else if(op.name() == "reduce_mean")
        {
-            v["reduction"] = "op::sum{}";
-            v["write"]     = "op::mean{" + std::to_string(reduce_elements) + "}";
+            auto reduce_elements = get_reduce_elements(ins->inputs());
+            auto reduce_type     = ins->inputs().front()->get_shape().type();
+            v["reduction"]       = "op::sum{}";
+            std::string mean     = "op::mean<" + std::to_string(reduce_elements) + ">{}";
+            // Use float accumulator when reduction size is too large for half
+            if(reduce_type == shape::half_type and reduce_elements > 16384)
+                v["read"] = "compose(" + mean + ", op::convert_to<float>{})";
+            else if(contains({shape::float_type, shape::half_type, shape::double_type},
+                             reduce_type))
+                v["read"] = mean;
+            else
+                v["write"] = mean;
        }
        else if(op.name() == "reduce_max")
        {

--- a/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
@@ -105,7 +105,7 @@ constexpr auto array_for_each(T& x, Ts&... xs)
            }
            else
            {
-                using vec_type = std::remove_reference_t<decltype(array2vec(x))>;
+                using vec_type = remove_reference_t<decltype(array2vec(x))>;
                f(array2vec(x), __builtin_convertvector(array2vec(xs), vec_type)...);
            }
        }

--- a/src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/debug.hpp
@@ -178,5 +178,9 @@ MIGRAPHX_HIP_NORETURN inline __host__ __device__ void assert_fail(const source_l
 #define MIGRAPHX_WARN(...)
 #endif

+#define MIGRAPHX_STATIC_ASSERT_FOR(...) \
+    static_assert(__VA_ARGS__);         \
+    if constexpr(__VA_ARGS__)
+
 } // namespace migraphx
 #endif // MIGRAPHX_GUARD_KERNELS_DEBUG_HPP
--- a/src/targets/gpu/kernels/include/migraphx/kernels/dpp.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/dpp.hpp
@@ -72,7 +72,7 @@ __device__ T dpp_mov(T& x)
    }
    return output.data;
 }
-#endif
+#endif // MIGRAPHX_HAS_DPP

 } // namespace migraphx
 #endif // MIGRAPHX_GUARD_KERNELS_DPP_HPP
--- a/src/targets/gpu/kernels/include/migraphx/kernels/functional.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/functional.hpp
@@ -196,6 +196,14 @@ constexpr auto fold(F f)
    return [=](auto&&... xs) { return fold_impl(f, static_cast<decltype(xs)&&>(xs)...); };
 }

+template <class... Fs>
+constexpr auto compose(Fs... fs)
+{
+    return fold([](auto f, auto g) {
+        return [=](auto&&... xs) { return f(g(static_cast<decltype(xs)>(xs)...)); };
+    })(fs...);
+}
+
 template <class... Ts>
 constexpr auto pack(Ts... xs)
 {

--- a/src/targets/gpu/kernels/include/migraphx/kernels/gather.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/gather.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_KERNELS_GATHER_HPP
+#define MIGRAPHX_GUARD_KERNELS_GATHER_HPP
+
+#include <migraphx/kernels/index.hpp>
+#include <migraphx/kernels/shape.hpp>
+#include <migraphx/kernels/algorithm.hpp>
+#include <migraphx/kernels/tensor_view.hpp>
+
+namespace migraphx {
+
+template <int Axis, class Input, class Indices>
+constexpr auto gather_shape(Input input, Indices indices)
+{
+    auto lengths = input.lens;
+
+    lengths[Axis] = indices.elements();
+    return make_shape(lengths, input.strides);
+}
+
+template <int Axis, class Input, class Indices, class Output>
+__device__ void gather(Input input, Indices indices, Output output)
+{
+    auto ind           = make_index();
+    auto axis_dim_size = input.get_shape().lens[Axis];
+
+    constexpr auto out_comp = gather_shape<Axis>(get_shape_c<Input>{}, get_shape_c<Indices>{});
+
+    ind.global_stride(output.get_shape().elements(), [&](auto i) {
+        auto idx      = out_comp.multi(i);
+        auto in_index = indices[idx[Axis]];
+
+        auto new_in_index = (in_index < 0) ? in_index + axis_dim_size : in_index;
+
+        idx[Axis] = new_in_index;
+
+        output[i] = input[idx];
+    });
+}
+
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/kernels/include/migraphx/kernels/gathernd.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/gathernd.hpp
@@ -26,7 +26,7 @@

 #include <migraphx/kernels/index.hpp>
 #include <migraphx/kernels/algorithm.hpp>
-
+#include <migraphx/kernels/ops.hpp>
 namespace migraphx {

 template <class T>
@@ -53,23 +53,17 @@ __device__ void gathernd(const T& data_t, const U& indices_t, const V& output_t,
    auto indices_shape_lens = indices_shape.lens;
    auto data_shape_lens    = data_shape.lens;
    auto num_slice_dims     = indices_shape_lens.back();
-    std::size_t num_slices  = accumulate(indices_shape_lens.begin(),
-                                        indices_shape_lens.end() - 1,
-                                        1,
-                                        std::multiplies<std::size_t>());
-    std::size_t slice_size  = accumulate(data_shape_lens.begin() + num_slice_dims + batch_dims,
+    std::size_t num_slices =
+        accumulate(indices_shape_lens.begin(), indices_shape_lens.end() - 1, 1, op::product{});
+    std::size_t slice_size = accumulate(data_shape_lens.begin() + num_slice_dims + batch_dims,
                                        data_shape_lens.end(),
                                        1,
-                                        std::multiplies<std::size_t>());
-    const std::size_t num_batches       = accumulate(data_shape_lens.begin(),
-                                               data_shape_lens.begin() + batch_dims,
-                                               1,
-                                               std::multiplies<std::size_t>());
-    const std::size_t data_batch_stride = accumulate(data_shape_lens.begin() + batch_dims,
-                                                     data_shape_lens.end(),
-                                                     1,
-                                                     std::multiplies<std::size_t>());
-    const auto num_slices_per_batch     = num_slices / num_batches;
+                                        op::product{});
+    const std::size_t num_batches =
+        accumulate(data_shape_lens.begin(), data_shape_lens.begin() + batch_dims, 1, op::product{});
+    const std::size_t data_batch_stride =
+        accumulate(data_shape_lens.begin() + batch_dims, data_shape_lens.end(), 1, op::product{});
+    const auto num_slices_per_batch = num_slices / num_batches;

    ind.global_stride(output_shape.elements(), [&](auto i) {
        const auto* indices_ptr     = indices_t.data();
@@ -83,15 +77,15 @@ __device__ void gathernd(const T& data_t, const U& indices_t, const V& output_t,
            int64_t index                   = slice_indices[idx];
            const std::size_t input_dim_idx = batch_dims + idx;
            const auto input_dim            = data_shape_lens[input_dim_idx];
-            assert(index >= -static_cast<int64_t>(input_dim) and
-                   index < static_cast<int64_t>(input_dim));
+            MIGRAPHX_ASSERT(index >= -static_cast<int64_t>(input_dim) and
+                            index < static_cast<int64_t>(input_dim));
            if(index < 0)
                index += input_dim;
            std::size_t size_from_slice_dims =
                accumulate(data_shape_lens.begin() + batch_dims + idx + 1,
                           data_shape_lens.begin() + batch_dims + num_slice_dims,
                           slice_size,
-                           std::multiplies<std::size_t>());
+                           op::product{});
            relative_slice_offset += index * size_from_slice_dims;
        }


--- a/src/targets/gpu/kernels/include/migraphx/kernels/hip.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/hip.hpp
@@ -24,11 +24,14 @@
 #ifndef MIGRAPHX_GUARD_KERNELS_HIP_HPP
 #define MIGRAPHX_GUARD_KERNELS_HIP_HPP

-// Workaround macro redefinition issue with clang tidy
-#if defined(__HIP_PLATFORM_HCC__) && defined(MIGRAPHX_USE_CLANG_TIDY)
-#undef __HIP_PLATFORM_HCC__ // NOLINT
-#endif
-
+#ifndef MIGRAPHX_USE_HIPRTC
 #include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#include <hip/math_functions.h>
+#include <hip/hip_math_constants.h>
+#elif defined(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS)
+#include <hip/hip_common.h>
+#include <hip/hip_math_constants.h>
+#endif

 #endif // MIGRAPHX_GUARD_KERNELS_HIP_HPP
--- a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
@@ -29,6 +29,7 @@
 #include <migraphx/kernels/integral_constant.hpp>
 #include <migraphx/kernels/type_traits.hpp>
 #include <migraphx/kernels/debug.hpp>
+#include <migraphx/kernels/functional.hpp>

 namespace migraphx {

@@ -137,42 +138,100 @@ struct index
        return (n - _c<1>) / stride + _c<1>;
    }

+    template <class N>
+    constexpr auto max_global_stride_iterations(N n) const
+    {
+        return max_stride_iterations(n, nglobal());
+    }
+
+    template <class N>
+    constexpr auto max_local_stride_iterations(N n) const
+    {
+        return max_stride_iterations(n, nlocal());
+    }
+
+    template <class F, class I, class D>
+    static constexpr auto invoke_loop(F f, I i, D d) -> decltype(f(i, d))
+    {
+        return f(i, d);
+    }
+
+    template <class F, class I, class D>
+    static constexpr auto invoke_loop(F f, I i, D) -> decltype(f(i))
+    {
+        return f(i);
+    }
+
    template <class F, class N, class Stride>
+    static constexpr void for_stride_loop_unroll(index_int start, N n, Stride stride, F f)
+    {
+        sequence(max_stride_iterations(n, stride), [&](auto... ks) {
+            fold([&](auto d, auto k) {
+                auto i = start + stride * k;
+                if(i < n)
+                    invoke_loop(f, i, d);
+                return d + _c<1>;
+            })(_c<0>, ks...);
+        });
+    }
+
+    template <class F, class N, class Stride>
+    static constexpr void for_stride_loop(index_int start, N n, Stride stride, F f)
+    {
+        index_int k = 0;
+        for(index_int i = start; i < n; i += stride)
+        {
+            invoke_loop(f, i, k);
+            k++;
+        }
+    }
+
+    template <bool Unroll, class F, class N, class Stride>
    static constexpr void for_stride(index_int start, N n, Stride stride, F f)
    {
        MIGRAPHX_ASSERT(start < stride);
-        if constexpr(not is_integral<N>{} and not is_integral<Stride>{} and
-                     max_stride_iterations(n, stride) == 1)
+        if constexpr(not is_integral<N>{} and not is_integral<Stride>{})
        {
-            if constexpr(stride > n)
+            if constexpr(max_stride_iterations(n, stride) == 1)
+            {
+                if constexpr(stride > n)
+                {
+                    if(start < n)
+                        invoke_loop(f, start, _c<0>);
+                }
+                else
+                {
+                    invoke_loop(f, start, _c<0>);
+                }
+            }
+            else if constexpr(Unroll)
            {
-                if(start < n)
-                    f(start);
+                MIGRAPHX_STATIC_ASSERT_FOR(max_stride_iterations(n, stride) < 256)
+                {
+                    for_stride_loop_unroll(start, n, stride, f);
+                }
            }
            else
            {
-                f(start);
+                for_stride_loop(start, n, stride, f);
            }
        }
        else
        {
-            for(index_int i = start; i < n; i += stride)
-            {
-                f(i);
-            }
+            for_stride_loop(start, n, stride, f);
        }
    }

    template <class F, class N>
    __device__ void global_stride(N n, F f) const
    {
-        for_stride(global, n, nglobal(), f);
+        for_stride<false>(global, n, nglobal(), f);
    }

    template <class F, class N>
    __device__ void local_stride(N n, F f) const
    {
-        for_stride(local, n, nlocal(), f);
+        for_stride<true>(local, n, nlocal(), f);
    }

    template <class F, class N>

--- a/src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp
@@ -46,28 +46,35 @@ template <index_int Axis,
 __device__ void generic_binary_layernorm(
    F compute, BinOp op, float eps, Output output, Input1 input1, Input2 input2, Inputs... inputs)
 {
+    using block         = reduce::auto_block<reduce::reduce_elements_with_axis<Input1, Axis>()>;
    using reduce_output = reduce::with_axis<Input1, Axis>;
-    reduce::block::run<reduce_output>([&](auto, auto r) {
-        using value_type         = typename Input1::type;
-        constexpr auto relements = r.template elements<Input1>();
-        auto means =
-            r.reduce(op::sum{}, make_array<vec_type<value_type>>(0, 0), [&](auto x1, auto x2) {
-                auto x = op(x1, x2);
-                return make_array(x, x * x) * vec_type<value_type>{1.0 / relements};
-            })(input1, input2);
+
+    block::template run<reduce_output>([&](auto, auto r) {
+        auto input       = r.inner([&](auto x1, auto x2) { return op(x1, x2); })(input1, input2);
+        using value_type = typename Input1::type;
+        constexpr auto relements   = r.template elements<Input1>();
+        constexpr auto relements_r = vec_type<value_type>{1.0 / relements};
+        auto relements_rsqrt       = sqrt(relements_r);
+
+        auto means = r.reduce(op::sum{}, make_array<vec_type<value_type>>(0, 0), [&](auto x) {
+            auto x_out = x * relements_r;
+            // dividing x by sqrt(relements) before squaring allows computing higher values
+            // before overflow in low precision
+            auto x2_sqrt = x * relements_rsqrt;
+            return make_array(x_out, x2_sqrt * x2_sqrt);
+        })(input);

        auto mean_x        = means[0];
        auto mean_x2       = means[1];
        auto variance      = mean_x2 - (mean_x * mean_x);
        value_type eps_val = eps; // implicit conversion for eps

-        r.inner([&](auto& y, auto x1, auto x2, auto... xs) {
-            auto x = op(x1, x2);
+        r.inner([&](auto& y, auto x, auto... xs) {
            auto m = x - mean_x;

            // m * rsqrt(mean(m ^ 2) + epsilon)
            y = compute(m * rsqrt(variance + eps_val), xs...);
-        })(output, input1, input2, inputs...);
+        })(output, input, inputs...);
    });
 }


--- a/src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
@@ -28,8 +28,7 @@
 #include <migraphx/kernels/vec.hpp>
 #include <migraphx/kernels/functional.hpp>
 #include <migraphx/kernels/type_traits.hpp>
-#include <hip/hip_fp16.h>
-#include <hip/math_functions.h>
+#include <migraphx/kernels/hip.hpp>

 namespace migraphx {

@@ -132,9 +131,14 @@ MIGRAPHX_DEVICE_MATH_FOR(float, fmod, ::fmodf)

 // Builtin half functions
 MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, abs, ::__habs)
+MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, ceil, ::hceil)
+MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, cos, ::hcos)
 MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, exp, ::hexp)
+MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, floor, ::hfloor)
+MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, isnan, ::__hisnan)
 MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, log, ::hlog)
 MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, rsqrt, ::hrsqrt)
+// MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, sin, ::hsin)
 MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, sqrt, ::hsqrt)

 // Use float to compute half overload
@@ -144,16 +148,11 @@ MIGRAPHX_DEVICE_MATH_HALF(asin, ::asin)
 MIGRAPHX_DEVICE_MATH_HALF(asinh, ::asinh)
 MIGRAPHX_DEVICE_MATH_HALF(atan, ::atan)
 MIGRAPHX_DEVICE_MATH_HALF(atanh, ::atanh)
-MIGRAPHX_DEVICE_MATH_HALF(ceil, ::ceil)
-MIGRAPHX_DEVICE_MATH_HALF(cos, ::cos)
 MIGRAPHX_DEVICE_MATH_HALF(cosh, ::cosh)
 MIGRAPHX_DEVICE_MATH_HALF(erf, ::erf)
-MIGRAPHX_DEVICE_MATH_HALF(floor, ::floor)
-MIGRAPHX_DEVICE_MATH_HALF(isnan, ::isnan)
 MIGRAPHX_DEVICE_MATH_HALF(pow, ::pow)
 MIGRAPHX_DEVICE_MATH_HALF(remainder, ::remainder)
 MIGRAPHX_DEVICE_MATH_HALF(round, ::round)
-MIGRAPHX_DEVICE_MATH_HALF(sin, ::sin)
 MIGRAPHX_DEVICE_MATH_HALF(sinh, ::sinh)
 MIGRAPHX_DEVICE_MATH_HALF(tan, ::tan)
 MIGRAPHX_DEVICE_MATH_HALF(tanh, ::tanh)
@@ -166,19 +165,19 @@ MIGRAPHX_DEVICE_MATH_HALF(fmod, ::fmod)
 // at this time are: exp2, exp10, log2, log10, isinf
 MIGRAPHX_DEVICE_MATH_HALF2(abs, ::__habs2)
 MIGRAPHX_DEVICE_MATH_HALF2(ceil, ::h2ceil)
-MIGRAPHX_DEVICE_MATH_HALF2(floor, ::h2floor)
-MIGRAPHX_DEVICE_MATH_HALF2(sin, ::h2sin)
 MIGRAPHX_DEVICE_MATH_HALF2(cos, ::h2cos)
 MIGRAPHX_DEVICE_MATH_HALF2(exp, ::h2exp)
-MIGRAPHX_DEVICE_MATH_HALF2(exp2, ::h2exp2)
 MIGRAPHX_DEVICE_MATH_HALF2(exp10, ::h2exp10)
-MIGRAPHX_DEVICE_MATH_HALF2(log2, ::h2log2)
+MIGRAPHX_DEVICE_MATH_HALF2(exp2, ::h2exp2)
+MIGRAPHX_DEVICE_MATH_HALF2(floor, ::h2floor)
+MIGRAPHX_DEVICE_MATH_HALF2(isinf, ::__hisinf2)
+MIGRAPHX_DEVICE_MATH_HALF2(isnan, ::__hisnan2)
 MIGRAPHX_DEVICE_MATH_HALF2(log, ::h2log)
 MIGRAPHX_DEVICE_MATH_HALF2(log10, ::h2log10)
+MIGRAPHX_DEVICE_MATH_HALF2(log2, ::h2log2)
 MIGRAPHX_DEVICE_MATH_HALF2(rsqrt, ::h2rsqrt)
+// MIGRAPHX_DEVICE_MATH_HALF2(sin, ::h2sin)
 MIGRAPHX_DEVICE_MATH_HALF2(sqrt, ::h2sqrt)
-MIGRAPHX_DEVICE_MATH_HALF2(isinf, ::__hisinf2)
-MIGRAPHX_DEVICE_MATH_HALF2(isnan, ::__hisnan2)

 template <class T, class U>
 constexpr auto where(bool cond, const T& a, const U& b)
@@ -218,6 +217,14 @@ constexpr auto min(const T& a, const U& b)
    return min<common_type_t<T, U>>(a, b);
 }

+// Sin for half is broken on hip, so use cos instead
+template <class T, MIGRAPHX_REQUIRES(is_same<vec_type<T>, half>{})>
+constexpr T sin(T x)
+{
+    constexpr const T shift = HIP_PIO2_F;
+    return migraphx::cos(shift - x);
+}
+
 MIGRAPHX_DEVICE_MATH_VEC(abs)
 MIGRAPHX_DEVICE_MATH_VEC(acos)
 MIGRAPHX_DEVICE_MATH_VEC(acosh)

--- a/src/targets/gpu/kernels/include/migraphx/kernels/ops.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/ops.hpp
@@ -56,13 +56,32 @@ struct id
    }
 };

+template <class T>
+struct convert_to
+{
+    template <class U>
+    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(U x) const
+    {
+        return convert<T>(x);
+    }
+};
+
+template <index_int N>
 struct mean
 {
-    index_int item_num = 1;
    template <class T>
-    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x) const
+    MIGRAPHX_DEVICE_CONSTEXPR T operator()(T x) const
    {
-        return x / static_cast<T>(item_num);
+        using type = vec_type<T>;
+        if constexpr(is_floating_point<type>{})
+        {
+            constexpr type d = 1.0 / N;
+            return x * d;
+        }
+        else
+        {
+            return x / static_cast<type>(N);
+        }
    }
 };