Unverified commit 23cb7917, authored by Brian Pickrell, committed by GitHub

Merge branch 'develop' into blas_tuning

parents b5fcc0bc ea32ca70
......@@ -44,7 +44,7 @@ namespace migraphx {
extern "C" {
__global__ void gathernd_kernel(void* in_data, void* in_indices, void* output)
MIGRAPHX_GLOBAL void gathernd_kernel(void* in_data, void* in_indices, void* output)
{
make_tensors()(in_data, in_indices, output)([](auto&&... xs) {
auto settings = make_gathernd_settings(MIGRAPHX_MAKE_CONSTANT(int64_t{BATCH_DIMS}));
......@@ -82,7 +82,7 @@ struct gathernd_compiler : compiler<gathernd_compiler>
compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
{
return replace(compile_op(ctx, to_shapes(ins->inputs()), op.to_value()));
return compile_op(ctx, to_shapes(ins->inputs()), op.to_value());
}
};
......
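Across these compiler hunks, `return replace(compile_op(...))` becomes `return compile_op(...)`: the compiled operation now converts to a `compiler_replace` directly, and a custom insertion step is expressed as a `{op, lambda}` pair (see the scatternd and mlir hunks below). A minimal sketch of what such a converting type could look like — hypothetical, not the verbatim struct from `migraphx/gpu/compiler.hpp`:

```cpp
#include <functional>
#include <utility>
#include <migraphx/operation.hpp>
#include <migraphx/module.hpp>
#include <migraphx/instruction_ref.hpp>

namespace migraphx { namespace gpu {

// Sketch only: implicitly constructible from an operation, with an optional
// insertion callback that receives the compiled op.
struct compiler_replace
{
    compiler_replace(const operation& op) : code_object{op} {}
    compiler_replace(const operation& op,
                     std::function<void(module&, instruction_ref, const operation&)> f)
        : code_object{op}, insert_fn{std::move(f)}
    {
    }

    operation code_object;
    std::function<void(module&, instruction_ref, const operation&)> insert_fn = nullptr;

    void replace(module& m, instruction_ref ins) const
    {
        if(insert_fn)
            insert_fn(m, ins, code_object);                          // custom path
        else
            m.replace_instruction(ins, code_object, ins->inputs()); // default path
    }
};

}} // namespace migraphx::gpu
```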
......@@ -48,7 +48,7 @@ namespace migraphx {
${preamble}
extern "C" {
__global__ void ${kernel}(${params})
MIGRAPHX_GLOBAL void ${kernel}(${params})
{
transform_args(make_tensors(), rotate_last(), ${transformers})(${args})([](auto... xs) {
${layernorm}<${axis}>(${post}, ${eps}, xs...);
......@@ -122,7 +122,7 @@ struct layernorm_compiler : compiler<layernorm_compiler>
v["kernel"] =
v["layernorm"].to<std::string>() + "_" + generate_name_from_ops(*pm) + "_kernel";
}
return replace(compile_op(ctx, to_shapes(ins->inputs()), v));
return compile_op(ctx, to_shapes(ins->inputs()), v);
}
};
......
......@@ -36,19 +36,32 @@ struct mlir_compiler : compiler<mlir_compiler>
operation compile_op(context&, const std::vector<shape>&, const value&) const { return {}; }
compiler_replace compile(context& ctx, instruction_ref ins, const operation&) const
compiler_replace
compile(const context& ctx, instruction_ref ins, const operation&, const value& solution) const
{
auto* smod = ins->module_inputs().front();
assert(smod->get_parameter_names().size() == ins->inputs().size() - 1);
return insert(compile_mlir(ctx, *smod, ins->inputs()));
return insert(compile_mlir(ctx, *smod, ins->inputs(), solution));
}
compiler_replace insert(code_object_op co) const
{
return [co = std::move(co)](module& m, instruction_ref ins) {
auto mlir = insert_mlir(m, ins, co, ins->inputs());
m.replace_instruction(ins, mlir);
};
return {std::move(co), [](module& m, instruction_ref ins, const operation& op) {
auto mlir = insert_mlir(m, ins, any_cast<code_object_op>(op), ins->inputs());
m.replace_instruction(ins, mlir);
}};
}
optional<tuning_config> get_tuning_config(const context& ctx,
instruction_ref ins,
const operation&,
bool exhaustive) const
{
if(not exhaustive)
return nullopt;
auto shapes = to_shapes(ins->inputs());
auto* smod = ins->module_inputs().front();
return get_tuning_config_mlir(ctx, *smod, shapes);
}
};
......
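The mlir_compiler now participates in tuning: `get_tuning_config` returns candidates only for exhaustive tuning (otherwise `nullopt`, so the default compile path runs untouched), and the chosen solution `value` is threaded back into `compile` and on to `compile_mlir`. A hedged sketch of how a driver might use the pair — the `benchmark` helper and the `solutions` member access are illustrative assumptions, not confirmed MIGraphX API:

```cpp
#include <limits>

// Illustrative tuning driver; benchmark() is hypothetical.
compiler_replace tune_then_compile(const mlir_compiler& mc,
                                   context& ctx,
                                   instruction_ref ins,
                                   const operation& op)
{
    value best{};
    if(auto tc = mc.get_tuning_config(ctx, ins, op, /*exhaustive=*/true))
    {
        double best_time = std::numeric_limits<double>::max();
        for(const auto& sol : tc->solutions) // assumes tuning_config lists candidates
        {
            double t = benchmark(ctx, ins, sol); // hypothetical: time compile+run with sol
            if(t < best_time)
            {
                best_time = t;
                best      = sol;
            }
        }
    }
    return mc.compile(ctx, ins, op, best); // the winning solution flows into compile_mlir
}
```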
......@@ -44,7 +44,7 @@ static const char* const pointwise_kernel = R"__migraphx__(
namespace migraphx {
extern "C" {
__global__ void pad_kernel(void* input_p, void* output_p)
MIGRAPHX_GLOBAL void pad_kernel(void* input_p, void* output_p)
{
auto offsets = index_ints<${offsets}>{};
auto idx = make_index();
......@@ -92,7 +92,7 @@ struct pad_compiler : compiler<pad_compiler>
compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
{
return replace(compile_op(ctx, to_shapes(ins->inputs()), op.to_value()));
return compile_op(ctx, to_shapes(ins->inputs()), op.to_value());
}
};
} // namespace gpu
......
......@@ -44,7 +44,7 @@ namespace migraphx {
${preamble}
extern "C" {
__global__ void ${kernel}(${params})
MIGRAPHX_GLOBAL void ${kernel}(${params})
{
auto idx = make_index();
pointwise(idx, ${transformers})(${lambda}, ${args});
......@@ -72,7 +72,7 @@ struct pointwise_compiler : compiler<pointwise_compiler>
hip_compile_options options;
options.inputs = inputs;
options.output = inputs.back();
options.virtual_inputs = reduce_dims(inputs);
options.virtual_inputs = reduce_dims(normalize_permutation(inputs));
options.params = "-Wno-float-equal";
auto axis = find_fast_axis(options.virtual_inputs);
auto vec = vectorize::elements(ctx, axis, options.virtual_inputs);
......@@ -93,10 +93,10 @@ struct pointwise_compiler : compiler<pointwise_compiler>
{
if(contains({"layout", "contiguous"}, op.name()))
{
return replace(compile_op(
return compile_op(
ctx,
to_shapes(ins->inputs()),
{{"lambda", "[](auto x) { return x; }"}, {"kernel", op.name() + "_kernel"}}));
{{"lambda", "[](auto x) { return x; }"}, {"kernel", op.name() + "_kernel"}});
}
else
{
......@@ -105,10 +105,9 @@ struct pointwise_compiler : compiler<pointwise_compiler>
auto pf = generate_pointwise(*pm, "inner_pointwise");
std::string lambda = "MIGRAPHX_LIFT(inner_pointwise)";
auto kernel_name = generate_name_from_ops(*pm) + "_kernel";
return replace(
compile_op(ctx,
to_shapes(ins->inputs()),
{{"lambda", lambda}, {"preamble", pf}, {"kernel", kernel_name}}));
return compile_op(ctx,
to_shapes(ins->inputs()),
{{"lambda", lambda}, {"preamble", pf}, {"kernel", kernel_name}});
}
}
};
......
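`reduce_dims` can only merge adjacent dimensions when all inputs agree on an axis ordering, so passing the shapes through `normalize_permutation` first rewrites transposed inputs into one common order; more dimensions then collapse, and `find_fast_axis`/`vectorize` see longer contiguous runs. The core idea, sketched as a stride argsort akin to migraphx's `find_permutation` (standalone illustration, not the library implementation):

```cpp
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

// Recover the axis order implied by a shape's strides, so all inputs can be
// rewritten into one common layout before reduce_dims merges dimensions.
std::vector<std::size_t> find_permutation(const std::vector<std::size_t>& strides)
{
    std::vector<std::size_t> perm(strides.size());
    std::iota(perm.begin(), perm.end(), 0);
    // Sort axes by descending stride: the slowest-moving axis comes first.
    std::stable_sort(perm.begin(), perm.end(), [&](std::size_t a, std::size_t b) {
        return strides[a] > strides[b];
    });
    return perm;
}
// e.g. strides {2048, 1, 256, 32} (NHWC-like) -> perm {0, 2, 3, 1}
```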
......@@ -45,7 +45,7 @@ namespace migraphx {
${preamble}
extern "C" {
__global__ void reduce_kernel(void* input_p, void* output_p)
MIGRAPHX_GLOBAL void reduce_kernel(void* input_p, void* output_p)
{
transform_args(make_tensors(), ${transformers})(input_p, output_p)([](auto input, auto output) {
......@@ -84,7 +84,7 @@ static shape get_reduced_shape(const shape& s, const std::vector<T>& axes)
std::fill(lens.begin(), lens.end(), 1);
for(const auto& axis : axes)
lens[axis] = s.lens()[axis];
return shape{s.type(), lens};
return s.with_lens(lens);
}
template <class T>
......@@ -93,7 +93,7 @@ static shape get_output_shape(const shape& s, const std::vector<T>& axes)
auto lens = s.lens();
for(const auto& axis : axes)
lens[axis] = 1;
return shape{s.type(), lens};
return s.with_lens(lens);
}
template <class ReduceLens>
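`shape{s.type(), lens}` always yields a packed row-major shape, silently dropping the layout of `s`; `s.with_lens(lens)` keeps the original axis permutation, so reductions over non-standard layouts (e.g. NHWC) produce shapes consistent with their inputs. A small sketch, with stride values worked out under the assumption that `with_lens` re-packs along the original permutation:

```cpp
#include <migraphx/shape.hpp>

// s is laid out with axis order {0, 2, 1} (strides 12, 1, 3).
migraphx::shape s{migraphx::shape::float_type, {2, 3, 4}, {12, 1, 3}};

// Packed row-major: strides {4, 4, 1}; the {0, 2, 1} ordering is lost.
auto packed = migraphx::shape{s.type(), {2, 1, 4}};

// with_lens keeps the {0, 2, 1} ordering: strides come out as {4, 1, 1}.
auto kept = s.with_lens({2, 1, 4});
```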
......@@ -189,7 +189,7 @@ struct simple_reduce_compiler : compiler<simple_reduce_compiler>
v["read"] = r.read;
v["write"] = r.write;
v["init"] = r.init;
return replace(compile_op(ctx, to_shapes(ins->inputs()), v));
return compile_op(ctx, to_shapes(ins->inputs()), v);
}
};
......@@ -228,7 +228,7 @@ struct fused_reduce_compiler : compiler<fused_reduce_compiler>
auto virtual_inputs = inputs;
virtual_inputs.push_back(get_reduced_shape(inputs.front(), axes));
virtual_inputs.push_back(get_output_shape(inputs.front(), axes));
virtual_inputs = reduce_dims(virtual_inputs);
virtual_inputs = reduce_dims(normalize_permutation(virtual_inputs));
auto reduce_output_shape = virtual_inputs.back();
virtual_inputs.pop_back();
auto reduction_shape = virtual_inputs.back();
......@@ -285,7 +285,7 @@ struct fused_reduce_compiler : compiler<fused_reduce_compiler>
v["preamble"] = generate_reduce(*rm, "fused_reduce_op");
v["lambda"] = "MIGRAPHX_LIFT(fused_reduce_op)";
v["kernel"] = generate_name_from_ops(*rm) + "_kernel";
return replace(compile_op(ctx, to_shapes(ins->inputs()), v));
return compile_op(ctx, to_shapes(ins->inputs()), v);
}
};
} // namespace gpu
......
......@@ -41,7 +41,7 @@ namespace migraphx {
extern "C" {
__global__ void roialign_kernel(void* in_x, void* in_rois, void* in_ind, void* y)
MIGRAPHX_GLOBAL void roialign_kernel(void* in_x, void* in_rois, void* in_ind, void* y)
{
make_tensors()(in_x, in_rois, in_ind, y)([](auto&&... xs) {
auto settings = make_roalign_settings(MIGRAPHX_MAKE_CONSTANT(float{ROIS_OFFSET}),
......@@ -92,7 +92,7 @@ struct roialign_compiler : compiler<roialign_compiler>
compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
{
return replace(compile_op(ctx, to_shapes(ins->inputs()), op.to_value()));
return compile_op(ctx, to_shapes(ins->inputs()), op.to_value());
}
};
......
......@@ -42,7 +42,7 @@ namespace migraphx {
extern "C" {
__global__ void scatternd_kernel(void* in_indices, void* in_updates, void* output)
MIGRAPHX_GLOBAL void scatternd_kernel(void* in_indices, void* in_updates, void* output)
{
make_tensors()(in_indices, in_updates, output)([](auto&&... xs) {
scatternd(xs..., ${reduction}{});
......@@ -85,15 +85,15 @@ struct scatternd_compiler : compiler<scatternd_compiler>
{{"reduction", reduction}}));
}
compiler_replace insert(const operation& op) const
compiler_replace insert(const operation& co) const
{
return [=](module& m, instruction_ref ins) {
auto args = ins->inputs();
args.back() =
m.insert_instruction(ins, make_op("hip::copy"), args.front(), args.back());
args.erase(args.begin());
return m.replace_instruction(ins, op, args);
};
return {co, [](module& m, instruction_ref ins, const operation& op) {
auto args = ins->inputs();
args.back() =
m.insert_instruction(ins, make_op("hip::copy"), args.front(), args.back());
args.erase(args.begin());
return m.replace_instruction(ins, op, args);
}};
}
};
......
......@@ -45,7 +45,7 @@ static const char* const softmax_kernel = R"__migraphx__(
namespace migraphx {
extern "C" {
__global__ void softmax_kernel(void* input_p, void* output_p)
MIGRAPHX_GLOBAL void softmax_kernel(void* input_p, void* output_p)
{
transform_args(make_tensors(), ${transformers})(input_p, output_p)([](auto input, auto output) {
softmax<${axis}>(input, output);
......@@ -95,7 +95,7 @@ struct softmax_compiler : compiler<softmax_compiler>
compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
{
return replace(compile_op(ctx, to_shapes(ins->inputs()), op.to_value()));
return compile_op(ctx, to_shapes(ins->inputs()), op.to_value());
}
};
......
......@@ -272,6 +272,18 @@ struct integral_const_array : array<T, sizeof...(Xs)>
MIGRAPHX_DEVICE_CONSTEXPR integral_const_array() : base_array({Xs...}) {}
};
template <class T, class... Ts>
constexpr auto make_const_array(T x, Ts... xs)
{
return integral_const_array<typename T::value_type, x, xs...>{};
}
template <class T, T... Xs, class F>
constexpr auto unpack(integral_const_array<T, Xs...>, F f)
{
return f(_c<Xs>...);
}
template <class T, T... Xs, class F>
constexpr auto transform(integral_const_array<T, Xs...>, F f)
{
......
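A hedged usage sketch for the two new helpers: `make_const_array` lifts `integral_constant` arguments into an `integral_const_array`, and `unpack` explodes the array back into `_c<...>` constants so a lambda can reorder or consume them — exactly the shape of `ck_transposeb_dims` further down:

```cpp
constexpr auto dims = make_const_array(_c<64>, _c<128>); // integral_const_array<..., 64, 128>

// unpack hands the elements to the lambda as integral constants:
constexpr auto swapped = unpack(dims, [](auto k, auto n) { return make_const_array(n, k); });
static_assert(swapped[0] == 128);
static_assert(swapped[1] == 64);
```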
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_KERNELS_CK_HPP
#define MIGRAPHX_GUARD_KERNELS_CK_HPP
#include <migraphx/kernels/debug.hpp>
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/type_traits.hpp>
#include <migraphx/kernels/tensor_view.hpp>
#include <ck/utility/common_header.hpp>
#include <ck/tensor_description/tensor_descriptor.hpp>
#include <ck/tensor_description/tensor_descriptor_helper.hpp>
#include <ck/tensor_operation/gpu/device/tensor_layout.hpp>
namespace migraphx {
namespace detail {
template <class T>
struct to_ck_type_impl
{
using type = T;
};
template <>
struct to_ck_type_impl<migraphx::half>
{
using type = ck::half_t;
};
template <class T>
struct to_ck_type_impl<const T>
{
using type = const typename to_ck_type_impl<T>::type;
};
template <class Shape>
constexpr bool is_row_major()
{
constexpr auto strides = Shape{}.strides;
MIGRAPHX_ASSERT(strides.size() >= 2);
if(strides.back() == 1)
{
MIGRAPHX_ASSERT(not Shape{}.is_transposed());
return true;
}
MIGRAPHX_ASSERT(strides[strides.size() - 2] == 1);
return false;
}
} // namespace detail
template <class T>
using to_ck_type = typename detail::to_ck_type_impl<T>::type;
template <class T>
constexpr auto to_ck_pointer(T* x)
{
return static_cast<to_ck_type<T>*>(x);
}
template <class T>
constexpr auto to_ck_const_pointer(const T* x)
{
return static_cast<const to_ck_type<T>*>(x);
}
template <class Shape>
using to_ck_gemm_layout = conditional_t<detail::is_row_major<get_shape_c<Shape>>(),
ck::tensor_layout::gemm::RowMajor,
ck::tensor_layout::gemm::ColumnMajor>;
template <class Tensor>
constexpr auto to_ck_tensor()
{
constexpr auto s = get_shape_c<Tensor>{};
return sequence(s.lens.size(), [&](auto... is) {
return ck::make_naive_tensor_descriptor(ck::make_tuple(s.lens[is]...),
ck::make_tuple(s.strides[is]...));
});
}
template <class F>
struct ck_function_adaptor : F
{
template <class... Ts>
constexpr ck_function_adaptor(Ts&&... xs) : F(static_cast<Ts&&>(xs)...)
{
}
template <class T, class... Ts>
constexpr void operator()(T& out, Ts&&... xs) const
{
out = static_cast<const F&>(*this)(static_cast<Ts&&>(xs)...);
}
};
struct ck_nop
{
template <class T>
constexpr void operator()(T&) const
{
}
};
struct ck_passthrough
{
template <class T, class U>
constexpr void operator()(T& y, U x) const
{
y = x;
}
};
struct ck_scale
{
constexpr ck_scale(float s) : scale(s) {}
template <class T, class U>
constexpr void operator()(T& y, U x) const
{
y = x * static_cast<U>(scale);
}
float scale;
};
struct ck_add
{
template <class T, class U>
constexpr void operator()(T& y, U x) const
{
y += x;
}
};
#ifdef MIGRAPHX_CK_CHECK
#define MIGRAPHX_CK_STATIC_ASSERT static_assert
#else
#define MIGRAPHX_CK_STATIC_ASSERT(...)
#endif
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_CK_HPP
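Taken together, these helpers adapt MIGraphX's compile-time shapes to CK's API: `to_ck_type` maps element types, `is_row_major` classifies a matrix by whether its innermost stride is 1, `to_ck_gemm_layout` turns that into CK's layout tags, and `to_ck_tensor` rebuilds the full lens/strides pair as a naive CK descriptor. A hedged sanity sketch of the type mapping (assumes these headers are in scope):

```cpp
// to_ck_type sends migraphx::half to ck::half_t, preserves const, and passes
// every other type through unchanged.
static_assert(is_same<to_ck_type<float>, float>{});
static_assert(is_same<to_ck_type<migraphx::half>, ck::half_t>{});
static_assert(is_same<to_ck_type<const migraphx::half>, const ck::half_t>{});
```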
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_KERNELS_CK_GEMM_HPP
#define MIGRAPHX_GUARD_KERNELS_CK_GEMM_HPP
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/algorithm.hpp>
#include <migraphx/kernels/integral_constant.hpp>
#include <migraphx/kernels/tensor_view.hpp>
#include <migraphx/kernels/ck.hpp>
#include <migraphx/kernels/gemm_batcher.hpp>
namespace migraphx {
// In CK, the B matrix is ordered as N,K instead of K,N
template <class Dims>
constexpr auto ck_transposeb_dims(Dims dims)
{
return unpack(dims, [](auto k, auto n) { return make_const_array(n, k); });
}
template <class Tensor>
using ck_transposeb = decltype(make_shape(ck_transposeb_dims(get_shape_c<Tensor>{}.lens),
ck_transposeb_dims(get_shape_c<Tensor>{}.strides)));
template <class G, class E, class A, class B, class... Ds>
__device__ void ck_gemm_matrix(E e, A a, B b, Ds... ds)
{
constexpr auto desc = G::make_descriptor(to_ck_tensor<A>(),
to_ck_tensor<ck_transposeb<B>>(),
ck::make_tuple(to_ck_tensor<Ds>()...),
to_ck_tensor<E>());
static_assert(desc.IsValid(), "Invalid ck gemm.");
G::Run(desc,
to_ck_const_pointer(a.data()),
to_ck_const_pointer(b.data()),
ck::make_tuple(to_ck_const_pointer(ds.data())...),
to_ck_pointer(e.data()));
}
template <class G, index_int BlocksPerBatch, class... Ts>
__device__ void ck_gemm(Ts... xs)
{
gemm_batch_args(make_index(), _c<BlocksPerBatch>, xs...)(
[](auto... ys) { ck_gemm_matrix<G>(ys...); });
}
} // namespace migraphx
#endif
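CK wants B as an N,K matrix, so `ck_transposeb` swaps the last two dims of B's compile-time shape: a row-major K,N buffer is thereby presented to CK as a column-major N,K view of the same memory, which `is_row_major()` then classifies correctly. A quick check of the dim swap using the helpers above (sizes invented):

```cpp
constexpr auto b_dims = make_const_array(_c<64>, _c<128>); // K = 64, N = 128
constexpr auto nk     = ck_transposeb_dims(b_dims);        // -> {128, 64}
static_assert(nk[0] == 128 and nk[1] == 64);
```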
......@@ -122,12 +122,14 @@ struct source_location_capture
{
T x;
source_location loc;
template <class U, class = decltype(T(U{}))>
// declval is a workaround: requiring a default constructor for "U" does not work with rocm-5.6
template <class U>
static U&& declval();
template <class U, class = decltype(T(declval<U>()))>
constexpr source_location_capture(U px, source_location ploc = source_location{})
: x(px), loc(ploc)
{
}
constexpr operator source_location() const { return loc; }
constexpr operator T() const { return x; }
......
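The SFINAE guard moves from `decltype(T(U{}))` to `decltype(T(declval<U>()))` because `U{}` additionally requires `U` to be default-constructible (and, per the comment, misbehaved under rocm-5.6). A standalone illustration of the difference, with hypothetical types and standard `std::declval` in place of the local workaround:

```cpp
#include <type_traits>
#include <utility>

struct arg { explicit arg(int); };  // not default-constructible
struct wrapper { wrapper(arg); };   // constructible from arg

// using bad = decltype(wrapper(arg{}));              // ill-formed: arg{} needs a default ctor
using good = decltype(wrapper(std::declval<arg>()));  // OK: unevaluated arg&&
static_assert(std::is_constructible_v<wrapper, arg>);
```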
......@@ -32,8 +32,17 @@
// NOLINTNEXTLINE
#define MIGRAPHX_LIFT(...) \
[](auto&&... private_lisft_xs) MIGRAPHX_RETURNS( \
(__VA_ARGS__)(static_cast<decltype(private_lisft_xs)>(private_lisft_xs)...))
[](auto&&... private_lifts_xs) MIGRAPHX_RETURNS( \
(__VA_ARGS__)(static_cast<decltype(private_lifts_xs)>(private_lifts_xs)...))
// NOLINTNEXTLINE
#define MIGRAPHX_LIFT_CLASS(name, ...) \
struct name \
{ \
template <class... PrivateLiftTs> \
constexpr auto operator()(PrivateLiftTs&&... private_lifts_xs) const MIGRAPHX_RETURNS( \
(__VA_ARGS__)(static_cast<decltype(private_lifts_xs)>(private_lifts_xs)...)) \
}
namespace migraphx {
......
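Where `MIGRAPHX_LIFT` wraps an overload set in an anonymous lambda at each use site, the new `MIGRAPHX_LIFT_CLASS` names a reusable function object that can be default-constructed or passed as a template argument. A hedged usage sketch (`my_abs` is an invented overload set):

```cpp
constexpr int    my_abs(int x) { return x < 0 ? -x : x; }
constexpr double my_abs(double x) { return x < 0 ? -x : x; }

// Defines: struct lift_abs { constexpr auto operator()(...) const ...; };
MIGRAPHX_LIFT_CLASS(lift_abs, my_abs);

static_assert(lift_abs{}(-3) == 3);
static_assert(lift_abs{}(-2.5) == 2.5);
```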
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_KERNELS_GEMM_BATCHER_HPP
#define MIGRAPHX_GUARD_KERNELS_GEMM_BATCHER_HPP
#include <migraphx/kernels/tensor_view.hpp>
#include <migraphx/kernels/functional.hpp>
#include <migraphx/kernels/index.hpp>
namespace migraphx {
template <class Tensor>
constexpr auto gemm_get_batches()
{
constexpr auto lens = get_shape_c<Tensor>{}.lens;
constexpr auto strides = get_shape_c<Tensor>{}.strides;
constexpr auto new_lens = sequence(
lens.size() - _c<2>, [&](auto... is) { return make_const_array(_c<lens[is]>...); });
constexpr auto new_strides = sequence(
strides.size() - _c<2>, [&](auto... is) { return make_const_array(_c<strides[is]>...); });
return make_shape(new_lens, new_strides);
}
template <class Tensor>
constexpr auto gemm_get_matrix()
{
constexpr auto lens = get_shape_c<Tensor>{}.lens;
constexpr auto strides = get_shape_c<Tensor>{}.strides;
constexpr auto m = lens.size() - _c<2>;
constexpr auto n = lens.size() - _c<1>;
constexpr auto new_lens = make_const_array(_c<lens[m]>, _c<lens[n]>);
constexpr auto new_strides = make_const_array(_c<strides[m]>, _c<strides[n]>);
return make_shape(new_lens, new_strides);
}
template <class Tensor, class T>
constexpr auto gemm_batch_slice(Tensor t, T i)
{
constexpr auto batch = gemm_get_batches<Tensor>();
constexpr auto matrix = gemm_get_matrix<Tensor>();
MIGRAPHX_ASSERT((batch.index(i) + matrix.element_space()) <= t.get_shape().element_space());
return make_tensor_view(t.data() + batch.index(i), matrix);
}
template <class BlocksPerBatch, class T, class... Ts>
constexpr auto gemm_batch_args(index idx, BlocksPerBatch bpb, T x, Ts... xs)
{
return [=](auto f) {
// All tensors should have the same rank
static_assert(
(true and ... and (get_shape_c<T>{}.lens.size() == get_shape_c<Ts>{}.lens.size())));
if constexpr(get_shape_c<T>{}.lens.size() > 2)
{
// Get the first batch since all batches should have the same number of elements
constexpr auto batch = gemm_get_batches<T>();
static_assert(
(true and ... and (batch.elements() == gemm_get_batches<Ts>().elements())));
idx.group_stride(bpb * batch.elements(), [&](auto gidx) {
const auto batch_idx = gidx / bpb;
f(gemm_batch_slice(x, batch_idx), gemm_batch_slice(xs, batch_idx)...);
});
}
else
{
f(x, xs...);
}
};
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_GEMM_BATCHER_HPP
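To make the batching concrete: for rank-3 tensors with lens {4, 256, 256}, `gemm_get_batches` yields the {4} batch shape, and with `BlocksPerBatch` groups cooperating per matrix, group `g` slices out batch `g / BlocksPerBatch` via `gemm_batch_slice`. A tiny host-side analog of that index mapping (numbers illustrative only):

```cpp
#include <cstddef>
#include <cstdio>

int main()
{
    const std::size_t batches = 4, blocks_per_batch = 2;
    // group_stride launches over batches * blocks_per_batch groups:
    for(std::size_t g = 0; g < batches * blocks_per_batch; ++g)
    {
        std::size_t batch_idx = g / blocks_per_batch; // which matrix this group works on
        std::printf("group %zu -> batch %zu\n", g, batch_idx);
    }
    return 0;
}
```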
......@@ -28,10 +28,6 @@
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <hip/math_functions.h>
#include <hip/hip_math_constants.h>
#elif defined(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS)
#include <hip/hip_common.h>
#include <hip/hip_math_constants.h>
#endif
#endif // MIGRAPHX_GUARD_KERNELS_HIP_HPP
......@@ -130,6 +130,8 @@ struct index
return blockDim.x;
}
#endif
constexpr auto ngroup() const { return nglobal() / max_nlocal(); }
template <class N, class Stride>
static constexpr auto max_stride_iterations(N n, Stride stride)
{
......@@ -231,6 +233,12 @@ struct index
{
for_stride<true>(local, n, nlocal(), f);
}
template <class F, class N>
__device__ void group_stride(N n, F f) const
{
for_stride<false>(group, n, ngroup(), f);
}
};
#ifdef MIGRAPHX_NLOCAL
......
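`ngroup()` is the grid's group count (`nglobal() / max_nlocal()`), and `group_stride` is the group-level sibling of the existing global/local strided loops: group `i` processes work items `i, i + ngroup(), i + 2*ngroup(), ...` below `n`. This is what `gemm_batch_args` uses to spread `BlocksPerBatch * batch.elements()` units of work across the grid. A host analog of the assumed semantics:

```cpp
#include <cstdio>

int main()
{
    const int ngroup = 16, n = 40; // e.g. nglobal() == 4096, max_nlocal() == 256
    for(int group = 0; group < ngroup; ++group)
        for(int i = group; i < n; i += ngroup) // group 3 visits 3, 19, 35
            std::printf("group %d -> item %d\n", group, i);
}
```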
......@@ -138,7 +138,7 @@ MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, floor, ::hfloor)
MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, isnan, ::__hisnan)
MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, log, ::hlog)
MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, rsqrt, ::hrsqrt)
// MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, sin, ::hsin)
MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, sin, ::hsin)
MIGRAPHX_DEVICE_MATH_FOR(migraphx::half, sqrt, ::hsqrt)
// Use float to compute half overload
......@@ -161,8 +161,7 @@ MIGRAPHX_DEVICE_MATH_HALF(fmod, ::fmod)
// Map math functions to hip half2 functions
// The half2 type is defined in include/hip/amd_detail/hip_fp16_gcc.h and is 2 16-bit floats
// packed into a 32-bit number. See include/hip/amd_detail/hip_fp16_math_fwd.h for the HIP names
// Most but not all of these math ops have operators of the same names. Ones not yet implemented
// at this time are: exp2, exp10, log2, log10, isinf
// Most but not all of these math ops have operators of the same names.
MIGRAPHX_DEVICE_MATH_HALF2(abs, ::__habs2)
MIGRAPHX_DEVICE_MATH_HALF2(ceil, ::h2ceil)
MIGRAPHX_DEVICE_MATH_HALF2(cos, ::h2cos)
......@@ -176,7 +175,7 @@ MIGRAPHX_DEVICE_MATH_HALF2(log, ::h2log)
MIGRAPHX_DEVICE_MATH_HALF2(log10, ::h2log10)
MIGRAPHX_DEVICE_MATH_HALF2(log2, ::h2log2)
MIGRAPHX_DEVICE_MATH_HALF2(rsqrt, ::h2rsqrt)
// MIGRAPHX_DEVICE_MATH_HALF2(sin, ::h2sin)
MIGRAPHX_DEVICE_MATH_HALF2(sin, ::h2sin)
MIGRAPHX_DEVICE_MATH_HALF2(sqrt, ::h2sqrt)
template <class T, class U>
......@@ -189,9 +188,8 @@ MIGRAPHX_DEVICE_MATH_BINARY_FOR(float, max, ::max)
MIGRAPHX_DEVICE_MATH_BINARY_FOR(float, min, ::min)
MIGRAPHX_DEVICE_MATH_BINARY_FOR(double, max, ::max)
MIGRAPHX_DEVICE_MATH_BINARY_FOR(double, min, ::min)
// Add overloads for half that calls the float version
MIGRAPHX_DEVICE_MATH_BINARY_FOR(migraphx::half, max, ::fmaxf)
MIGRAPHX_DEVICE_MATH_BINARY_FOR(migraphx::half, min, ::fminf)
MIGRAPHX_DEVICE_MATH_BINARY_FOR(migraphx::half, max, ::__hmax)
MIGRAPHX_DEVICE_MATH_BINARY_FOR(migraphx::half, min, ::__hmin)
template <class T, MIGRAPHX_REQUIRES(not is_any_vec<T>())>
constexpr auto max(const T& a, const T& b)
......@@ -217,14 +215,6 @@ constexpr auto min(const T& a, const U& b)
return min<common_type_t<T, U>>(a, b);
}
// Sin for half is broken on hip, so use cos instead
template <class T, MIGRAPHX_REQUIRES(is_same<vec_type<T>, half>{})>
constexpr T sin(T x)
{
constexpr const T shift = HIP_PIO2_F;
return migraphx::cos(shift - x);
}
MIGRAPHX_DEVICE_MATH_VEC(abs)
MIGRAPHX_DEVICE_MATH_VEC(acos)
MIGRAPHX_DEVICE_MATH_VEC(acosh)
......
......@@ -244,13 +244,13 @@ __device__ void print_once(Ts... xs)
template <class... Ts>
__device__ void println(Ts... xs)
{
print_each(&coutln, xs...);
print_each(&cout, xs..., '\n');
}
template <class... Ts>
__device__ void println_once(Ts... xs)
{
print_each_once(&coutln, xs...);
print_each_once(&cout, xs..., '\n');
}
} // namespace migraphx
......
......@@ -79,20 +79,21 @@ __device__ void dpp_reduce(T& in, Op op)
#endif
// NOLINTNEXTLINE
#define MIGRAPHX_DPP_REDUCE(op, prefix) \
#define MIGRAPHX_DPP_REDUCE(op, prefix, sign) \
__device__ inline void dpp_reduce(double& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f64); } \
__device__ inline void dpp_reduce(float& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f32); } \
__device__ inline void dpp_reduce(half& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f16); } \
__device__ inline void dpp_reduce(int32_t& x, op) \
{ \
MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_u32); \
MIGRAPHX_DPP_REDUCE_ASM(x, prefix##sign##32); \
} \
__device__ inline void dpp_reduce(uint32_t& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_u32); }
MIGRAPHX_DPP_REDUCE(op::sum, v_add)
MIGRAPHX_DPP_REDUCE(op::max, v_max)
MIGRAPHX_DPP_REDUCE(op::min, v_min)
MIGRAPHX_DPP_REDUCE(op::product, v_mul)
// Note: for max and min on int32_t, the signed version of the instruction must be used.
MIGRAPHX_DPP_REDUCE(op::sum, v_add, _u)
MIGRAPHX_DPP_REDUCE(op::product, v_mul, _u)
MIGRAPHX_DPP_REDUCE(op::max, v_max, _i)
MIGRAPHX_DPP_REDUCE(op::min, v_min, _i)
template <class Op, class T, class Index, class F>
__device__ auto block_reduce(index idx, Op op, T init, Index n, F f)
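The new `sign` parameter exists because two's-complement add and mul are bit-identical for signed and unsigned operands, so `v_add_u32`/`v_mul_u32` remain safe for `int32_t`, but comparisons are not: unsigned max/min mis-order negative values. A host-side illustration:

```cpp
#include <cstdint>
#include <cstdio>

int main()
{
    int32_t a = -1, b = 1;
    auto ua = static_cast<uint32_t>(a); // 0xFFFFFFFF
    auto ub = static_cast<uint32_t>(b); // 0x00000001
    // Unsigned max picks 0xFFFFFFFF, i.e. -1 when reinterpreted as signed:
    std::printf("unsigned max: %d\n", static_cast<int32_t>(ua > ub ? ua : ub)); // -1 (wrong)
    std::printf("signed   max: %d\n", a > b ? a : b);                           //  1 (right)
}
```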
......@@ -570,7 +571,7 @@ template <class Algo, class Reduced, class Output, class F>
__device__ void fused_reduce(Output output, F f)
{
Algo::template run<Reduced>([&](auto out_idx, auto r) {
auto result = f(r);
auto result = f(r, out_idx);
if constexpr(reduce::is_inner_storage<decltype(result)>{})
{
r.inner([&](auto& y, auto x) { y = x; })(output, result);
......
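Passing `out_idx` into the reducer lambda lets fused post-ops consume per-output-element data, not just the reduced value. A hedged comment sketch of a reducer that uses the output coordinate (the `reduce`/`bias` details are assumptions about the surrounding reduce machinery):

```cpp
// f = [](auto r, auto out_idx) {
//     auto sum = r.reduce(op::sum{}, 0, op::id{})(input);
//     return sum + bias[out_idx]; // per-output data, now reachable via out_idx
// };
```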