Merge branch 'dyn_batch_pass' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into dyn_test_runner

5af9aac0 · charlie · 7b2516e0 · 05e81ed3 · 5af9aac0 · 5af9aac0
Commit 5af9aac0 authored Feb 21, 2023 by charlie
20 changed files
--- a/src/targets/gpu/kernels/include/migraphx/kernels/gathernd.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/gathernd.hpp
@@ -26,7 +26,7 @@
 #include <migraphx/kernels/index.hpp>
 #include <migraphx/kernels/algorithm.hpp>
+#include <migraphx/kernels/ops.hpp>
 namespace migraphx {
 template <class T>
@@ -53,22 +53,16 @@ __device__ void gathernd(const T& data_t, const U& indices_t, const V& output_t,
    auto indices_shape_lens = indices_shape.lens;
    auto data_shape_lens    = data_shape.lens;
    auto num_slice_dims     = indices_shape_lens.back();
-    std::size_t num_slices  = accumulate(indices_shape_lens.begin(),
+    std::size_t num_slices =
-                                        indices_shape_lens.end() - 1,
+        accumulate(indices_shape_lens.begin(), indices_shape_lens.end() - 1, 1, op::product{});
-                                        1,
-                                        std::multiplies<std::size_t>());
    std::size_t slice_size = accumulate(data_shape_lens.begin() + num_slice_dims + batch_dims,
                                        data_shape_lens.end(),
                                        1,
-                                        std::multiplies<std::size_t>());
+                                        op::product{});
-    const std::size_t num_batches       = accumulate(data_shape_lens.begin(),
+    const std::size_t num_batches =
-                                               data_shape_lens.begin() + batch_dims,
+        accumulate(data_shape_lens.begin(), data_shape_lens.begin() + batch_dims, 1, op::product{});
-                                               1,
+    const std::size_t data_batch_stride =
-                                               std::multiplies<std::size_t>());
+        accumulate(data_shape_lens.begin() + batch_dims, data_shape_lens.end(), 1, op::product{});
-    const std::size_t data_batch_stride = accumulate(data_shape_lens.begin() + batch_dims,
-                                                     data_shape_lens.end(),
-                                                     1,
-                                                     std::multiplies<std::size_t>());
    const auto num_slices_per_batch = num_slices / num_batches;
    ind.global_stride(output_shape.elements(), [&](auto i) {
@@ -83,7 +77,7 @@ __device__ void gathernd(const T& data_t, const U& indices_t, const V& output_t,
            int64_t index                   = slice_indices[idx];
            const std::size_t input_dim_idx = batch_dims + idx;
            const auto input_dim            = data_shape_lens[input_dim_idx];
-            assert(index >= -static_cast<int64_t>(input_dim) and
+            MIGRAPHX_ASSERT(index >= -static_cast<int64_t>(input_dim) and
                            index < static_cast<int64_t>(input_dim));
            if(index < 0)
                index += input_dim;
@@ -91,7 +85,7 @@ __device__ void gathernd(const T& data_t, const U& indices_t, const V& output_t,
                accumulate(data_shape_lens.begin() + batch_dims + idx + 1,
                           data_shape_lens.begin() + batch_dims + num_slice_dims,
                           slice_size,
-                           std::multiplies<std::size_t>());
+                           op::product{});
            relative_slice_offset += index * size_from_slice_dims;
        }

--- a/src/targets/gpu/kernels/include/migraphx/kernels/hip.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/hip.hpp
@@ -24,11 +24,14 @@
 #ifndef MIGRAPHX_GUARD_KERNELS_HIP_HPP
 #define MIGRAPHX_GUARD_KERNELS_HIP_HPP
-// Workaround macro redefinition issue with clang tidy
+#ifndef MIGRAPHX_USE_HIPRTC
-#if defined(__HIP_PLATFORM_HCC__) && defined(MIGRAPHX_USE_CLANG_TIDY)
-#undef __HIP_PLATFORM_HCC__ // NOLINT
-#endif
 #include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#include <hip/math_functions.h>
+#include <hip/hip_math_constants.h>
+#elif defined(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS)
+#include <hip/hip_common.h>
+#include <hip/hip_math_constants.h>
+#endif
 #endif // MIGRAPHX_GUARD_KERNELS_HIP_HPP
--- a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
@@ -29,6 +29,7 @@
 #include <migraphx/kernels/integral_constant.hpp>
 #include <migraphx/kernels/type_traits.hpp>
 #include <migraphx/kernels/debug.hpp>
+#include <migraphx/kernels/functional.hpp>
 namespace migraphx {
@@ -135,42 +136,100 @@ struct index
        return (n - _c<1>) / stride + _c<1>;
    }
+    // Iteration bound for a stride loop over `n` elements whose step is
+    // `nglobal()`; forwards to max_stride_iterations().
+    template <class N>
+    constexpr auto max_global_stride_iterations(N n) const
+    {
+        return max_stride_iterations(n, nglobal());
+    }
+    // Iteration bound for a stride loop over `n` elements whose step is
+    // `nlocal()`; forwards to max_stride_iterations().
+    template <class N>
+    constexpr auto max_local_stride_iterations(N n) const
+    {
+        return max_stride_iterations(n, nlocal());
+    }
+    // Calls `f` for one loop step. This overload only takes part in overload
+    // resolution when `f(i, d)` is well-formed, i.e. for callables that want the
+    // iteration counter `d` in addition to the element index `i`.
+    template <class F, class I, class D>
+    static constexpr auto invoke_loop(F f, I i, D d) -> decltype(f(i, d))
+    {
+        return f(i, d);
+    }
+    // Overload for callables that only accept the element index; `d` is dropped.
+    // NOTE(review): a callable that accepts both `f(i)` and `f(i, d)` makes both
+    // overloads viable, so such a call would presumably be ambiguous - confirm.
+    template <class F, class I, class D>
+    static constexpr auto invoke_loop(F f, I i, D) -> decltype(f(i))
+    {
+        return f(i);
+    }
+    // Stride loop written without a runtime `for`: the step indices `ks` are
+    // produced by sequence(max_stride_iterations(n, stride), ...) and visited
+    // through fold(). For every step `k` the element index is
+    // `start + stride * k`; `f` is only invoked when that index is below `n`.
+    // The second value handed to `f` is a counter that starts at `_c<0>` and is
+    // advanced by `_c<1>` on every step, whether or not `f` was invoked.
+    // NOTE(review): assumes sequence() yields the constants 0..count-1 and that
+    // fold() threads the returned counter into the next step - confirm in
+    // functional.hpp.
+    template <class F, class N, class Stride>
+    static constexpr void for_stride_loop_unroll(index_int start, N n, Stride stride, F f)
+    {
+        sequence(max_stride_iterations(n, stride), [&](auto... ks) {
+            fold([&](auto d, auto k) {
+                auto i = start + stride * k;
+                if(i < n)
+                    invoke_loop(f, i, d);
+                return d + _c<1>;
+            })(_c<0>, ks...);
+        });
+    }
    template <class F, class N, class Stride>
+    static constexpr void for_stride_loop(index_int start, N n, Stride stride, F f)
+    {
+        index_int k = 0;
+        for(index_int i = start; i < n; i += stride)
+        {
+            invoke_loop(f, i, k);
+            k++;
+        }
+    }
+    template <bool Unroll, class F, class N, class Stride>
    static constexpr void for_stride(index_int start, N n, Stride stride, F f)
    {
        MIGRAPHX_ASSERT(start < stride);
-        if constexpr(not is_integral<N>{} and not is_integral<Stride>{} and
+        if constexpr(not is_integral<N>{} and not is_integral<Stride>{})
-                     max_stride_iterations(n, stride) == 1)
+        {
+            if constexpr(max_stride_iterations(n, stride) == 1)
            {
                if constexpr(stride > n)
                {
                    if(start < n)
-                    f(start);
+                        invoke_loop(f, start, _c<0>);
                }
                else
                {
-                f(start);
+                    invoke_loop(f, start, _c<0>);
                }
            }
-        else
+            else if constexpr(Unroll)
            {
-            for(index_int i = start; i < n; i += stride)
+                MIGRAPHX_STATIC_ASSERT_FOR(max_stride_iterations(n, stride) < 256)
+                {
+                    for_stride_loop_unroll(start, n, stride, f);
+                }
+            }
+            else
            {
-                f(i);
+                for_stride_loop(start, n, stride, f);
+            }
        }
+        else
+        {
+            for_stride_loop(start, n, stride, f);
        }
    }
    template <class F, class N>
    __device__ void global_stride(N n, F f) const
    {
-        for_stride(global, n, nglobal(), f);
+        for_stride<false>(global, n, nglobal(), f);
    }
    template <class F, class N>
    __device__ void local_stride(N n, F f) const
    {
-        for_stride(local, n, nlocal(), f);
+        for_stride<true>(local, n, nlocal(), f);
    }
 };

--- a/src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp
@@ -46,28 +46,27 @@ template <index_int Axis,
 __device__ void generic_binary_layernorm(
    F compute, BinOp op, float eps, Output output, Input1 input1, Input2 input2, Inputs... inputs)
 {
+    using block         = reduce::auto_block<reduce::reduce_elements_with_axis<Input1, Axis>()>;
    using reduce_output = reduce::with_axis<Input1, Axis>;
-    reduce::block::run<reduce_output>([&](auto, auto r) {
+    block::template run<reduce_output>([&](auto, auto r) {
+        auto input       = r.inner([&](auto x1, auto x2) { return op(x1, x2); })(input1, input2);
        using value_type = typename Input1::type;
        constexpr auto relements = r.template elements<Input1>();
-        auto means =
+        auto means = r.reduce(op::sum{}, make_array<vec_type<value_type>>(0, 0), [&](auto x) {
-            r.reduce(op::sum{}, make_array<vec_type<value_type>>(0, 0), [&](auto x1, auto x2) {
-                auto x = op(x1, x2);
            return make_array(x, x * x) * vec_type<value_type>{1.0 / relements};
-            })(input1, input2);
+        })(input);
        auto mean_x        = means[0];
        auto mean_x2       = means[1];
        auto variance      = mean_x2 - (mean_x * mean_x);
        value_type eps_val = eps; // implicit conversion for eps
-        r.inner([&](auto& y, auto x1, auto x2, auto... xs) {
+        r.inner([&](auto& y, auto x, auto... xs) {
-            auto x = op(x1, x2);
            auto m = x - mean_x;
            // m * rsqrt(mean(m ^ 2) + epsilon)
            y = compute(m * rsqrt(variance + eps_val), xs...);
-        })(output, input1, input2, inputs...);
+        })(output, input, inputs...);
    });
 }

--- a/src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/math.hpp
@@ -28,8 +28,7 @@
 #include <migraphx/kernels/vec.hpp>
 #include <migraphx/kernels/functional.hpp>
 #include <migraphx/kernels/type_traits.hpp>
-#include <hip/hip_fp16.h>
+#include <migraphx/kernels/hip.hpp>
-#include <hip/math_functions.h>
 namespace migraphx {
@@ -222,7 +221,7 @@ constexpr auto min(const T& a, const U& b)
 template <class T, MIGRAPHX_REQUIRES(is_same<vec_type<T>, half>{})>
 constexpr T sin(T x)
 {
-    constexpr const T shift = M_PI_2;
+    constexpr const T shift = HIP_PIO2_F;
    return migraphx::cos(shift - x);
 }

--- a/src/targets/gpu/kernels/include/migraphx/kernels/ops.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/ops.hpp
@@ -56,13 +56,32 @@ struct id
    }
 };
+// Function object that returns its argument converted with `convert<T>`.
+template <class T>
+struct convert_to
+{
+    template <class U>
+    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(U x) const
+    {
+        return convert<T>(x);
+    }
+};
+template <index_int N>
 struct mean
 {
-    index_int item_num = 1;
    template <class T>
-    MIGRAPHX_DEVICE_CONSTEXPR auto operator()(T x) const
+    MIGRAPHX_DEVICE_CONSTEXPR T operator()(T x) const
+    {
+        using type = vec_type<T>;
+        if constexpr(is_floating_point<type>{})
        {
-        return x / static_cast<T>(item_num);
+            constexpr type d = 1.0 / N;
+            return x * d;
+        }
+        else
+        {
+            return x / static_cast<type>(N);
+        }
    }
 };

--- a/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/reduce.hpp
@@ -103,10 +103,10 @@ __device__ auto block_reduce(index idx, Op op, T init, Index n, F f)
 #else
    constexpr index_int lanes_per_thread = 64;
 #endif
-    using type = decltype(f(0));
+    using type = decltype(index::invoke_loop(f, 0, _c<0>));
    __shared__ type buffer[idx.max_nlocal() / lanes_per_thread];
    type x = init;
-    idx.local_stride(n, [&](auto i) { x = op(x, f(i)); });
+    idx.local_stride(n, [&](auto i, auto d) { x = op(x, index::invoke_loop(f, i, d)); });
    dpp_reduce(x, op);
    const auto ldsidx = idx.local / lanes_per_thread;
@@ -128,10 +128,10 @@ template <class Op, class T, class Index, class F>
 __device__ auto block_reduce(index idx, Op op, T init, Index n, F f)
 {
    MIGRAPHX_ASSERT(idx.max_nlocal() == idx.nlocal());
-    using type = decltype(f(0));
+    using type = decltype(index::invoke_loop(f, 0, _c<0>));
    __shared__ type buffer[idx.max_nlocal()];
    type x = init;
-    idx.local_stride(n, [&](auto i) { x = op(x, f(i)); });
+    idx.local_stride(n, [&](auto i, auto d) { x = op(x, index::invoke_loop(f, i, d)); });
    buffer[idx.local] = x;
    __syncthreads();
@@ -167,6 +167,25 @@ constexpr auto reduce_slice(Input input, T i)
 namespace reduce {
+// Empty marker base; types deriving from it are recognised by is_inner_storage.
+struct inner_storage_tag
+{
+};
+// True when T, with references and cv-qualifiers removed, derives from
+// inner_storage_tag.
+template <class T>
+using is_inner_storage = is_base_of<inner_storage_tag, remove_cv_t<remove_reference_t<T>>>;
+// Callable `F` that additionally publishes `R` as its nested `type`.
+template <class R, class F>
+struct storage_access : F
+{
+    using type = R;
+};
+// Builds a storage_access<R, F> from `f`; `R` has to be given explicitly.
+template <class R, class F>
+constexpr storage_access<R, F> make_storage_access(F f)
+{
+    return {{f}};
+}
 template <class Slicer, class F>
 constexpr auto sliced(Slicer slicer, F f)
 {
@@ -191,42 +210,100 @@ constexpr auto compute_reduce_axis()
 template <class Input, index_int Axis>
 using with_axis = decltype(compute_reduce_axis<Input, Axis>());
-struct block
+template <class Derived>
+struct reducer_base
 {
-    template <class Slicer>
+    template <class T>
-    struct reducer
+    __device__ auto make_inner_slice(T x) const
    {
-        index idx;
+        if constexpr(is_inner_storage<T>{})
-        Slicer slice;
-        template <class Op, class T, class Read>
-        __device__ auto reduce(Op op, T init, Read read) const
        {
-            return sliced(slice, [=](auto x, auto... xs) {
+            return x;
-                return block_reduce(idx, op, init, x.get_shape().elements(), [&](auto j) {
+        }
-                    return vec_reduce(read(x[j], xs[j]...), op);
+        else
-                });
+        {
+            auto&& derived = static_cast<const Derived&>(*this);
+            auto t         = derived.slice(x);
+            return make_storage_access<typename decltype(t)::type>([=](auto i, auto...) -> auto& {
+                return t[i];
            });
        }
+    }
+    // Size shared by several inner arguments: returns the size of the first one
+    // and asserts (through MIGRAPHX_ASSERT) that the remaining ones agree.
+    template <class T, class... Ts>
+    constexpr auto get_size(T&& x, [[maybe_unused]] Ts&&... xs) const
+    {
+        MIGRAPHX_ASSERT(get_size(x) == get_size(xs...));
+        return get_size(x);
+    }
+    // Size of a single inner argument: an inner storage reports it via rsize(),
+    // anything else is sliced with the derived reducer's `slice` and asked for
+    // size().
+    template <class T, class... Ts>
+    constexpr auto get_size(T&& x) const
+    {
+        if constexpr(is_inner_storage<T>{})
+        {
+            return x.rsize();
+        }
+        else
+        {
+            auto&& derived = static_cast<const Derived&>(*this);
+            auto t         = derived.slice(x);
+            return t.size();
+        }
+    }
    template <class F>
-        __device__ void outer(F f) const
+    __device__ auto inner_sliced(F f) const
    {
-            if(idx.local == 0)
+        return [=](auto&&... xs) { return f(get_size(xs...), make_inner_slice(xs)...); };
-                f();
    }
+    template <class T>
+    static __device__ typename T::type& decl_inner_storage(const T&);
    template <class F>
    __device__ auto inner(F f) const
    {
-            return sliced(slice, [=](auto x, auto... xs) {
+        return this->inner_sliced([=](auto n, auto&&... xs) {
-                idx.local_stride(x.get_shape().elements(), [&](auto j) { f(x[j], xs[j]...); });
+            using result_type = decltype(f(decl_inner_storage(xs)...));
+            auto&& derived    = static_cast<const Derived&>(*this);
+            if constexpr(is_void<result_type>{})
+            {
+                derived.inner_void_impl(f, n, xs...);
+            }
+            else
+            {
+                return derived.template inner_impl<result_type>(f, n, xs...);
+            }
        });
    }
+    // Returns a callable taking the inputs to reduce. The inputs are prepared by
+    // inner_sliced() and the work is delegated to the derived reducer's
+    // reduce_impl(op, init, read, n, xs...).
+    template <class Op, class T, class Read>
+    __device__ auto reduce(Op op, T init, Read read) const
+    {
+        return this->inner_sliced([=](auto n, auto&&... xs) {
+            auto&& derived = static_cast<const Derived&>(*this);
+            return derived.reduce_impl(op, init, read, n, xs...);
+        });
+    }
+    // Same as above with op::id{} as the read function.
+    template <class Op, class T>
+    __device__ auto reduce(Op op, T init) const
+    {
+        return this->reduce(op, init, op::id{});
+    }
+    // Base version simply calls `f`; the reducers in this file declare their own
+    // outer().
+    template <class F>
+    __device__ void outer(F f) const
+    {
+        f();
+    }
    template <class Input>
    constexpr auto elements() const
    {
-            using reduce_type        = decltype(slice(Input{}));
+        auto&& derived           = static_cast<const Derived&>(*this);
+        using reduce_type        = decltype(derived.slice(Input{}));
        using value_type         = typename Input::type;
        constexpr auto relements = get_shape_c<reduce_type>{}.elements();
        if constexpr(vec_size<value_type>() > 1)
@@ -234,12 +311,69 @@ struct block
        else
            return relements;
    }
+};
+struct block
+{
+    template <class Slicer>
+    struct reducer : reducer_base<reducer<Slicer>>
+    {
+        index idx;
+        Slicer slice;
+        // Inner storage backed by an array of N values of type T. Elements are
+        // addressed by the second call argument (the iteration counter); the
+        // first argument is ignored. rsize() returns a value-initialised Size.
+        template <class T, index_int N, class Size>
+        struct inner_storage : inner_storage_tag
+        {
+            using type = T;
+            array<T, N> arr;
+            constexpr Size rsize() const { return {}; }
+            template <class U, class V>
+            constexpr auto& operator()(U, V d) const
+            {
+                return arr[d];
+            }
+            template <class U, class V>
+            constexpr auto& operator()(U, V d)
+            {
+                return arr[d];
+            }
+        };
+        template <class Op, class T, class Read, class N, class... Ts>
+        __device__ auto reduce_impl(Op op, T init, Read read, N n, Ts&&... xs) const
+        {
+            return block_reduce(idx, op, init, n, [&](auto j, auto d) {
+                return vec_reduce(read(xs(j, d)...), op);
+            });
+        }
+        template <class F>
+        __device__ void outer(F f) const
+        {
+            if(idx.local == 0)
+                f();
+        }
+        template <class F, class N, class... Ts>
+        __device__ void inner_void_impl(F f, N n, Ts&&... xs) const
+        {
+            idx.local_stride(n, [&](auto j, auto d) { f(xs(j, d)...); });
+        }
+        // Evaluates `f` over the inputs with idx.local_stride() and keeps the
+        // results in an inner_storage, one slot per iteration counter `d`. The
+        // slot count is taken from the type of
+        // idx.max_local_stride_iterations(n).
+        // NOTE(review): this presumably requires `n` to be a compile-time
+        // constant so that `max_iterations{}` is the real bound - confirm.
+        template <class R, class F, class N, class... Ts>
+        __device__ auto inner_impl(F f, N n, Ts&&... xs) const
+        {
+            using max_iterations = decltype(idx.max_local_stride_iterations(n));
+            inner_storage<R, max_iterations{}, N> storage;
+            idx.local_stride(n, [&](auto j, auto d) { storage(j, d) = f(xs(j, d)...); });
+            return storage;
+        }
    };
    template <class Slicer>
    static __device__ auto make(index idx, Slicer slicer)
    {
-        return reducer<Slicer>{idx, slicer};
+        return reducer<Slicer>{{}, idx, slicer};
    }
    template <class Output, class F>
@@ -254,56 +388,143 @@ struct block
    }
 };
-struct lane
+struct block_large
 {
    template <class Slicer>
-    struct reducer
+    struct reducer : reducer_base<reducer<Slicer>>
    {
        index idx;
        Slicer slice;
-        template <class Op, class T, class Read>
-        __device__ auto reduce(Op op, T init, Read read) const
+        template <class Size, class F>
+        struct inner_storage : inner_storage_tag
        {
-            return sliced(slice, [=](auto x, auto... xs) {
+            using type = remove_reference_t<decltype(declval<F>()(0, _c<0>))>;
-                using type = typename decltype(x)::type;
+            F f;
-                type r     = init;
+            constexpr Size rsize() const { return {}; }
-                for(index_int j = 0; j < x.get_shape().elements(); j++)
+            template <class U, class V>
+            constexpr auto operator()(U j, V d) const
            {
-                    r = op(r, read(x[j], xs[j]...));
+                return f(j, d);
            }
-                return r;
+        };
+        // Wraps `f` in an inner_storage whose size type is `Size`.
+        // Declared static because it reads no reducer state and is called from
+        // the const-qualified inner_impl(), which may not call a non-const
+        // member. The leading `{}` initialises the empty inner_storage_tag base,
+        // in the same way make() spells out the reducer_base base.
+        template <class Size, class F>
+        static constexpr inner_storage<Size, F> make_inner_storage(Size, F f)
+        {
+            return {{}, f};
+        }
+        template <class Op, class T, class Read, class N, class... Ts>
+        __device__ auto reduce_impl(Op op, T init, Read read, N n, Ts&&... xs) const
+        {
+            return block_reduce(idx, op, init, index_int{n}, [&](auto j, auto d) {
+                return vec_reduce(read(xs(j, d)...), op);
            });
        }
        template <class F>
        __device__ void outer(F f) const
        {
+            if(idx.local == 0)
                f();
        }
-        template <class F>
+        template <class F, class N, class... Ts>
-        __device__ auto inner(F f) const
+        __device__ void inner_void_impl(F f, N n, Ts&&... xs) const
        {
-            return sliced(slice, [=](auto x, auto... xs) {
+            idx.local_stride(index_int{n}, [&](auto j, auto d) { f(xs(j, d)...); });
-                for(index_int j = 0; j < x.get_shape().elements(); j++)
+        }
+        template <class R, class F, class N, class... Ts>
+        __device__ auto inner_impl(F f, N n, Ts&&... xs) const
+        {
+            return make_inner_storage(n, [=](auto j, auto d) { return f(xs(j, d)...); });
+        }
+    };
+    template <class Slicer>
+    static __device__ auto make(index idx, Slicer slicer)
    {
-                    f(x[j], xs[j]...);
+        return reducer<Slicer>{{}, idx, slicer};
    }
+    template <class Output, class F>
+    static __device__ void run(F f)
+    {
+        auto idx                 = make_index();
+        constexpr auto nelements = get_shape_c<Output>{}.elements();
+        idx.global_stride(nelements * idx.nlocal(), [&](auto i) {
+            const auto out_idx = get_shape_c<Output>{}.multi(i / idx.nlocal());
+            f(out_idx, make(idx, [&](auto input) { return reduce_slice<Output>(input, out_idx); }));
        });
    }
+};
-        template <class Input>
+struct lane
-        constexpr auto elements() const
+{
+    template <class Slicer>
+    struct reducer : reducer_base<reducer<Slicer>>
    {
-            using reduce_type = decltype(slice(Input{}));
+        index idx;
-            return get_shape_c<reduce_type>{}.elements();
+        Slicer slice;
+        template <class Size, class F>
+        struct inner_storage : inner_storage_tag
+        {
+            using type = remove_reference_t<decltype(declval<F>()(0, _c<0>))>;
+            F f;
+            constexpr Size rsize() const { return {}; }
+            template <class U, class V>
+            constexpr auto operator()(U j, V d) const
+            {
+                return f(j, d);
            }
        };
+        // Wraps `f` in an inner_storage whose size type is `Size`.
+        // Declared static because it reads no reducer state and is called from
+        // the const-qualified inner_impl(), which may not call a non-const
+        // member. The leading `{}` initialises the empty inner_storage_tag base,
+        // in the same way make() spells out the reducer_base base.
+        template <class Size, class F>
+        static constexpr inner_storage<Size, F> make_inner_storage(Size, F f)
+        {
+            return {{}, f};
+        }
+        // Reduction done by a plain loop over j = 0..n-1: the accumulator starts
+        // at `init` and is combined with read(x(j, _c<0>), xs(j, _c<0>)...) via
+        // `op`. The accumulator type is what the first input yields when called
+        // with (0, _c<0>), with any reference removed.
+        template <class Op, class T, class Read, class N, class U, class... Us>
+        __device__ auto reduce_impl(Op op, T init, Read read, N n, U&& x, Us&&... xs) const
+        {
+            using type = remove_reference_t<decltype(x(0, _c<0>))>;
+            type r     = init;
+            for(index_int j = 0; j < n; j++)
+            {
+                r = op(r, read(x(j, _c<0>), xs(j, _c<0>)...));
+            }
+            return r;
+        }
+        template <class F>
+        __device__ void outer(F f) const
+        {
+            f();
+        }
+        template <class F, class N, class... Ts>
+        __device__ void inner_void_impl(F f, N n, Ts&&... xs) const
+        {
+            for(index_int j = 0; j < n; j++)
+            {
+                f(xs(j, _c<0>)...);
+            }
+        }
+        template <class R, class F, class N, class... Ts>
+        __device__ auto inner_impl(F f, N n, Ts&&... xs) const
+        {
+            return make_inner_storage(n, [=](auto j, auto d) { return f(xs(j, d)...); });
+        }
+    };
    template <class Slicer>
    static __device__ auto make(index idx, Slicer slicer)
    {
-        return reducer<Slicer>{idx, slicer};
+        return reducer<Slicer>{{}, idx, slicer};
    }
    template <class Output, class F>
@@ -318,6 +539,26 @@ struct lane
    }
 };
+// TODO: Remove these in the future when they can be selected in the compiler class
+// Chooses the reducer for `RElements` reduction elements: `block` while
+// RElements is below max_nlocal() * 256, `block_large` otherwise.
+// NOTE(review): the 256 presumably mirrors the unroll limit asserted in
+// index::for_stride - confirm.
+template <index_int RElements>
+constexpr auto pick_block()
+{
+    using nlocal = decltype(index{}.max_nlocal());
+    if constexpr(RElements < nlocal{} * 256)
+        return block{};
+    else
+        return block_large{};
+}
+// Reducer type chosen by pick_block() for `RElements` reduction elements.
+template <index_int RElements>
+using auto_block = decltype(pick_block<RElements>());
+// Length of dimension `Axis` of Input's compile-time shape.
+template <class Input, index_int Axis>
+constexpr auto reduce_elements_with_axis()
+{
+    constexpr auto s = get_shape_c<Input>{};
+    return s.lens[Axis];
+}
 } // namespace reduce
 template <class Algo,

--- a/src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp
@@ -76,14 +76,6 @@ struct shape
    constexpr index_int index(index_array x) const { return x.dot(strides); }
-    constexpr index_int index(std::initializer_list<index_int> x) const
-    {
-        index_int idx = 0;
-        for(index_int i = 0; i < x.size(); i++)
-            idx += *(x.begin() + i) * strides[i];
-        return idx;
-    }
    constexpr index_int index(index_int i) const
    {
        if(this->standard())

--- a/src/targets/gpu/kernels/include/migraphx/kernels/softmax.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/softmax.hpp
@@ -30,18 +30,20 @@
 namespace migraphx {
 template <index_int Axis, class Input, class Output>
-__device__ void softmax(Input input, Output output)
+__device__ void softmax(Input input1, Output output)
 {
-    reduce::block::run<reduce::with_axis<Input, Axis>>([&](auto, auto r) {
+    using block = reduce::auto_block<reduce::reduce_elements_with_axis<Input, Axis>()>;
+    block::template run<reduce::with_axis<Input, Axis>>([&](auto, auto r) {
+        auto input = r.inner(op::id{})(input1);
 #ifdef MIGRAPHX_USE_FAST_SOFTMAX
-        const auto c = vec_at(r.slice(input)[0], 0);
+        const auto c = vec_at(r.slice(input1)[0], 0);
 #else
        const auto c = r.reduce(op::max{}, lowest{}, op::id{})(input);
 #endif
-        auto batch_sum = r.reduce(op::sum{}, 0, [&](auto x) {
+        auto exp_in = r.inner([&](auto x) { return migraphx::exp(x - c); })(input);
-            return migraphx::convert<float>(migraphx::exp(x - c));
+        auto batch_sum =
-        })(input);
+            r.reduce(op::sum{}, 0, [](auto x) { return migraphx::convert<float>(x); })(exp_in);
-        r.inner([&](auto& y, auto x) { y = migraphx::exp(x - c) / batch_sum; })(output, input);
+        r.inner([&](auto& y, auto x) { y = x / batch_sum; })(output, exp_in);
    });
 }

--- a/src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp
@@ -141,6 +141,25 @@ MIGRAPHX_BUILTIN_TYPE_TRAITN(is_constructible);
 MIGRAPHX_BUILTIN_TYPE_TRAITN(is_nothrow_constructible);
 MIGRAPHX_BUILTIN_TYPE_TRAITN(is_trivially_constructible);
+// Strips top-level `const` and/or `volatile` from T; the result is the nested
+// `type` (or remove_cv_t<T>).
+template <class T>
+struct remove_cv
+{
+    using type = T;
+};
+template <class T>
+struct remove_cv<const T> : remove_cv<T>
+{
+};
+template <class T>
+struct remove_cv<volatile T> : remove_cv<T>
+{
+};
+// `const volatile T` matches both specialisations above and neither is more
+// specialised than the other, so this one is needed to keep
+// remove_cv<const volatile T> from being ambiguous.
+template <class T>
+struct remove_cv<const volatile T> : remove_cv<T>
+{
+};
+template <class T>
+using remove_cv_t = typename remove_cv<T>::type;
 template <class T>
 struct remove_reference
 {
@@ -168,6 +187,11 @@ struct add_pointer : type_identity<typename remove_reference<T>::type*>
 template <class T>
 using add_pointer_t = typename add_pointer<T>::type;
+// True when T is `void` once cv-qualifiers have been removed with remove_cv_t.
+template <class T>
+struct is_void : is_same<void, remove_cv_t<T>>
+{
+};
 template <class... Ts>
 struct common_type;

--- a/src/targets/gpu/kernels/include/migraphx/kernels/types.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/types.hpp
@@ -28,8 +28,45 @@
 namespace migraphx {
-using index_int = std::uint32_t;
+#if defined(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS) and defined(MIGRAPHX_USE_HIPRTC)
-using diff_int  = std::int32_t;
+using int8_t   = signed char;
+using uint8_t  = unsigned char;
+using int16_t  = signed short;
+using uint16_t = unsigned short;
+using int32_t  = signed int;
+using uint32_t = unsigned int;
+using int64_t  = signed long long;
+using uint64_t = unsigned long long;
+#elif defined(MIGRAPHX_USE_HIPRTC)
+using int8_t   = __hip_int8_t;
+using uint8_t  = __hip_uint8_t;
+using int16_t  = __hip_int16_t;
+using uint16_t = __hip_uint16_t;
+using int32_t  = __hip_int32_t;
+using uint32_t = __hip_uint32_t;
+using int64_t  = __hip_int64_t;
+using uint64_t = __hip_uint64_t;
+#else
+using int8_t   = std::int8_t;
+using uint8_t  = std::uint8_t;
+using int16_t  = std::int16_t;
+using uint16_t = std::uint16_t;
+using int32_t  = std::int32_t;
+using uint32_t = std::uint32_t;
+using int64_t  = std::int64_t;
+using uint64_t = std::uint64_t;
+#endif // MIGRAPHX_USE_HIPRTC
+using index_int = uint32_t;
+using diff_int  = int32_t;
+static_assert(sizeof(int8_t) == 1, "int8_t must be 1 bytes");
+static_assert(sizeof(uint8_t) == 1, "uint8_t must be 1 bytes");
+static_assert(sizeof(int16_t) == 2, "int16_t must be 2 bytes");
+static_assert(sizeof(uint16_t) == 2, "uint16_t must be 2 bytes");
+static_assert(sizeof(int32_t) == 4, "int32_t must be 4 bytes");
+static_assert(sizeof(uint32_t) == 4, "uint32_t must be 4 bytes");
+static_assert(sizeof(int64_t) == 8, "int64_t must be 8 bytes");
+static_assert(sizeof(uint64_t) == 8, "uint64_t must be 8 bytes");
 #define MIGRAPHX_DEVICE_CONSTEXPR constexpr __device__ __host__ // NOLINT

--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -83,7 +83,6 @@ struct miopen_apply
        auto& ctx      = get_context();
        int8_x4_format = get_int8_x4_format(ctx);
        compute_fp32   = get_compute_fp32_flag();
        offload_copy   = (mod->name() == "main") ? pass->offload_copy : false;
        add_generic_op("contiguous");
@@ -112,6 +111,7 @@ struct miopen_apply
        add_loop_op();
        add_neg_op();
        add_nms_op();
+        add_select_module_op();
    }
    void copy_params() const
@@ -359,6 +359,33 @@ struct miopen_apply
            return mod->replace_instruction(ins, gpu_out);
        });
    }
+    /**
+     * Registers the lowering for select_module.
+     * Turns on use_local_alloc in the select_module submodules.
+     * Changes the submodule returns to a hip::sync_stream: every value returned
+     * by a submodule is routed through its own hip::sync_stream instruction.
+     * The select_module instruction itself is returned unchanged.
+     */
+    void add_select_module_op()
+    {
+        apply_map.emplace("select_module", [=](instruction_ref ins) {
+            auto mod_args = ins->module_inputs();
+            for(auto* smod : mod_args)
+            {
+                smod->use_local_alloc = true;
+                // std::prev(end()) is only valid when the submodule has instructions.
+                if(smod->begin() == smod->end())
+                    continue;
+                auto last_ins = std::prev(smod->end());
+                if(last_ins->name() != "@return")
+                    continue;
+                // Work on a copy of the outputs: replace_return() rewrites the
+                // arguments of the @return instruction, so they must not be
+                // iterated in place while it is being replaced.
+                const std::vector<instruction_ref> outputs = last_ins->inputs();
+                if(outputs.empty())
+                    continue;
+                std::vector<instruction_ref> sync_outs;
+                sync_outs.reserve(outputs.size());
+                for(auto out_ins : outputs)
+                {
+                    sync_outs.push_back(smod->insert_instruction(
+                        last_ins, make_op("hip::sync_stream"), out_ins));
+                }
+                // Replace the return once, so that all outputs are kept instead
+                // of only the last one.
+                smod->replace_return(sync_outs);
+            }
+            return ins;
+        });
+    }
 };
 void lowering::apply(module& m) const { miopen_apply{&m, this}.apply(); }

--- a/src/targets/gpu/prefuse_ops.cpp
+++ b/src/targets/gpu/prefuse_ops.cpp
@@ -26,6 +26,8 @@
 #include <migraphx/check_shapes.hpp>
 #include <migraphx/make_op.hpp>
 #include <migraphx/register_op.hpp>
+#include <migraphx/pass_manager.hpp>
+#include <migraphx/dead_code_elimination.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -90,7 +92,9 @@ struct find_layernorm
    {
        auto ins   = r.result;
        auto x_ins = r.instructions["x"];
-        auto eps   = r.instructions["eps"]->eval().at<float>();
+        float eps  = 0;
+        if(contains(r.instructions, "eps"))
+            eps = r.instructions["eps"]->eval().at<float>();
        m.replace_instruction(ins, layernorm{eps}, x_ins);
    }
@@ -100,23 +104,26 @@ struct find_add_layernorm
 {
    auto matcher() const
    {
-        return match::layernorm()(match::var("x")(match::name("add").bind("add")));
+        return match::name("gpu::prelayernorm")(
+            match::args(match::name("add")(match::used_once()).bind("add")));
    }
    void apply(module& m, const match::matcher_result& r) const
    {
        auto ins     = r.result;
        auto add_ins = r.instructions["add"];
-        auto eps     = r.instructions["eps"]->eval().at<float>();
+        auto op      = any_cast<layernorm>(ins->get_operator());
-        m.replace_instruction(ins, add_layernorm{eps}, add_ins->inputs());
+        m.replace_instruction(ins, add_layernorm{op.epsilon}, add_ins->inputs());
    }
 };
 } // namespace
-void prefuse_ops::apply(module& m) const
+void prefuse_ops::apply(module_pass_manager& mpm) const
 {
-    match::find_matches(m, find_add_layernorm{}, find_layernorm{});
+    match::find_matches(mpm.get_module(), find_layernorm{});
+    mpm.run_pass(dead_code_elimination{});
+    match::find_matches(mpm.get_module(), find_add_layernorm{});
 }
 } // namespace gpu

--- a/src/targets/gpu/target.cpp
+++ b/src/targets/gpu/target.cpp
@@ -38,6 +38,7 @@
 #include <migraphx/layout_nhwc.hpp>
 #include <migraphx/memory_coloring.hpp>
 #include <migraphx/normalize_ops.hpp>
+#include <migraphx/optimize_module.hpp>
 #include <migraphx/preallocate_param.hpp>
 #include <migraphx/propagate_constant.hpp>
 #include <migraphx/register_target.hpp>
@@ -50,6 +51,7 @@
 #include <migraphx/simplify_algebra.hpp>
 #include <migraphx/simplify_qdq.hpp>
 #include <migraphx/simplify_reshapes.hpp>
+#include <migraphx/split_single_dyn_dim.hpp>
 #include <migraphx/gpu/allocation_model.hpp>
 #include <migraphx/gpu/compile_miopen.hpp>
 #include <migraphx/gpu/compile_ops.hpp>
@@ -90,6 +92,7 @@ pass enable_pass(bool enabled, pass p)
 std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_options& options) const
 {
    auto& ctx = any_cast<context>(gctx);
+    ctx.set_exhaustive_tune_flag(options.exhaustive_tune);
    std::set<shape::type_t> unsupported_types(shape::types().begin(), shape::types().end());
    unsupported_types.erase(shape::type_t::float_type);
    unsupported_types.erase(shape::type_t::half_type);
@@ -100,6 +103,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
    // clang-format off
    return
    {
+        split_single_dyn_dim{},
+        dead_code_elimination{},
        normalize_ops{},
        dead_code_elimination{},
        simplify_qdq{},
@@ -118,21 +123,13 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
        rewrite_pooling{},
        dead_code_elimination{},
        rewrite_gelu{},
-        dead_code_elimination{},
+        optimize_module{},
-        eliminate_common_subexpression{},
-        dead_code_elimination{},
-        simplify_algebra{},
-        simplify_reshapes{},
        enable_pass(enabled(MIGRAPHX_ENABLE_NHWC{}), layout_nhwc{}),
        dead_code_elimination{},
-        simplify_reshapes{},
-        simplify_algebra{},
        prefuse_ops{},
        dead_code_elimination{},
        auto_contiguous{},
-        simplify_reshapes{},
+        optimize_module{},
-        propagate_constant{},
-        dead_code_elimination{},
        enable_pass(not enabled(MIGRAPHX_DISABLE_POINTWISE_FUSION{}), fuse_pointwise{}),
        dead_code_elimination{},
        fuse_mlir{&ctx},

--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
-#####################################################################################
+# ####################################################################################
 # The MIT License (MIT)
 #
 # Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
@@ -20,7 +20,7 @@
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
-#####################################################################################
+# ####################################################################################
 cmake_policy(SET CMP0057 NEW)
@@ -49,9 +49,11 @@ function(add_test_command NAME EXE)
        set_tests_properties(${NAME} PROPERTIES DISABLED On)
    elseif(WIN32)
        set(WINPATH)
        foreach(PATH ${CMAKE_FIND_ROOT_PATH})
            list(APPEND WINPATH ${PATH}/bin)
        endforeach()
        file(GENERATE OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/test_${NAME}.cmd"
            CONTENT "set PATH=${WINPATH};%PATH%
                    %1 ${ARGN}")
@@ -67,9 +69,11 @@ function(add_test_command NAME EXE)
            # --args $<TARGET_FILE:${EXE}> ${ARGN})
            set(TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/gdb/test_${NAME})
            file(MAKE_DIRECTORY ${TEST_DIR})
-            if (NOT EXISTS ${TEST_DIR})
+            if(NOT EXISTS ${TEST_DIR})
                message(FATAL_ERROR "Failed to create test directory: ${TEST_DIR}")
            endif()
            file(GENERATE OUTPUT "${TEST_DIR}/run.cmake"
                CONTENT "
                # Remove previous core dump
@@ -90,22 +94,27 @@ function(add_test_command NAME EXE)
            add_test(NAME ${NAME} COMMAND ${EXE} ${ARGN})
        endif()
    endif()
    set_tests_properties(${NAME} PROPERTIES FAIL_REGULAR_EXPRESSION "FAILED")
 endfunction()
 function(add_test_executable TEST_NAME)
-    add_executable (${TEST_NAME} EXCLUDE_FROM_ALL ${ARGN})
+    add_executable(${TEST_NAME} EXCLUDE_FROM_ALL ${ARGN})
    target_link_libraries(${TEST_NAME} ${CMAKE_THREAD_LIBS_INIT})
    # Cmake does not add flags correctly for gcc
    if(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
        set_target_properties(${TEST_NAME} PROPERTIES COMPILE_FLAGS -pthread LINK_FLAGS -pthread)
    endif()
    separate_arguments(MIOPEN_TEST_FLAGS_ARGS UNIX_COMMAND ${MIOPEN_TEST_FLAGS})
    if(MIOPEN_TEST_ALL)
        set(TEST_COMMAND ${TEST_NAME} ${MIOPEN_TEST_FLOAT_ARG} --all ${MIOPEN_TEST_FLAGS_ARGS})
    else()
        set(TEST_COMMAND ${TEST_NAME} ${MIOPEN_TEST_FLOAT_ARG} ${MIOPEN_TEST_FLAGS_ARGS})
    endif()
    add_test_command(${TEST_NAME} ${TEST_COMMAND})
    add_dependencies(tests ${TEST_NAME})
    add_dependencies(check ${TEST_NAME})
@@ -133,7 +142,7 @@ if(MIGRAPHX_ENABLE_GPU)
            COST 10
            RESOURCE_LOCK gpu
        )
-        target_link_libraries(test_gpu_${BASE_NAME} migraphx_gpu)
+        target_link_libraries(test_gpu_${BASE_NAME} migraphx_gpu migraphx_kernels)
    endforeach()
 endif()
@@ -155,7 +164,8 @@ endif()
 # Onnx test
 set(TEST_ONNX_DIR ${CMAKE_CURRENT_SOURCE_DIR}/onnx)
-file (GLOB ONNX_TESTS ${TEST_ONNX_DIR}/*.cpp)
+file(GLOB ONNX_TESTS ${TEST_ONNX_DIR}/*.cpp)
 foreach(ONNX_TEST ${ONNX_TESTS})
    get_filename_component(BASE_NAME ${ONNX_TEST} NAME_WE)
    set(TEST_NAME test_${BASE_NAME})
@@ -180,12 +190,12 @@ add_dependencies(check test_tf)
 add_subdirectory(api)
 add_subdirectory(verify)
 if(MIGRAPHX_ENABLE_PYTHON)
-add_subdirectory(py)
+    add_subdirectory(py)
 endif()
 function(test_header NAME HEADER)
    file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/header-main-include-${NAME}.cpp
        "#include <${HEADER}>\nint main() {}\n"
    )
@@ -206,6 +216,7 @@ function(test_headers PREFIX)
        string(MAKE_C_IDENTIFIER ${HEADER_REL} TEST_NAME)
        get_filename_component(BASE_NAME ${HEADER} NAME_WE)
        test_header(header_${TEST_NAME} ${PREFIX}/${BASE_NAME}.hpp)
        if(MIGRAPHX_ENABLE_GPU)
            target_link_libraries(header_${TEST_NAME} migraphx_gpu)
        endif()
@@ -214,6 +225,7 @@ endfunction()
 test_headers(migraphx ${CMAKE_SOURCE_DIR}/src/include/migraphx/*.hpp)
 test_headers(migraphx/ref ${CMAKE_SOURCE_DIR}/src/targets/ref/include/migraphx/ref/*.hpp)
 if(MIGRAPHX_ENABLE_GPU)
-test_headers(migraphx/gpu ${CMAKE_SOURCE_DIR}/src/targets/gpu/include/migraphx/gpu/*.hpp)
+    test_headers(migraphx/gpu ${CMAKE_SOURCE_DIR}/src/targets/gpu/include/migraphx/gpu/*.hpp)
 endif()
--- a/test/api/test_gpu.cpp
+++ b/test/api/test_gpu.cpp
@@ -35,6 +35,7 @@ TEST_CASE(load_and_run)
    auto shapes_before = p.get_output_shapes();
    migraphx::compile_options options;
    options.set_offload_copy();
+    options.set_exhaustive_tune_flag();
    p.compile(migraphx::target("gpu"), options);
    auto shapes_after = p.get_output_shapes();
    CHECK(shapes_before.size() == 1);

--- a/test/api/test_save_load.cpp
+++ b/test/api/test_save_load.cpp
@@ -30,7 +30,6 @@ TEST_CASE(load_save_default)
    std::string filename = "migraphx_api_load_save.mxr";
    auto p1              = migraphx::parse_onnx("conv_relu_maxpool_test.onnx");
    auto s1              = p1.get_output_shapes();
    migraphx::save(p1, filename.c_str());
    auto p2 = migraphx::load(filename.c_str());
    auto s2 = p2.get_output_shapes();

--- a/test/gpu/jit.cpp
+++ b/test/gpu/jit.cpp
@@ -35,13 +35,14 @@
 #include <migraphx/gpu/compile_hip.hpp>
 #include <migraphx/gpu/compile_hip_code_object.hpp>
 #include <migraphx/gpu/compiler.hpp>
+#include <migraphx_kernels.hpp>
 // NOLINTNEXTLINE
 const std::string write_2s = R"__migraphx__(
 #include <hip/hip_runtime.h>
 extern "C" {
-__global__ void write(int8_t* data) 
+__global__ void write(char* data) 
 {
    int num = threadIdx.x + blockDim.x * blockIdx.x;
    data[num] = 2;
@@ -58,7 +59,7 @@ const std::string add_2s_binary = R"__migraphx__(
 #include <hip/hip_runtime.h>
 extern "C" {
-__global__ void add_2(std::int8_t* x, std::int8_t* y) 
+__global__ void add_2(char* x, char* y) 
 {
    int num = threadIdx.x + blockDim.x * blockIdx.x;
    y[num] = x[num] + 2;
@@ -137,7 +138,8 @@ int main() {}
 const std::string math_template = R"__migraphx__(
 #include <migraphx/kernels/pointwise.hpp>
 #include <migraphx/kernels/math.hpp>
+#include <migraphx/kernels/types.hpp>
+using namespace migraphx;
 extern "C" {
 __global__ void kernel(${type}* p) 
 {

--- a/test/memory_coloring_test.cpp
+++ b/test/memory_coloring_test.cpp
@@ -691,7 +691,7 @@ TEST_CASE(test38)
    auto p83    = m.add_instruction(pass_op{}, p78, p77);
    m.add_instruction(pass_op{}, output, p83, p63);
    run_pass(m);
-    CHECK(m.get_parameter_shape("scratch").bytes() == 7225344); // Optimal solution is 6422528
+    CHECK(m.get_parameter_shape("scratch").bytes() == 6422528);
    CHECK(no_allocate(m));
 }
@@ -729,7 +729,7 @@ TEST_CASE(test39)
        run_pass(*smod);
    }
-    CHECK(mm->get_parameter_shape("scratch").bytes() == 4);
+    CHECK(mm->get_parameter_shape("scratch").bytes() == 1);
    CHECK(then_mod->get_parameter_shape("scratch").bytes() == 24);
    CHECK(else_mod->get_parameter_shape("scratch").bytes() == 24);
    CHECK(no_allocate(*mm));
@@ -3374,7 +3374,7 @@ TEST_CASE(rnn_dom)
    m.add_instruction(pass_op{}, moutput, mx250, mx249, mx248);
    run_pass(m);
-    CHECK(m.get_parameter_shape("scratch").bytes() == 1600);
+    CHECK(m.get_parameter_shape("scratch").bytes() == 1824); // Optimal is 1600
    CHECK(no_allocate(m));
    CHECK(is_disjoint({mx0, mx8}));
    CHECK(is_disjoint({mx0, mx8}));
@@ -3790,4 +3790,23 @@ TEST_CASE(literal_test)
    CHECK(lit == result);
 }
+TEST_CASE(test_tuple)
+{
+    // An allocation with a tuple shape (float[8] + half[10]) is kept in use
+    // alongside a second float[4] allocation; the two buffers must not
+    // overlap and the scratch parameter is expected to total 68 bytes.
+    migraphx::module m;
+    auto float_part  = migraphx::shape{migraphx::shape::float_type, {8}};
+    auto half_part   = migraphx::shape{migraphx::shape::half_type, {10}};
+    auto tuple_shape = migraphx::shape{{float_part, half_part}};
+    auto a1          = add_alloc(m, tuple_shape);
+    auto tuple_user  = m.add_instruction(pass_op{}, a1);
+    auto a2          = add_alloc(m, {migraphx::shape::float_type, {4}});
+    m.add_instruction(pass_op{}, a2, tuple_user);
+    run_pass(m);
+    CHECK(m.get_parameter_shape("scratch").bytes() == 68);
+    CHECK(no_allocate(m));
+    CHECK(is_disjoint({a1, a2}));
+}
 int main(int argc, const char* argv[]) { test::run(argc, argv); }
--- a/test/onnx/.onnxrt-commit
+++ b/test/onnx/.onnxrt-commit
+c9a53c925510a101f5ca94d5ecda0924e40a8463