Commit 7e297b13 authored by Paul

Merge

parents 86ea5e91 aa7ff911
@@ -3,31 +3,52 @@
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/functional.hpp>
#include <migraphx/kernels/math.hpp>
#include <migraphx/kernels/preload.hpp>
#include <migraphx/kernels/vectorize.hpp>
#include <migraphx/kernels/args.hpp>

namespace migraphx {
template <class T>
struct implicit_conversion_op
{
T x;
template <index_int N, class U>
constexpr operator vec<U, N>() const
{
        static_assert(vec_size<T>() == N, "Vector size mismatch");
return __builtin_convertvector(x, vec<U, N>);
}
template <class U>
constexpr operator U() const
{
return x;
}
};
template <class T>
constexpr implicit_conversion_op<T> implicit_conversion(T x)
{
return {x};
}
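// Usage sketch (illustrative, not part of this commit): implicit_conversion
// defers the result conversion to the assignment site, so a lambda returning
// vec<float, 4> can be written into a half-typed output element:
//   out[i] = implicit_conversion(f(xs[i]...)); // float4 -> half4 via
//                                              // __builtin_convertvector
// Scalar results fall through to the plain conversion operator instead.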
template <class F, class T, class... Ts>
__device__ void pointwise_tensor(index idx, F f, T out, Ts... xs)
{
    idx.global_stride(out.get_shape().elements(),
                      [&](auto i) { out[i] = implicit_conversion(f(xs[i]...)); });
}

template <class... Transforms>
__device__ auto pointwise(index idx, Transforms... transforms)
{
    return [=](auto f, auto*... ps) {
        auto t = transform_args(make_tensors(), rotate_last(), transforms...);
        t(ps...)([&](auto... xs) { pointwise_tensor(idx, f, xs...); });
    };
}
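// Call sketch (illustrative): a generated kernel would invoke the returned
// function with the functor and raw device pointers (output last, which
// rotate_last moves to the front before pointwise_tensor runs):
//   pointwise(idx, auto_vectorize())(op, x_ptr, y_ptr, out_ptr);
// transform_args wraps the pointers as tensor_views and applies the extra
// transforms (vectorize, preload, ...) before the element-wise loop.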
} // namespace migraphx
@@ -3,16 +3,25 @@
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/functional.hpp>
#include <migraphx/kernels/tensor_view.hpp>
#include <migraphx/kernels/vec.hpp>

namespace migraphx {

template <class T>
struct remove_vec_impl
{
    using type = T;
};

template <class T, index_int N>
struct remove_vec_impl<vec<T, N>>
{
    using type = T;
};

template <class T>
using remove_vec = typename remove_vec_impl<T>::type;

template <class T, class... Shapes>
constexpr auto traverse_preload(Shapes... ss)
@@ -20,11 +29,11 @@ constexpr auto traverse_preload(Shapes... ss)
    return [=](auto f, auto... g) {
        index_int offset = 0;
        auto each        = [&](auto x) {
            using type          = remove_vec<typename decltype(x)::type>;
            constexpr auto s    = decltype(x.get_shape()){};
            constexpr auto size = s.element_space();
            if constexpr(not s.broadcasted() or (s.elements() - size) < 64 or
                         not is_same<T, type>{})
                return f(x, offset, false_type{});
            else
            {
@@ -39,7 +48,7 @@ constexpr auto traverse_preload(Shapes... ss)
}

template <class T, class... Shapes>
constexpr index_int compute_preload_size_c(Shapes...)
{
    index_int size = 0;
    traverse_preload<T>(Shapes{}...)(
@@ -47,6 +56,12 @@ constexpr index_int compute_preload_size(Shapes...)
    return size;
}

template <class T, class... Shapes>
constexpr auto compute_preload_size(Shapes...)
{
    return _c<compute_preload_size_c<T>(Shapes{}...)>;
}

template <class F, class T, class... Ts>
__device__ auto preload_copy(index idx, F f, __shared__ T* buffer, Ts... xs)
{
@@ -58,11 +73,21 @@ __device__ auto preload_copy(index idx, F f, __shared__ T* buffer, Ts... xs)
        [&](auto x, auto offset, auto copy) {
            if constexpr(copy)
            {
                if constexpr(decltype(tensor_vec_size(x)){} == 0)
                {
                    auto v = auto_vectorize(x);
                    auto b = as_vec(tensor_vec_size(v), buffer + offset);
                    idx.local_stride(v.get_shape().element_space(),
                                     [&](auto i) { b[i] = v.data()[i]; });
                    return x.with(buffer + offset);
                }
                else
                {
                    auto b = as_vec(tensor_vec_size(x), buffer + offset);
                    idx.local_stride(x.get_shape().element_space(),
                                     [&](auto i) { b[i] = x.data()[i]; });
                    return x.with(b);
                }
            }
            else
            {
@@ -72,23 +97,23 @@ __device__ auto preload_copy(index idx, F f, __shared__ T* buffer, Ts... xs)
        invoke);
}

template <class T, class Shape>
struct shape_type : Shape
{
    using type = T;
};

template <class T>
constexpr auto make_shape_type(T)
{
    return shape_type<typename T::type, typename T::shape_type>{};
}

template <class T, class... Ts>
__device__ auto preload(index idx, Ts... xs)
{
    using type               = remove_vec<T>;
    constexpr auto size      = decltype(compute_preload_size<type>(make_shape_type(xs)...)){};
    const index_int max_size = 512 * sizeof(type);
    return [=](auto f) {
        if constexpr(size > 0 and size < max_size)
@@ -103,5 +128,47 @@ __device__ auto preload(index idx, Ts... xs)
    };
}
inline __device__ auto auto_preload(index idx)
{
return make_transform([=](auto f, auto out, auto... xs) {
preload<typename decltype(out)::type>(idx, xs...)([&](auto... ys) { f(out, ys...); });
});
}
template <bool B, class T>
__device__ auto preload_copy(index idx, T x)
{
return [=](auto f) {
if constexpr(B)
{
using type = typename T::type;
constexpr auto size = get_shape_c<T>{}.element_space();
__shared__ type buffer[size];
        // TODO: Always vectorize when size > 4, and then use a second loop for the remainder
constexpr auto n = find_vectorize_size([&](auto i) { return (size % i) == 0; });
auto input = as_vec<n>(remove_bool(x.data()));
auto b = as_vec<n>(remove_bool(buffer));
idx.local_stride(size / n, [&](auto i) { b[i] = input[i]; });
return f(x.with(buffer));
}
else
{
return f(x);
}
};
}
template <bool... Bs>
__device__ auto auto_preload(index idx)
{
return make_transform([=](auto f, auto... xs) {
auto invoke = [=](auto... ys) {
__syncthreads();
f(ys...);
};
join(invoke, preload_copy<Bs>(idx, xs)...);
});
}
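// Usage sketch (illustrative): the Bs... flags choose per-argument whether to
// stage that tensor in LDS, e.g. preloading only the second of three arguments:
//   transform_args(make_tensors(), rotate_last(), auto_preload<false, true, false>(idx));
// Each flagged tensor is copied into a __shared__ buffer with a vectorized
// local_stride loop, and __syncthreads() runs before the kernel body reads it.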
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_PRELOAD_HPP
#ifndef MIGRAPHX_GUARD_KERNELS_PRINT_HPP
#define MIGRAPHX_GUARD_KERNELS_PRINT_HPP

#include <migraphx/kernels/hip.hpp>
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/functional.hpp>
#include <migraphx/kernels/algorithm.hpp>
@@ -140,6 +140,10 @@ struct basic_printer
    {
        return print_ulong(value);
    }
__host__ __device__ const basic_printer& operator<<(migraphx::half value) const
{
return print_double(value);
}
    __host__ __device__ const basic_printer& operator<<(float value) const
    {
        return print_double(value);
#ifndef MIGRAPHX_GUARD_KERNELS_REDUCE_HPP
#define MIGRAPHX_GUARD_KERNELS_REDUCE_HPP
#include <migraphx/kernels/dpp.hpp>
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/tensor_view.hpp>
#include <migraphx/kernels/ops.hpp>
namespace migraphx {
#if MIGRAPHX_HAS_DPP
template <class T, class Op>
__device__ void dpp_reduce(T& in, Op op)
{
T out{};
out = dpp_mov<dpp_row_shr(1)>(in);
in = op(in, out);
out = dpp_mov<dpp_row_shr(2)>(in);
in = op(in, out);
out = dpp_mov<dpp_row_shr(4), 0xf, 0xe>(in);
in = op(in, out);
out = dpp_mov<dpp_row_shr(8), 0xf, 0xc>(in);
in = op(in, out);
#if __AMDGCN_WAVEFRONT_SIZE == 64
out = dpp_mov<dpp_row_bcast(15), 0xa>(in);
in = op(in, out);
out = dpp_mov<dpp_row_bcast(31), 0xc>(in);
in = op(in, out);
#endif
}
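// A host-side sketch (illustrative, not part of this commit) of the pattern the
// DPP row-shift moves above implement: each step combines a lane with the lane
// `shift` below it, so after shifts of 1, 2, 4 and 8 the last lane of a 16-lane
// row holds the reduction of the whole row. The bank/row masks in the real code
// only skip lanes whose partial results are never read.
template <class Op>
constexpr float dpp_row_reduce_sketch(const float (&in)[16], Op op)
{
    float lanes[16];
    for(int i = 0; i < 16; i++)
        lanes[i] = in[i];
    const unsigned shifts[4] = {1, 2, 4, 8};
    for(unsigned shift : shifts)
    {
        float prev[16];
        for(int i = 0; i < 16; i++)
            prev[i] = lanes[i];
        for(unsigned i = shift; i < 16; i++)
            lanes[i] = op(prev[i], prev[i - shift]);
    }
    return lanes[15]; // the lane whose value block_reduce stores to LDS
}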
#if defined(MIGRAPHX_USE_CLANG_TIDY) || defined(CPPCHECK)
// NOLINTNEXTLINE
#define MIGRAPHX_DPP_REDUCE_ASM(x, ins) x = 1
#elif __AMDGCN_WAVEFRONT_SIZE == 64
#define MIGRAPHX_DPP_REDUCE_ASM(x, ins) \
__asm__ volatile("s_nop 4\n" #ins " %0 %0 %0 row_shr:1\n" \
"s_nop 1\n" #ins " %0 %0 %0 row_shr:2\n" \
"s_nop 1\n" #ins " %0 %0 %0 row_shr:4 bank_mask:0xe\n" \
"s_nop 1\n" #ins " %0 %0 %0 row_shr:8 bank_mask:0xc\n" \
"s_nop 1\n" #ins " %0 %0 %0 row_bcast:15 row_mask:0xa\n" \
"s_nop 1\n" #ins " %0 %0 %0 row_bcast:31 row_mask:0xc\n" \
"s_nop 1\n" \
: "=v"(x) \
: "0"(x))
#else
#define MIGRAPHX_DPP_REDUCE_ASM(x, ins) \
__asm__ volatile("s_nop 4\n" #ins " %0 %0 %0 row_shr:1\n" \
"s_nop 1\n" #ins " %0 %0 %0 row_shr:2\n" \
"s_nop 1\n" #ins " %0 %0 %0 row_shr:4 bank_mask:0xe\n" \
"s_nop 1\n" #ins " %0 %0 %0 row_shr:8 bank_mask:0xc\n" \
"s_nop 1\n" \
"s_nop 1\n" \
: "=v"(x) \
: "0"(x))
#endif
// NOLINTNEXTLINE
#define MIGRAPHX_DPP_REDUCE(op, prefix) \
__device__ inline void dpp_reduce(double& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f64); } \
__device__ inline void dpp_reduce(float& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f32); } \
__device__ inline void dpp_reduce(half& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_f16); } \
__device__ inline void dpp_reduce(int32_t& x, op) \
{ \
MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_u32); \
} \
__device__ inline void dpp_reduce(uint32_t& x, op) { MIGRAPHX_DPP_REDUCE_ASM(x, prefix##_u32); }
MIGRAPHX_DPP_REDUCE(op::sum, v_add)
MIGRAPHX_DPP_REDUCE(op::max, v_max)
MIGRAPHX_DPP_REDUCE(op::min, v_min)
MIGRAPHX_DPP_REDUCE(op::product, v_mul)
template <class Op, class T, class F>
__device__ auto block_reduce(index idx, Op op, T init, index_int n, F f)
{
#if __AMDGCN_WAVEFRONT_SIZE == 32
constexpr index_int lanes_per_thread = 16;
#else
constexpr index_int lanes_per_thread = 64;
#endif
using type = decltype(f(0));
__shared__ type buffer[idx.nlocal() / lanes_per_thread];
type x = init;
idx.local_stride(n, [&](auto i) { x = op(x, f(i)); });
dpp_reduce(x, op);
const auto ldsidx = idx.local / lanes_per_thread;
if((idx.local % lanes_per_thread) == lanes_per_thread - 1)
{
buffer[ldsidx] = x;
}
__syncthreads();
type y = init;
for(index_int i = 0; i < idx.nlocal() / lanes_per_thread; i++)
{
y = op(y, buffer[i]);
}
return y;
}
#else
template <class Op, class T, class F>
__device__ auto block_reduce(index idx, Op op, T init, index_int n, F f)
{
using type = decltype(f(0));
__shared__ type buffer[idx.nlocal()];
type x = init;
idx.local_stride(n, [&](auto i) { x = op(x, f(i)); });
buffer[idx.local] = x;
__syncthreads();
for(index_int s = 1; s < idx.nlocal(); s *= 2)
{
const index_int index = 2 * s * idx.local;
if(index + s < idx.nlocal())
{
buffer[index] = op(buffer[index], buffer[index + s]);
}
__syncthreads();
}
return buffer[0];
}
#endif
template <class Output, class Input, class T>
constexpr auto reduce_slice(Input input, T i)
{
constexpr auto lens = transform(get_shape_c<Input>{}.lens,
get_shape_c<Output>{}.lens,
[](index_int x, index_int y) -> index_int {
if(x == y)
return 1;
return x;
});
constexpr auto s = make_shape(lens, get_shape_c<Input>{}.strides);
MIGRAPHX_ASSERT((input.get_shape().index(i) + s.element_space()) <=
input.get_shape().element_space());
return make_tensor_view(&input[i], s);
}
namespace reduce {
template <class Slicer, class F>
constexpr auto sliced(Slicer slicer, F f)
{
return [=](auto x, auto... xs) {
// TODO: assert all elements are the same
return f(slicer(x), slicer(xs)...);
};
}
struct block
{
template <class Slicer>
struct reducer
{
index idx;
Slicer slicer;
template <class Op, class T, class Read>
__device__ auto reduce(Op op, T init, Read read) const
{
return sliced(slicer, [=](auto x, auto... xs) {
return vec_reduce(block_reduce(idx,
op,
init,
x.get_shape().elements(),
[&](auto j) { return read(x[j], xs[j]...); }),
op);
});
}
template <class F>
__device__ void outer(F f) const
{
if(idx.local == 0)
f();
}
};
template <class Slicer>
static __device__ auto make(index idx, Slicer slicer)
{
return reducer<Slicer>{idx, slicer};
}
template <class Output, class F>
static __device__ void run(F f)
{
auto idx = make_index();
constexpr auto nelements = get_shape_c<Output>{}.elements();
idx.global_stride(nelements * idx.nlocal(), [&](auto i) {
const auto out_idx = get_shape_c<Output>{}.multi(i / idx.nlocal());
f(out_idx, make(idx, [&](auto input) { return reduce_slice<Output>(input, out_idx); }));
});
}
};
struct lane
{
template <class Slicer>
struct reducer
{
index idx;
Slicer slicer;
template <class Op, class T, class Read>
__device__ auto reduce(Op op, T init, Read read) const
{
return sliced(slicer, [=](auto x, auto... xs) {
using type = typename decltype(x)::type;
type r = init;
for(index_int j = 0; j < x.get_shape().elements(); j++)
{
r = op(r, read(x[j], xs[j]...));
}
return r;
});
}
template <class F>
__device__ void outer(F f) const
{
f();
}
};
template <class Slicer>
static __device__ auto make(index idx, Slicer slicer)
{
return reducer<Slicer>{idx, slicer};
}
template <class Output, class F>
static __device__ void run(F f)
{
auto idx = make_index();
constexpr auto nelements = get_shape_c<Output>{}.elements();
idx.global_stride(nelements, [&](auto i) {
const auto out_idx = get_shape_c<Output>{}.multi(i);
f(out_idx, make(idx, [&](auto input) { return reduce_slice<Output>(input, out_idx); }));
});
}
};
} // namespace reduce
template <class Algo,
class Op,
class T,
class Input,
class Output,
class ReadInput,
          class WriteOutput>
__device__ void
simple_reduce(Op op, T init, Input input, Output output, ReadInput read, WriteOutput write)
{
Algo::template run<Output>([&](auto out_idx, auto r) {
auto x = r.reduce(op, init, read)(input);
r.outer([&] { output[out_idx] = write(x); });
});
}
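// Usage sketch (illustrative, not part of this commit): reducing each row of an
// [N, K] input into an [N, 1] output, one block per output element, assuming
// `in` and `out` are tensor_views of those shapes:
template <class Input, class Output>
__device__ void row_sum_sketch(Input in, Output out)
{
    simple_reduce<reduce::block>(op::sum{},
                                 0.0f,
                                 in,
                                 out,
                                 [](auto x) { return x; },  // read each element unchanged
                                 [](auto x) { return x; }); // write the reduced value unchanged
}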
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_REDUCE_HPP
#ifndef MIGRAPHX_GUARD_KERNELS_ROIALIGN_HPP
#define MIGRAPHX_GUARD_KERNELS_ROIALIGN_HPP
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/dfor.hpp>
#include <migraphx/kernels/ops.hpp>
#include <migraphx/kernels/math.hpp>
#include <migraphx/kernels/array.hpp>
namespace migraphx {
struct max_pool
{
MIGRAPHX_DEVICE_CONSTEXPR auto init() { return lowest{}; }
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T operator()(T x, T y)
{
return max(x, y);
}
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T final(T x, index_int)
{
return (x);
}
};
struct avg_pool
{
MIGRAPHX_DEVICE_CONSTEXPR auto init() { return 0.0; }
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T operator()(T x, T y)
{
return x + y;
}
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T final(T x, index_int y)
{
return (y == 0) ? 0.0 : (x / y);
}
};
template <class Iterator, class Op>
MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
const Iterator data, const array<index_int, 2>& dims, array<float, 2> xy, Op pooling)
{
array<int, 2> low{};
array<int, 2> high{};
for(index_int ii = 0; ii < xy.size(); ++ii)
{
if(xy[ii] < -1.0f or xy[ii] > dims[ii])
{
return 0;
}
xy[ii] = migraphx::max(xy[ii], 0.0f);
low[ii] = xy[ii];
high[ii] = low[ii] + 1;
if(low[ii] >= dims[ii] - 1)
{
xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
}
}
array<index_int, 4> locs = {low[0] * dims[1] + low[1],
low[0] * dims[1] + high[1],
high[0] * dims[1] + low[1],
high[0] * dims[1] + high[1]};
float ly = xy[0] - low[0];
float lx = xy[1] - low[1];
float hy = 1.0f - ly;
float hx = 1.0f - lx;
array<typename Iterator::value_type, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx};
auto v01 = pooling(data[locs[0]] * ws[0], data[locs[1]] * ws[1]);
auto v23 = pooling(data[locs[2]] * ws[2], data[locs[3]] * ws[3]);
return pooling(v01, v23);
}
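// Worked example (illustrative): for a sample point offset (ly, lx) = (0.25, 0.75)
// inside a cell, the corner weights above are
//   ws = {hy*hx, hy*lx, ly*hx, ly*lx}
//      = {0.75*0.25, 0.75*0.75, 0.25*0.25, 0.25*0.75}
//      = {0.1875, 0.5625, 0.0625, 0.1875}   // sums to 1
// so avg_pool accumulates a standard bilinear sample, while max_pool keeps the
// largest weighted corner instead.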
template <class Iterator, class Op>
MIGRAPHX_DEVICE_CONSTEXPR auto calc_pooling(const Iterator& data,
const array<float, 2>& roi_starts,
const array<float, 2>& bin_size,
const array<int, 2>& idx,
const array<index_int, 2>& bin_grid_size,
const array<index_int, 2>& dims,
float roi_offset,
Op op)
{
typename Iterator::value_type output_val = op.init();
const int64_t count = bin_grid_size[0] * bin_grid_size[1];
dfor(bin_grid_size[0], bin_grid_size[1])([&](auto iy, auto ix) {
array<index_int, 2> id = {iy, ix};
array<float, 2> locs =
roi_starts + idx * bin_size + bin_size * (id + 0.5f) / bin_grid_size + roi_offset;
auto val = bilinear_interpolate(data, dims, locs, op);
output_val = op(output_val, val);
});
return op.final(output_val, count);
}
template <class T1, class T2, class T3, class T4>
struct roalign_settings
{
T1 roi_offset{};
T2 is_avg_pooling{};
T3 sampling_ratio{};
T4 spatial_scale{};
};
template <class... Ts>
constexpr roalign_settings<Ts...> make_roalign_settings(Ts... xs)
{
return {xs...};
}
template <class T, class U, class V, class W, class Settings>
__device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t, Settings s)
{
auto index = make_index();
const auto x = x_t.begin();
const auto rois = rois_t.begin();
const auto ind = ind_t.begin();
// input shape
auto x_lens = x_t.get_shape().lens;
auto channel_num = x_lens[1];
// input dims of height and width, in all 2-dim arrays, the first dim
// is for height and second dim is for width
array<index_int, 2> in_dims = {x_lens[2], x_lens[3]};
const auto stride = index.nglobal();
auto out_s = y_t.get_shape();
auto roi_column_num = rois_t.get_shape().lens[1];
// output dims of height and width, in all 2-dim arrays, the first dim
// is for height and second dim is for width
const auto& out_lens = out_s.lens;
array<index_int, 2> out_dims = {out_lens[2], out_lens[3]};
for(index_int i = index.global; i < out_s.elements(); i += stride)
{
auto idx = out_s.multi(i);
int n = idx[0];
int c = idx[1];
int ph = idx[2];
int pw = idx[3];
const auto offset_rois = rois + (n * roi_column_num);
const int batch_ind = ind[n];
array<float, 2> roi_starts = {offset_rois[1] * s.spatial_scale,
offset_rois[0] * s.spatial_scale};
array<float, 2> roi_ends = {offset_rois[3] * s.spatial_scale,
offset_rois[2] * s.spatial_scale};
array<float, 2> roi_size{};
array<float, 2> bin_size{};
array<index_int, 2> bin_grid_size{};
for(index_int ii = 0; ii < roi_size.size(); ++ii)
{
roi_size[ii] = roi_ends[ii] - roi_starts[ii];
roi_size[ii] = migraphx::max(roi_size[ii], 1.0f);
bin_size[ii] = roi_size[ii] / out_dims[ii];
bin_grid_size[ii] = (s.sampling_ratio > 0)
? s.sampling_ratio
: migraphx::ceil(roi_size[ii] / out_dims[ii]);
}
const auto offset_x = x + ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]);
if constexpr(s.is_avg_pooling)
{
y_t[i] = calc_pooling(offset_x,
roi_starts,
bin_size,
{ph, pw},
bin_grid_size,
in_dims,
s.roi_offset,
avg_pool{});
}
else
{
y_t[i] = calc_pooling(offset_x,
roi_starts,
bin_size,
{ph, pw},
bin_grid_size,
in_dims,
s.roi_offset,
max_pool{});
}
}
}
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_KERNELS_SCATTERND_HPP
#define MIGRAPHX_GUARD_KERNELS_SCATTERND_HPP
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/algorithm.hpp>
namespace migraphx {
struct assign_none
{
template <class T, class U>
MIGRAPHX_DEVICE_CONSTEXPR void operator()(T& x, U y) const
{
x = y;
}
};
struct assign_add
{
template <class T, class U>
MIGRAPHX_DEVICE_CONSTEXPR void operator()(T& x, U y) const
{
x += y;
}
};
struct assign_mul
{
template <class T, class U>
MIGRAPHX_DEVICE_CONSTEXPR void operator()(T& x, U y) const
{
x *= y;
}
};
template <class T, class U, class V, class F>
__device__ void scatternd(const T& indices_t, const U& updates_t, const V& output_t, F f)
{
auto index = make_index();
auto updates_shape = updates_t.get_shape();
index.global_stride(updates_shape.elements(), [&](auto i) {
auto output_shape = output_t.get_shape();
auto indices_shape = indices_t.get_shape();
auto k = indices_shape.lens.back();
auto q = indices_shape.lens.size();
auto updates_idx = updates_shape.multi(i);
auto indices_idx = indices_shape.multi(0);
copy(updates_idx.begin(), updates_idx.begin() + q - 1, indices_idx.begin());
auto index_start = indices_t.begin() + indices_shape.index(indices_idx);
auto index_end = index_start + k;
auto out_idx = output_shape.multi(0);
copy(index_start, index_end, out_idx.begin());
copy(updates_idx.begin() + q - 1, updates_idx.end(), out_idx.begin() + k);
f(output_t[out_idx], updates_t[i]);
});
}
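// Worked example (illustrative): with output shape [4, 4], indices shape [2, 1]
// holding {{1}, {3}} (so q = 2, k = 1) and updates shape [2, 4], each update row
// is routed to the output row named by its index tuple:
//   output[1][:] = updates[0][:]
//   output[3][:] = updates[1][:]
// With assign_add or assign_mul the rows are accumulated instead of overwritten.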
} // namespace migraphx
#endif
@@ -17,35 +17,38 @@ struct shape
    constexpr shape(Lens l, Strides s) : lens(l), strides(s) {}

    constexpr auto elements() const { return _c<Lens{}.product()>; }
    constexpr auto element_space() const { return _c<Strides{}.dot(Lens{} - 1) + 1>; }
    constexpr auto packed() const { return elements() == element_space(); }
    constexpr auto broadcasted() const { return _c<Strides{}.product() == 0>; }
    constexpr auto transposed() const
    {
        return return_c([] {
            auto lstrides = Strides{};
            if(shape{}.broadcasted())
            {
                index_array s{};
                index_int j = 0;
                for(index_int i = 0; i < s.size(); i++)
                {
                    if(lstrides[i] != 0)
                    {
                        s[j] = lstrides[i];
                        j++;
                    }
                }
                return not is_sorted(s.begin(), s.begin() + j, greater{});
            }
            else
            {
                return not is_sorted(lstrides.begin(), lstrides.end(), greater{});
            }
        });
    }

    constexpr auto standard() const { return packed() and not transposed(); }

    constexpr index_int index(index_array x) const { return x.dot(strides); }
@@ -63,10 +66,10 @@ struct shape
            return i;
        else
        {
            const auto rank  = this->lens.size();
            index_int s      = 1;
            index_int result = 0;
            for(index_int j = 0; j < rank; j++)
            {
                const index_int k      = rank - j - 1;
                const index_int stride = this->strides[k];
@@ -80,11 +83,12 @@ struct shape
            }
        }
    }

    /// Convert single index into a multi-index
    constexpr index_array multi(index_int idx) const
    {
        index_array result;
        index_int tidx = idx;
        for(diff_int is = result.size() - 1; is > 0; is--)
        {
            result[is] = tidx % lens[is];
            tidx       = tidx / lens[is];
@@ -92,6 +96,13 @@ struct shape
        result[0] = tidx;
        return result;
    }
/// Convert multi-index into a single index
constexpr index_int single(index_array idx) const
{
if(idx.empty())
return 0;
return inner_product(lens.begin() + 1, lens.end(), idx.begin(), idx.back());
}
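    // Worked example (illustrative): with lens = {2, 3, 4}, multi(17) yields
    // {1, 1, 1} since 17 = 1*(3*4) + 1*4 + 1, and single() maps that
    // multi-index back to the flat element index.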
    constexpr shape get_shape() const { return *this; }
@@ -3,28 +3,62 @@
#include <migraphx/kernels/shape.hpp>
#include <migraphx/kernels/debug.hpp>
#include <migraphx/kernels/iota_iterator.hpp>

namespace migraphx {
template <class T>
struct tensor_view_iterator_read
{
T* view;
constexpr auto& operator()(index_int n) const
{
MIGRAPHX_ASSERT(view != nullptr);
return (*view)[n];
}
};
template <class T, class Shape>
struct tensor_view
{
    using type        = T;
    using shape_type  = Shape;
    using index_array = typename Shape::index_array;
    using iterator = basic_iota_iterator<tensor_view_iterator_read<const tensor_view>, index_int>;

    constexpr Shape get_shape() const { return Shape{}; }
    constexpr auto size() const { return get_shape().elements(); }

    struct index_to_offset
    {
        index_int offset;
        template <class U>
        constexpr index_to_offset(U i) : offset(Shape{}.index(i))
        {
        }
    };

    constexpr T& operator[](MIGRAPHX_CAPTURE_SOURCE_LOCATION(index_to_offset) i) const
    {
        index_to_offset ito = i;
        MIGRAPHX_WARN(ito.offset < get_shape().element_space(),
                      i,
                      "Out of bounds access at offset: ",
                      ito.offset);
        return x[ito.offset];
    }

    constexpr T* data() const { return x; }
    constexpr auto begin() const { return iterator{0, {this}}; }
    constexpr auto end() const { return iterator{this->size(), {this}}; }
constexpr auto begin_at(index_array i) const
{
MIGRAPHX_ASSERT(get_shape().single(i) < get_shape().elements());
MIGRAPHX_ASSERT(get_shape().index(i) < get_shape().element_space());
return iterator{get_shape().single(i), {this}};
}
    template <class U>
    constexpr tensor_view<U, Shape> with(U* y) const
@@ -36,6 +70,9 @@ struct tensor_view
    T* x;
};
template <class T>
using get_shape_c = typename T::shape_type;
template <class T, class Shape>
constexpr tensor_view<T, Shape> make_tensor_view(T* x, Shape)
{
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPE_TRAITS_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPE_TRAITS_HPP
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/integral_constant.hpp>
namespace migraphx {
template <class T, class U = T&&>
U private_declval(int);
template <class T>
T private_declval(long);
template <class T>
auto declval() noexcept -> decltype(private_declval<T>(0));
template <class T>
struct type_identity
{
using type = T;
};
template <bool B, class T = void>
struct enable_if
{
};
template <class T>
struct enable_if<true, T>
{
using type = T;
};
template <bool B, class T = void>
using enable_if_t = typename enable_if<B, T>::type;
template <bool B, class T, class F>
struct conditional
{
using type = T;
};
template <class T, class F>
struct conditional<false, T, F>
{
using type = F;
};
template <bool B, class T, class F>
using conditional_t = typename conditional<B, T, F>::type;
// NOLINTNEXTLINE
#define MIGRAPHX_BUILTIN_TYPE_TRAIT1(name) \
template <class T> \
struct name : bool_constant<__##name(T)> \
{ \
}
// NOLINTNEXTLINE
#define MIGRAPHX_BUILTIN_TYPE_TRAIT2(name) \
template <class T, class U> \
struct name : bool_constant<__##name(T, U)> \
{ \
}
// NOLINTNEXTLINE
#define MIGRAPHX_BUILTIN_TYPE_TRAITN(name) \
template <class... Ts> \
struct name : bool_constant<__##name(Ts...)> \
{ \
}
// MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_arithmetic);
// MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_destructible);
// MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_nothrow_destructible);
// MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_pointer);
// MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_scalar);
// MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_signed);
// MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_void);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_abstract);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_aggregate);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_array);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_class);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_compound);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_const);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_empty);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_enum);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_final);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_floating_point);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_function);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_fundamental);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_integral);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_literal_type);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_lvalue_reference);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_member_function_pointer);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_member_object_pointer);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_member_pointer);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_object);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_pod);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_polymorphic);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_reference);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_rvalue_reference);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_standard_layout);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_trivial);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_trivially_copyable);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_trivially_destructible);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_union);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_unsigned);
MIGRAPHX_BUILTIN_TYPE_TRAIT1(is_volatile);
MIGRAPHX_BUILTIN_TYPE_TRAIT2(is_assignable);
MIGRAPHX_BUILTIN_TYPE_TRAIT2(is_base_of);
MIGRAPHX_BUILTIN_TYPE_TRAIT2(is_convertible);
MIGRAPHX_BUILTIN_TYPE_TRAIT2(is_nothrow_assignable);
MIGRAPHX_BUILTIN_TYPE_TRAIT2(is_same);
MIGRAPHX_BUILTIN_TYPE_TRAIT2(is_trivially_assignable);
MIGRAPHX_BUILTIN_TYPE_TRAITN(is_constructible);
MIGRAPHX_BUILTIN_TYPE_TRAITN(is_nothrow_constructible);
MIGRAPHX_BUILTIN_TYPE_TRAITN(is_trivially_constructible);
template <class T>
struct remove_reference
{
using type = T;
};
template <class T>
struct remove_reference<T&>
{
using type = T;
};
template <class T>
struct remove_reference<T&&>
{
using type = T;
};
template <class T>
using remove_reference_t = typename remove_reference<T>::type;
template <class T>
struct add_pointer : type_identity<typename remove_reference<T>::type*>
{
};
template <class T>
using add_pointer_t = typename add_pointer<T>::type;
template <class... Ts>
struct common_type;
template <class T>
struct common_type<T>
{
using type = T;
};
template <class T, class U>
struct common_type<T, U>
{
using type = decltype(true ? declval<T>() : declval<U>());
};
template <class T, class U, class... Us>
struct common_type<T, U, Us...>
{
using type = typename common_type<typename common_type<T, U>::type, Us...>::type;
};
template <class... Ts>
using common_type_t = typename common_type<Ts...>::type;
// Largest signed value representable in n bytes; shift as unsigned long to
// avoid overflowing a 32-bit literal for n >= 4
constexpr unsigned long int_max(unsigned long n) { return (1ul << (n * 8 - 1)) - 1; }

template <class T>
constexpr T numeric_max()
{
    if constexpr(is_integral<T>{})
    {
        if constexpr(is_unsigned<T>{})
            return int_max(sizeof(T)) * 2 + 1;
        else
            return int_max(sizeof(T));
    }
}
else if constexpr(is_same<T, double>{})
return __DBL_MAX__;
else if constexpr(is_same<T, float>{})
return __FLT_MAX__;
else if constexpr(is_same<T, migraphx::half>{})
return __FLT16_MAX__;
else
return 0;
}
template <class T>
constexpr T numeric_lowest()
{
if constexpr(is_integral<T>{})
{
if constexpr(is_unsigned<T>{})
return 0;
else
return -numeric_max<T>() - 1;
}
else
{
return -numeric_max<T>();
}
}
#define MIGRAPHX_REQUIRES(...) class = enable_if_t<__VA_ARGS__>
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPES_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPES_HPP

#include <migraphx/kernels/hip.hpp>

namespace migraphx {

using index_int = std::uint32_t;
using diff_int  = std::int32_t;

#define MIGRAPHX_DEVICE_CONSTEXPR constexpr __device__ __host__ // NOLINT

template <class T, index_int N>
using vec = T __attribute__((ext_vector_type(N)));

using half  = _Float16;
using half2 = migraphx::vec<half, 2>;

} // namespace migraphx
#endif
@@ -3,6 +3,7 @@
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/integral_constant.hpp>
#include <migraphx/kernels/functional.hpp>

namespace migraphx {
@@ -13,7 +14,7 @@ constexpr auto vec_size(vec<T, N>)
}

template <class T>
constexpr auto vec_size(T, ...) // NOLINT
{
    return index_constant<0>{};
}
@@ -24,14 +25,140 @@ constexpr auto vec_size()
    return decltype(vec_size(T{})){};
}
template <class... Ts>
constexpr auto is_any_vec()
{
if constexpr(sizeof...(Ts) == 0)
return false_type{};
else
return bool_constant<((vec_size<Ts>() + ...) > 0)>{};
}
template <class T, class I>
constexpr auto vec_at(T x, I i)
{
if constexpr(vec_size<T>() == 0)
return x;
else
{
MIGRAPHX_ASSERT(i < vec_size<T>());
return x[i];
}
}
template <class T>
using vec_type = decltype(vec_at(T{}, 0));
template <class... Ts>
constexpr auto common_vec_size()
{
return fold([](auto x, auto y) {
if constexpr(x > y)
return x;
else
return y;
})(vec_size<Ts>()...);
}
// Bools cannot be used as a vector type, so convert them to uint8
template <class T>
__device__ __host__ T* remove_bool(T* x)
{
return x;
}
inline __device__ __host__ uint8_t* remove_bool(bool* x) { return reinterpret_cast<uint8_t*>(x); }
template <index_int N, class T> template <index_int N, class T>
__device__ __host__ auto as_vec(T* x) __device__ __host__ auto as_vec(T* x)
{ {
if constexpr(N == 0) if constexpr(N < 2)
return x; return x;
else else
return reinterpret_cast<vec<T, N>*>(x); return reinterpret_cast<vec<T, N>*>(x);
} }
template <class T, index_int N>
using safe_vec = vec<conditional_t<is_same<T, bool>{}, uint8_t, T>, N>;
template <class... Ts>
constexpr auto vec_transform(Ts... xs)
{
return [=](auto f) {
if constexpr(is_any_vec<Ts...>())
{
using type = decltype(f(vec_at(xs, 0)...));
constexpr auto size = common_vec_size<Ts...>();
safe_vec<type, size> result = {0};
for(int i = 0; i < size; i++)
result[i] = f(vec_at(xs, i)...);
return result;
}
else
{
return f(xs...);
}
};
}
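// Usage sketch (illustrative): vec_transform applies a scalar function across
// the lanes of its vector arguments and broadcasts any scalar arguments, e.g.
// a fused multiply-add of two vec<float, 4> values with a plain float bias:
__device__ inline vec<float, 4> fma_sketch(vec<float, 4> a, vec<float, 4> b, float c)
{
    return vec_transform(a, b, c)([](float x, float y, float z) { return x * y + z; });
}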
// Return a vector of size N taken from index i of a larger vector;
// N will be 2 for half2 packing
template <index_int N, class T, class I>
constexpr vec<vec_type<T>, N> vec_packed_at(T x, I i)
{
if constexpr(vec_size<T>() == 0)
return vec<T, N>{x};
else
{
        MIGRAPHX_ASSERT((i + N) <= vec_size<T>()); // allow reading the last full pack
vec<vec_type<T>, N> result = {0};
for(int j = 0; j < N; j++)
{
result[j] = x[i + j];
}
return result;
}
}
template <index_int N, class... Ts>
constexpr auto vec_packed_transform(Ts... xs)
{
return [=](auto f) {
if constexpr(is_any_vec<Ts...>())
{
using type = vec_type<decltype(f(vec_packed_at<N>(xs, 0)...))>;
constexpr auto size = common_vec_size<Ts...>();
safe_vec<type, size> result = {0};
for(int i = 0; i < size / N; i++)
{
// Call the function with packed vectors
safe_vec<type, N> r = f(vec_packed_at<N>(xs, i * N)...);
// Copy the packed vectors to the result
for(int j = 0; j < N; j++)
result[i * N + j] = r[j];
}
return result;
}
else
{
return f(xs...);
}
};
}
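// Usage sketch (illustrative): process a vec<half, 8> two lanes at a time so
// the callback sees half2 packs, e.g. to target packed f16 instructions:
__device__ inline vec<half, 8> square_packed_sketch(vec<half, 8> x)
{
    return vec_packed_transform<2>(x)([](half2 p) { return p * p; });
}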
template <class T, class Op>
constexpr auto vec_reduce(T x, Op op)
{
if constexpr(vec_size<T>() < 2)
return x;
else
{
vec_type<T> result = x[0];
for(int i = 1; i < vec_size<T>(); i++)
result = op(result, x[i]);
return result;
}
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_VEC_HPP
@@ -7,59 +7,71 @@
namespace migraphx {

template <class T>
constexpr auto tensor_vec_size()
{
    return vec_size<typename T::type>();
}

template <class T>
constexpr auto tensor_vec_size(T)
{
    return tensor_vec_size<T>();
}

template <index_int N, class Shape, class Axis>
constexpr auto shape_step(Shape s, Axis)
{
    static_assert(N > 0, "Vector size must be non-zero");
    return sequence(s.lens.size(), [&](auto... is) {
        auto lens = transform(s.lens, index_ints<is...>{}, [&](auto i, auto j) {
            constexpr auto axis = Axis::to();
            MIGRAPHX_ASSERT(i != 0);
            MIGRAPHX_ASSERT(j != axis or i % N == 0);
            if(j == axis)
                return i / N;
            else
                return i;
        });
        auto strides = transform(s.strides, index_ints<is...>{}, [&](auto i, auto j) {
            constexpr auto axis = Axis::to();
            // If stride of the axis is zero then we dont need to adjust the other strides
            if(Shape{}.strides[axis] == 0)
                return i;
            MIGRAPHX_ASSERT(j == axis or i % N == 0);
            if(j == axis)
                return i;
            else
                return i / N;
        });
        MIGRAPHX_ASSERT(make_shape(lens, strides).elements() * N == s.elements());
        MIGRAPHX_ASSERT(strides[Axis{}] == 0 or
                        make_shape(lens, strides).element_space() * N == s.element_space());
        return make_shape(lens, strides);
    });
}
template <index_int N, class T, class Axis>
__device__ __host__ auto as_vec(T x, Axis axis)
{
    if constexpr(N < 2)
        return x;
    else
        return make_tensor_view(as_vec<N>(remove_bool(x.data())),
                                shape_step<N>(x.get_shape(), axis));
}

template <index_int N, class T, class Axis>
constexpr auto tensor_step(T x, Axis axis)
{
    if constexpr(N < 2)
    {
        return x;
    }
    else
    {
        constexpr auto s = decltype(x.get_shape()){};
        MIGRAPHX_ASSERT(s.strides[axis] == 0);
        return make_tensor_view(x.data(), shape_step<N>(s, axis));
    }
}
@@ -69,55 +81,85 @@ __device__ __host__ auto as_vec(IntegralConstant ic, T&& x)
    return as_vec<ic>(x);
}

template <class Shape>
constexpr index_int find_vector_axis_c(Shape s)
{
    // Find the fastest axis that is not broadcasted
    index_int axis = 0;
    for(index_int i = 1; i < s.lens.size(); i++)
    {
        if(s.strides[i] == 0)
            continue;
        if(s.strides[axis] == 0 or
           pack_compare(less{}, pack(s.strides[i], s.lens[i]), pack(s.strides[axis], s.lens[axis])))
            axis = i;
    }
    return axis;
}

template <class... Shapes>
constexpr index_int find_vector_axis_c(Shapes... ss)
{
    const bool all_broadcasted = (ss.broadcasted() and ...);
    index_int axis             = 0;
    bool b                     = false;
    by([&](auto s) {
        if(b)
            return;
        // Skip broadcasted shapes if there are shapes not broadcasted
        if(not all_broadcasted and s.broadcasted())
            return;
        axis = find_vector_axis_c(s);
        if(s.strides[axis] == 1)
            b = true;
    })(ss...);
    if(not b)
        return -1;
    return axis;
}

template <class... Shapes>
constexpr auto find_vector_axis(Shapes...)
{
    return _c<find_vector_axis_c(Shapes{}...)>;
}

template <index_int N, class Axis, class... Shapes>
constexpr auto is_vectorizable_c(Axis axis, Shapes... ss)
{
    return ((axis < ss.lens.size() and ss.lens[axis] % N == 0 and
             // Only vectorize broadcasted types with stride 0, since this causes issues in the
             // preloader
             ((not ss.broadcasted() and ss.strides[axis] == 1) or ss.strides[axis] == 0)) and
            ...);
}

template <index_int N, class Axis, class... Shapes>
constexpr auto is_vectorizable(Axis, Shapes...)
{
    return _c<is_vectorizable_c<N>(Axis::to(), Shapes{}...)>;
}

template <class P>
constexpr auto find_vectorize_size(P pred)
{
    if constexpr(decltype(pred(_c<4>)){})
        return _c<4>;
    else if constexpr(decltype(pred(_c<2>)){})
        return _c<2>;
    else
        return _c<1>;
}

template <class T>
__host__ __device__ auto auto_vectorize(T x)
{
    if constexpr(tensor_vec_size<T>() == 0)
    {
        constexpr auto axis = find_vector_axis(x.get_shape());
        constexpr auto n =
            find_vectorize_size([&](auto i) { return is_vectorizable<i>(axis, x.get_shape()); });
        return as_vec<n>(x, axis);
    }
    else
    {
@@ -125,34 +167,73 @@ __host__ __device__ auto vectorize(T x)
    }
}
template <class F, class... Ts>
inline __device__ __host__ auto auto_vectorize_impl(F f, Ts... xs)
{
    // TODO: Just check there is a single axis with stride 1
constexpr bool packed_or_broadcasted =
((xs.get_shape().packed() or xs.get_shape().broadcasted()) and ...);
if constexpr(packed_or_broadcasted)
{
constexpr auto axis = decltype(find_vector_axis(xs.get_shape()...)){};
constexpr auto n = find_vectorize_size(
[&](auto i) { return is_vectorizable<i>(axis, xs.get_shape()...); });
by(
[&](auto x) {
constexpr auto s = decltype(x.get_shape()){};
if constexpr(axis < s.strides.size())
{
MIGRAPHX_ASSERT(s.strides[axis] == 0 or s.strides[axis] == 1);
MIGRAPHX_ASSERT(s.lens[axis] > 0);
MIGRAPHX_ASSERT(n == 1 or s.lens[axis] % n == 0);
if constexpr(s.strides[axis] == 0)
return tensor_step<n>(x, axis);
else
return as_vec<n>(x, axis);
}
else
{
return x;
}
},
f)(xs...);
}
else
{
f(xs...);
}
}
inline __device__ __host__ auto auto_vectorize()
{
    return make_transform([](auto f, auto... xs) { auto_vectorize_impl(f, xs...); });
}

template <index_int N, index_int Axis, class T>
__device__ __host__ auto vectorize_tensor(T x)
{
    constexpr auto shape = get_shape_c<T>{};
    if constexpr(shape.lens[Axis] == 1)
        return x;
    else if constexpr(shape.strides[Axis] == 0)
        return tensor_step<N>(x, _c<Axis>);
    else
        return as_vec<N>(x, _c<Axis>);
}

template <index_int N, index_int Axis>
__device__ __host__ auto vectorize()
{
    return make_transform([](auto f, auto... xs) {
        if constexpr(N < 2)
        {
            f(xs...);
        }
        else
        {
            f(vectorize_tensor<N, Axis>(xs)...);
        }
    });
}
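// Composition sketch (illustrative): the host-side code generator would pick N
// and Axis and compose this with the other transforms for a pointwise kernel,
// e.g. vectorizing by 4 along axis 0 (pointer names hypothetical):
//   pointwise(idx, vectorize<4, 0>())(op, x_ptr, y_ptr, out_ptr);
// vectorize_tensor rewrites each tensor_view argument, stepping broadcast
// (stride-0) axes and packing contiguous axes into vec<T, N> elements.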
} // namespace migraphx
#include <migraphx/run_loop.hpp>
#include <migraphx/gpu/loop.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/fill.hpp>
#include <unordered_map>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_loop::compute_shape(std::vector<shape> inputs, std::vector<module_ref> mods) const
{
auto input_num = (inputs.size() - 2) / 2;
inputs.erase(inputs.begin() + input_num, inputs.end());
return op.compute_shape(inputs, std::move(mods));
}
struct gpu_loop
{
int64_t max_iterations = 0;
template <class T>
void copy(context& ctx, const argument& src, T& dst) const
{
argument arg_dst{src.get_shape(), &dst};
copy_from_gpu(ctx, src, arg_dst);
}
template <class T>
void copy(context& ctx, T src, const argument& dst) const
{
argument arg_src{dst.get_shape(), &src};
copy_to_gpu(ctx, arg_src, dst);
}
void append(const std::vector<argument>&, const std::vector<argument>&, int) const {}
void set_zero(context& ctx, const std::vector<argument>& concatenated_outputs, int iter) const
{
if(iter >= max_iterations)
return;
auto elem_num = max_iterations - iter;
for(const auto& out : concatenated_outputs)
{
auto s = out.get_shape();
auto size = s.bytes() / max_iterations;
auto lens = s.lens();
lens[0] = elem_num;
shape ss{s.type(), lens};
assert(ss.bytes() + iter * size <= out.get_shape().bytes());
device::fill(ctx.get_stream().get(), argument(ss, out.data() + iter * size), 0);
}
}
std::unordered_map<std::string, int> get_output_params(const module& m) const
{
auto get_output_index = [](const std::string& name) {
std::string out_prefix = "#output_";
auto loc = name.find(out_prefix);
if(loc != std::string::npos)
{
int index = std::stoi(name.substr(loc + out_prefix.size()));
return index;
}
return -1;
};
const auto& param_names = m.get_parameter_names();
std::unordered_map<std::string, int> result;
for(const auto& name : param_names)
{
auto index = get_output_index(name);
if(index == -1)
continue;
result[name] = index;
}
return result;
}
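    // Example (illustrative): a sub-module parameter named "x#output_2" maps to
    // output index 2; parameters without the "#output_" tag are skipped.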
};
argument
hip_loop::compute(context& ctx,
const shape&,
const std::vector<argument>& args,
const std::vector<module_ref>& mods,
const std::function<std::vector<argument>(
module_ref&, const std::unordered_map<std::string, argument>&)>& run) const
{
return run_loop(gpu_loop{op.max_iterations}, ctx, args, mods, run);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <iterator>
#include <migraphx/gpu/lowering.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/instruction.hpp>
@@ -22,6 +23,7 @@
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/convolution.hpp>
#include <migraphx/gpu/deconvolution.hpp>
#include <migraphx/gpu/device_name.hpp>
#include <migraphx/gpu/elu.hpp>
#include <migraphx/gpu/equal.hpp>
#include <migraphx/gpu/gemm.hpp>
@@ -37,6 +39,8 @@
#include <migraphx/gpu/quant_convolution.hpp>
#include <migraphx/gpu/rocblas.hpp>
#include <migraphx/gpu/unary_not.hpp>
#include <migraphx/gpu/where.hpp>
#include <migraphx/gpu/compiler.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/program.hpp>
#include <utility>
@@ -57,6 +61,7 @@ struct miopen_apply
    std::unordered_map<instruction_ref, std::string> prog_output_names{};
    bool offload_copy   = false;
    bool int8_x4_format = true;
    bool compute_fp32   = false;

    context& get_context() const
    {
@@ -93,13 +98,22 @@ struct miopen_apply
        }
    }

    const std::unordered_set<std::string>& get_rocblas_fp32_archs()
    {
        static std::unordered_set<std::string> supported_archs{"gfx908", "gfx90a"};
        return supported_archs;
    }

    void init()
    {
        assert(mod != nullptr);
        assert(pass != nullptr);

#if ROCBLAS_VERSION_MAJOR >= 2 && ROCBLAS_VERSION_MINOR >= 38
        auto& ctx              = get_context();
        const auto device_name = trim(split_string(get_device_name(), ':').front());
        if(contains(get_rocblas_fp32_archs(), device_name))
            compute_fp32 = true;

        rocblas_gemm_flags flag;
        rocblas_query_int8_layout_flag(ctx.get_stream().get_rocblas(), &flag);
        int8_x4_format = (flag == rocblas_gemm_flags_pack_int8x4);
@@ -149,6 +163,7 @@ struct miopen_apply
        add_generic_op("sub");
        add_generic_op("tan");
        add_generic_op("tanh");
        add_generic_op("where");

        add_extend_op("abs");
        add_extend_op("argmax");
@@ -161,29 +176,29 @@ struct miopen_apply
        add_extend_op("leaky_relu");
        add_extend_op("logsoftmax");
        add_extend_op("lrn");
        add_extend_op("multinomial");
        add_extend_op("nonzero");
        add_extend_op("pad");
        add_extend_op("pooling");
        add_extend_op("prefix_scan_sum");
        add_extend_op("reduce_max");
        add_extend_op("reduce_mean");
        add_extend_op("reduce_min");
        add_extend_op("reduce_prod");
        add_extend_op("reduce_sum");
        add_extend_op("reverse");
        add_extend_op("rnn_var_sl_last_output");
        add_extend_op("rnn_var_sl_shift_output");
        add_extend_op("rnn_var_sl_shift_sequence");
        add_extend_op("scatter_none");
        add_extend_op("softmax");
        add_extend_op("topk");

        add_batch_norm_inference_op();
        add_convolution_op();
        add_deconvolution_op();
        add_gemm_op<op::dot>("dot");
        add_gemm_op<op::quant_dot>("quant_dot");
        add_if_op();
        add_loop_op();
        add_neg_op();
        add_nms_op();
        add_quant_convolution_op();
    }

    void copy_params()
@@ -196,6 +211,10 @@ struct miopen_apply
            if(ins->name() != "@param")
                continue;
            // parameter has no outputs, so there is no need to insert a copy to gpu
if(ins->outputs().empty())
continue;
            auto pos = std::next(ins);
            auto a   = insert_allocation(pos, ins->get_shape());
            auto c   = mod->insert_instruction(pos, make_op("hip::copy_to_gpu"), ins, a);
@@ -233,11 +252,28 @@ struct miopen_apply
            {
                check_shape(s, apply_map.at(it->name())(it));
            }
else if(has_compiler_for(it->name()))
{
check_shape(s, insert_precompile_op(it));
}
        }
        copy_params();
    }
instruction_ref insert_precompile_op(instruction_ref ins)
{
auto output = insert_allocation(ins, ins->get_shape());
std::vector<instruction_ref> refs = ins->inputs();
refs.push_back(output);
return mod->replace_instruction(
ins,
make_op("gpu::precompile_op", {{"op", to_value(ins->get_operator())}}),
refs,
ins->module_inputs());
}
    instruction_ref insert_allocation(instruction_ref ins, const shape& s, std::string tag = "")
    {
        // Instruction's output is an input of the ret instruction
@@ -294,17 +330,14 @@ struct miopen_apply
        });
    }
    template <typename Op>
    void add_gemm_op(const std::string& name)
    {
        apply_map.emplace(name, [=](instruction_ref ins) {
            std::vector<instruction_ref> refs = ins->inputs();
            if(refs.size() == 2)
            {
                auto output = insert_allocation(ins, ins->get_shape());
                refs.push_back(output);
            }
            else
@@ -323,9 +356,8 @@ struct miopen_apply
                    refs.push_back(refs.back());
                }
            }
            return mod->replace_instruction(
                ins, rocblas_gemm<Op>{Op{}, 1, 0, int8_x4_format, compute_fp32}, refs);
        });
    }
@@ -333,8 +365,22 @@ struct miopen_apply
    {
        apply_map.emplace("quant_convolution", [=](instruction_ref ins) {
            auto&& op = any_cast<op::quant_convolution>(ins->get_operator());
            shape ws;
            miopen_quant_convolution conv;
auto compile_quant_conv_with_format = [&](bool format) {
conv = miopen_quant_convolution{op, format, make_conv(op)};
ws = conv.compile(get_context(), ins->get_shape(), to_shapes(ins->inputs()));
};
try
{
compile_quant_conv_with_format(int8_x4_format);
}
catch(migraphx::exception&)
{
// In case no solver supports the default format, retry using the other format.
compile_quant_conv_with_format(!int8_x4_format);
}
            auto args      = ins->inputs();
            auto workspace = insert_allocation(ins, ws, "workspace");
@@ -344,6 +390,9 @@ struct miopen_apply
        });
    }
    // add_generic_op constructs the operator default-constructed, since it has no
    // fields, whereas add_extend_op copies the fields over from the original operator
void add_generic_op(const std::string& name) { add_generic_op(name, "gpu::" + name); } void add_generic_op(const std::string& name) { add_generic_op(name, "gpu::" + name); }
void add_generic_op(const std::string& op_name, const std::string& gpu_name) void add_generic_op(const std::string& op_name, const std::string& gpu_name)
...@@ -405,7 +454,6 @@ struct miopen_apply ...@@ -405,7 +454,6 @@ struct miopen_apply
reshapes[2], reshapes[2],
reshapes[3], reshapes[3],
output); output);
}); });
} }
...@@ -422,7 +470,7 @@ struct miopen_apply ...@@ -422,7 +470,7 @@ struct miopen_apply
}); });
} }
// replace the if operator with gpu_if operator // add input and output arguments for the if operator
void add_if_op() void add_if_op()
{ {
apply_map.emplace("if", [=](instruction_ref ins) { apply_map.emplace("if", [=](instruction_ref ins) {
...@@ -461,9 +509,68 @@ struct miopen_apply ...@@ -461,9 +509,68 @@ struct miopen_apply
return mod->replace_instruction(ins, ins->get_operator(), inputs, mod_args); return mod->replace_instruction(ins, ins->get_operator(), inputs, mod_args);
}); });
} }
// replace the loop operator with the gpu::loop operator
void add_loop_op()
{
apply_map.emplace("loop", [=](instruction_ref ins) {
std::vector<instruction_ref> inputs = ins->inputs();
// copy max_iter and the condition from gpu to cpu
auto cpu_max_iter =
mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), inputs.at(0));
auto cpu_cond =
mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), inputs.at(1));
auto synced_max_iter =
mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_max_iter, cpu_cond);
inputs.at(0) = synced_max_iter;
inputs.at(1) = cpu_cond;
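// append an allocation matching each input's shape to hold the loop's carried values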
auto copy_inputs = inputs;
std::transform(
copy_inputs.begin(), copy_inputs.end(), std::back_inserter(inputs), [&](auto in) {
return mod->insert_instruction(
ins, make_op("hip::allocate", {{"shape", to_value(in->get_shape())}}));
});
auto mod_args = ins->module_inputs();
auto output = insert_allocation(ins, ins->get_shape());
const auto* sub_mod = mod_args.front();
auto cond_out = mod->insert_instruction(
ins,
make_op("hip::allocate",
{{"shape", to_value(sub_mod->get_output_shapes().front())}}));
// add cond and mod outputs to the argument list
inputs.push_back(cond_out);
inputs.push_back(output);
return mod->replace_instruction(
ins, make_op("gpu::loop", ins->get_operator().to_value()), inputs, mod_args);
});
}
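// run nonmaxsuppression on the host: copy the inputs from the gpu, compute with
// the reference op, then copy the result back into the output allocation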
void add_nms_op()
{
apply_map.emplace("nonmaxsuppression", [=](instruction_ref ins) {
auto s = ins->get_shape();
auto output = insert_allocation(ins, s);
std::vector<instruction_ref> cpu_inputs;
auto inputs = ins->inputs();
std::transform(
inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) {
return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in);
});
cpu_inputs.front() =
mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs);
auto cpu_out = mod->insert_instruction(ins, ins->get_operator(), cpu_inputs);
auto gpu_out =
mod->insert_instruction(ins, make_op("hip::copy_to_gpu"), cpu_out, output);
return mod->replace_instruction(ins, gpu_out);
});
}
}; };
void lowering::apply(module& m) const { miopen_apply{&m, this}.apply(); } void lowering::apply(module& m) const { miopen_apply{&m, this}.apply(); }
} // namespace gpu } // namespace gpu
} // namespace MIGRAPHX_INLINE_NS } // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx } // namespace migraphx
#include <migraphx/gpu/multinomial.hpp>
#include <migraphx/gpu/device/multinomial.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/tune_axis.hpp>
#include <migraphx/check_shapes.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_multinomial::compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(3).only_dims(2).standard();
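// the last input is the preallocated output buffer; drop it before computing the shape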
inputs.pop_back();
return op.compute_shape(inputs);
}
argument
hip_multinomial::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
device::multinomial(ctx.get_stream().get(), args.back(), args.front(), args[1]);
return args.back();
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/nonzero.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/nonzero.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_nonzero::compute_shape(std::vector<shape> inputs) const
{
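// only the first input (the data tensor) determines the output shape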
return op.compute_shape({inputs.front()});
}
argument hip_nonzero::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
return device::nonzero(ctx.get_stream().get(), args.back(), args.front());
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
...@@ -22,10 +22,10 @@ static instruction_ref pad_ins(module& m, instruction_ref ins, int offset) ...@@ -22,10 +22,10 @@ static instruction_ref pad_ins(module& m, instruction_ref ins, int offset)
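// round k up to the next multiple of 4, as required by the int8x4 packed layout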
auto pad_k = (k + 3) / 4 * 4; auto pad_k = (k + 3) / 4 * 4;
auto pad_lens = lens; auto pad_lens = lens;
pad_lens[lens.size() + offset] = pad_k; pad_lens[lens.size() + offset] = pad_k;
std::vector<int64_t> pad_dims(lens.size() * 2, 0); auto ret_ins = ins;
auto ret_ins = ins;
if(pad_k != k) if(pad_k != k)
{ {
std::vector<int64_t> pad_dims(lens.size() * 2, 0);
pad_dims[lens.size() + offset] = pad_k - k; pad_dims[lens.size() + offset] = pad_k - k;
shape ps{s.type(), pad_lens}; shape ps{s.type(), pad_lens};
auto ins_out = auto ins_out =
...@@ -55,7 +55,8 @@ static std::vector<instruction_ref> pad_inputs(module& m, instruction_ref ins) ...@@ -55,7 +55,8 @@ static std::vector<instruction_ref> pad_inputs(module& m, instruction_ref ins)
auto t_in = in0->inputs().front(); auto t_in = in0->inputs().front();
auto p_in = pad_ins(m, t_in, offset); auto p_in = pad_ins(m, t_in, offset);
auto dims = val.at("dims").to_vector<int64_t>(); auto dims = val.at("dims").to_vector<int64_t>();
auto r_in = m.insert_instruction(ins, make_op("transpose", {{"dims", dims}}), p_in); auto r_in =
m.insert_instruction(ins, make_op("transpose", {{"permutation", dims}}), p_in);
ret_inputs.push_back(r_in); ret_inputs.push_back(r_in);
} }
else else
...@@ -85,7 +86,8 @@ static std::vector<instruction_ref> pad_inputs(module& m, instruction_ref ins) ...@@ -85,7 +86,8 @@ static std::vector<instruction_ref> pad_inputs(module& m, instruction_ref ins)
auto t_in = in1->inputs().front(); auto t_in = in1->inputs().front();
auto p_in = pad_ins(m, t_in, offset); auto p_in = pad_ins(m, t_in, offset);
auto dims = val.at("dims").to_vector<int64_t>(); auto dims = val.at("dims").to_vector<int64_t>();
auto r_in = m.insert_instruction(ins, make_op("transpose", {{"dims", dims}}), p_in); auto r_in =
m.insert_instruction(ins, make_op("transpose", {{"permutation", dims}}), p_in);
ret_inputs.push_back(r_in); ret_inputs.push_back(r_in);
} }
else else
...@@ -116,7 +118,7 @@ void pack_int8_args::apply(module& m) const ...@@ -116,7 +118,7 @@ void pack_int8_args::apply(module& m) const
assert(val.contains("int8_x4_format")); assert(val.contains("int8_x4_format"));
if(not val.at("int8_x4_format").to<bool>()) if(not val.at("int8_x4_format").to<bool>())
{ {
return; continue;
} }
auto inputs = ins->inputs(); auto inputs = ins->inputs();
auto lens = inputs.at(0)->get_shape().lens(); auto lens = inputs.at(0)->get_shape().lens();
...@@ -154,6 +156,12 @@ void pack_int8_args::apply(module& m) const ...@@ -154,6 +156,12 @@ void pack_int8_args::apply(module& m) const
} }
else if(ins->name() == "gpu::quant_convolution") else if(ins->name() == "gpu::quant_convolution")
{ {
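// skip packing when this convolution does not use the int8x4 format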
auto val = ins->get_operator().to_value();
if(not val.at("int8_x4_format").to<bool>())
{
continue;
}
auto inputs = ins->inputs(); auto inputs = ins->inputs();
auto packed_x = m.insert_instruction( auto packed_x = m.insert_instruction(
ins, ins,
......
#include <migraphx/gpu/prefuse_ops.hpp>
#include <migraphx/match/layernorm.hpp>
#include <migraphx/make_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace {
struct find_layernorm
{
auto matcher() const { return match::layernorm(); }
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto x_ins = r.instructions["x"];
if(not x_ins->get_shape().standard())
x_ins = m.insert_instruction(ins, make_op("contiguous"), x_ins);
auto relements = x_ins->get_shape().lens().back();
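// only fuse small reductions: at most 1024 elements, or 256 when not a multiple of 4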
if(relements > 1024 or (relements % 4 != 0 and relements > 256))
return;
auto a = m.insert_instruction(
ins, make_op("hip::allocate", {{"shape", to_value(x_ins->get_shape())}}));
m.replace_instruction(ins, make_op("gpu::layernorm"), x_ins, a);
}
};
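// fuse a chain of two adds feeding a layernorm into a single gpu::triadd_layernorm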
struct find_triaddlayernorm
{
auto matcher() const
{
auto add1 =
match::name("add")(match::none_of(match::is_constant()),
match::args(match::any().bind("z1"), match::any().bind("z2")));
auto add2 = match::name("add")(match::either_arg(0, 1)(add1, match::any().bind("z3")));
return match::layernorm()(match::var("x")(add2));
}
void apply(module& m, const match::matcher_result& r) const
{
auto ins = r.result;
auto x_ins = r.instructions["z1"];
auto y_ins = r.instructions["z2"];
auto z_ins = r.instructions["z3"];
for(auto* pins : {&x_ins, &y_ins, &z_ins})
{
if(not(*pins)->get_shape().standard())
*pins = m.insert_instruction(ins, make_op("contiguous"), *pins);
}
auto relements = x_ins->get_shape().lens().back();
if(relements > 1024 or (relements % 4 != 0 and relements > 256))
return;
auto a = m.insert_instruction(
ins, make_op("hip::allocate", {{"shape", to_value(x_ins->get_shape())}}));
m.replace_instruction(ins, make_op("gpu::triadd_layernorm"), x_ins, y_ins, z_ins, a);
}
};
} // namespace
void prefuse_ops::apply(module& m) const
{
match::find_matches(m, find_triaddlayernorm{}, find_layernorm{});
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
...@@ -16,8 +16,8 @@ argument miopen_quant_convolution::compute(context& ctx, ...@@ -16,8 +16,8 @@ argument miopen_quant_convolution::compute(context& ctx,
const shape& output_shape, const shape& output_shape,
const std::vector<argument>& args) const const std::vector<argument>& args) const
{ {
auto x_desc = make_tensor(args[0].get_shape(), true); auto x_desc = make_tensor(args[0].get_shape(), int8_x4_format);
auto w_desc = make_tensor(args[1].get_shape(), true); auto w_desc = make_tensor(args[1].get_shape(), int8_x4_format);
auto y_desc = make_tensor(output_shape); auto y_desc = make_tensor(output_shape);
float alpha = 1; float alpha = 1;
...@@ -49,8 +49,8 @@ shape miopen_quant_convolution::compile(context& ctx, ...@@ -49,8 +49,8 @@ shape miopen_quant_convolution::compile(context& ctx,
std::vector<shape> inputs) std::vector<shape> inputs)
{ {
shape workspace_shape{}; shape workspace_shape{};
auto x_desc = make_tensor(inputs[0], true); auto x_desc = make_tensor(inputs[0], int8_x4_format);
auto w_desc = make_tensor(inputs[1], true); auto w_desc = make_tensor(inputs[1], int8_x4_format);
auto y_desc = make_tensor(output_shape); auto y_desc = make_tensor(output_shape);
std::size_t workspace_size = 0; std::size_t workspace_size = 0;
...@@ -62,8 +62,15 @@ shape miopen_quant_convolution::compile(context& ctx, ...@@ -62,8 +62,15 @@ shape miopen_quant_convolution::compile(context& ctx,
&workspace_size); &workspace_size);
workspace_shape = shape{shape::int8_type, {workspace_size}}; workspace_shape = shape{shape::int8_type, {workspace_size}};
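// pack the input shapes only when the int8x4 format is in use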
auto arg_vec4_x = to_gpu(generate_argument(pack_int8_shape(inputs[0]))); auto x_shape = inputs[0];
auto arg_vec4_w = to_gpu(generate_argument(pack_int8_shape(inputs[1]))); auto w_shape = inputs[1];
if(int8_x4_format)
{
x_shape = pack_int8_shape(x_shape);
w_shape = pack_int8_shape(w_shape);
}
auto arg_vec4_x = to_gpu(generate_argument(x_shape));
auto arg_vec4_w = to_gpu(generate_argument(w_shape));
auto y = allocate_gpu(output_shape); auto y = allocate_gpu(output_shape);
auto workspace = allocate_gpu(workspace_shape); auto workspace = allocate_gpu(workspace_shape);
......
...@@ -77,28 +77,28 @@ MIGRAPHX_REGISTER_OP(wait_event) ...@@ -77,28 +77,28 @@ MIGRAPHX_REGISTER_OP(wait_event)
MIGRAPHX_REGISTER_OP(set_stream) MIGRAPHX_REGISTER_OP(set_stream)
std::size_t schedule_model::concurrency() const { return streams; } std::size_t schedule_model::concurrency() const { return streams; }
void schedule_model::sched(module& p, instruction_ref ins, std::size_t n) const void schedule_model::sched(module& m, instruction_ref ins, std::size_t n) const
{ {
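// walk backwards from ins to find the most recent set_stream instruction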
auto last_stream = std::find_if(std::make_reverse_iterator(ins), auto last_stream = std::find_if(std::make_reverse_iterator(ins),
std::make_reverse_iterator(p.begin()), std::make_reverse_iterator(m.begin()),
[&](auto&& i) { return i.name() == "gpu::set_stream"; }); [&](auto&& i) { return i.name() == "gpu::set_stream"; });
if(last_stream != std::make_reverse_iterator(p.begin())) if(last_stream != std::make_reverse_iterator(m.begin()))
{ {
auto&& op = any_cast<set_stream>(last_stream->get_operator()); auto&& op = any_cast<set_stream>(last_stream->get_operator());
// If the same stream was set earlier then skip // If the same stream was set earlier then skip
if(op.stream == n) if(op.stream == n)
return; return;
} }
p.insert_instruction(ins, set_stream{n}); m.insert_instruction(ins, set_stream{n});
} }
void schedule_model::wait(module& p, instruction_ref ins, std::size_t wait_id) const void schedule_model::wait(module& m, instruction_ref ins, std::size_t wait_id) const
{ {
p.insert_instruction(ins, wait_event{wait_id}); m.insert_instruction(ins, wait_event{wait_id});
} }
void schedule_model::record(module& p, instruction_ref ins, std::size_t wait_id) const void schedule_model::record(module& m, instruction_ref ins, std::size_t wait_id) const
{ {
p.insert_instruction(std::next(ins), record_event{wait_id}); m.insert_instruction(std::next(ins), record_event{wait_id});
} }
static std::unordered_map<std::string, std::size_t> create_weight_map() static std::unordered_map<std::string, std::size_t> create_weight_map()
......