Commit 7dc6e3ae authored by Khalique Ahmed's avatar Khalique Ahmed
Browse files

Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX into mi100_opts

parents f94d77fc a275f590
...@@ -2,57 +2,26 @@ ...@@ -2,57 +2,26 @@
#define MIGRAPHX_GUARD_KERNELS_ARGS_HPP #define MIGRAPHX_GUARD_KERNELS_ARGS_HPP
#include <migraphx/kernels/types.hpp> #include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/functional.hpp>
namespace migraphx { namespace migraphx {
template <std::size_t N>
struct arg
{
};
template <std::size_t...>
struct seq
{
using type = seq;
};
template <class, class>
struct merge_seq;
template <std::size_t... Xs, std::size_t... Ys>
struct merge_seq<seq<Xs...>, seq<Ys...>> : seq<Xs..., (sizeof...(Xs) + Ys)...>
{
};
template <std::size_t N>
struct gens : merge_seq<typename gens<N / 2>::type, typename gens<N - N / 2>::type>
{
};
template <>
struct gens<0> : seq<>
{
};
template <>
struct gens<1> : seq<0>
{
};
// Use template specialization since ADL is broken on hcc // Use template specialization since ADL is broken on hcc
template <std::size_t> template <index_int>
struct make_tensor; struct make_tensor;
template <class F, std::size_t... Ns, class... Ts> template <class F, index_int... Ns, class... Ts>
__device__ auto make_tensors_impl(F f, seq<Ns...>, Ts*... xs) __device__ auto make_tensors_impl(F f, detail::seq<Ns...>, Ts*... xs)
{ {
f(make_tensor<Ns>::apply(xs)...); return f(make_tensor<Ns>::apply(xs)...);
} }
template <class... Ts> inline __device__ auto make_tensors()
__device__ auto make_tensors(Ts*... xs)
{ {
return [=](auto f) { make_tensors_impl(f, gens<sizeof...(Ts)>{}, xs...); }; return [](auto*... xs) {
return [=](auto f) { return make_tensors_impl(f, detail::gens<sizeof...(xs)>{}, xs...); };
};
} }
} // namespace migraphx } // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_ARGS_HPP #endif // MIGRAPHX_GUARD_KERNELS_ARGS_HPP
\ No newline at end of file
...@@ -2,7 +2,8 @@ ...@@ -2,7 +2,8 @@
#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_ARRAY_HPP #define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_ARRAY_HPP
#include <migraphx/kernels/types.hpp> #include <migraphx/kernels/types.hpp>
#include <type_traits> #include <migraphx/kernels/integral_constant.hpp>
#include <migraphx/kernels/debug.hpp>
namespace migraphx { namespace migraphx {
...@@ -41,8 +42,16 @@ template <class T, index_int N> ...@@ -41,8 +42,16 @@ template <class T, index_int N>
struct array struct array
{ {
T d[N]; T d[N];
constexpr T& operator[](index_int i) { return d[i]; } constexpr T& operator[](index_int i)
constexpr const T& operator[](index_int i) const { return d[i]; } {
MIGRAPHX_ASSERT(i < N);
return d[i];
}
constexpr const T& operator[](index_int i) const
{
MIGRAPHX_ASSERT(i < N);
return d[i];
}
constexpr T& front() { return d[0]; } constexpr T& front() { return d[0]; }
constexpr const T& front() const { return d[0]; } constexpr const T& front() const { return d[0]; }
...@@ -53,7 +62,7 @@ struct array ...@@ -53,7 +62,7 @@ struct array
constexpr T* data() { return d; } constexpr T* data() { return d; }
constexpr const T* data() const { return d; } constexpr const T* data() const { return d; }
constexpr std::integral_constant<index_int, N> size() const { return {}; } constexpr index_constant<N> size() const { return {}; }
constexpr T* begin() { return d; } constexpr T* begin() { return d; }
constexpr const T* begin() const { return d; } constexpr const T* begin() const { return d; }
...@@ -142,6 +151,18 @@ struct array ...@@ -142,6 +151,18 @@ struct array
result[0] += overflow; result[0] += overflow;
return result; return result;
} }
template <class Stream>
friend constexpr const Stream& operator<<(const Stream& ss, const array& a)
{
for(index_int i = 0; i < N; i++)
{
if(i > 0)
ss << ", ";
ss << a[i];
}
return ss;
}
}; };
template <class T, T... xs> template <class T, T... xs>
...@@ -151,6 +172,18 @@ struct integral_const_array : array<T, sizeof...(xs)> ...@@ -151,6 +172,18 @@ struct integral_const_array : array<T, sizeof...(xs)>
MIGRAPHX_DEVICE_CONSTEXPR integral_const_array() : base_array({xs...}) {} MIGRAPHX_DEVICE_CONSTEXPR integral_const_array() : base_array({xs...}) {}
}; };
template <class T, T... xs, class F>
constexpr auto transform(integral_const_array<T, xs...>, F f)
{
return integral_const_array<T, f(xs)...>{};
}
template <class T, T... xs, class U, U... ys, class F>
constexpr auto transform(integral_const_array<T, xs...>, integral_const_array<U, ys...>, F f)
{
return integral_const_array<T, f(xs, ys)...>{};
}
template <index_int... Ns> template <index_int... Ns>
using index_ints = integral_const_array<index_int, Ns...>; using index_ints = integral_const_array<index_int, Ns...>;
......
#ifndef MIGRAPHX_GUARD_KERNELS_DEBUG_HPP
#define MIGRAPHX_GUARD_KERNELS_DEBUG_HPP
#include <hip/hip_runtime.h>
namespace migraphx {
// Prints the failed assertion with its source location and enclosing
// function, then aborts. Callable from both host and device code; printf
// and abort are provided by the HIP runtime on device.
inline __host__ __device__ void
assert_fail(const char* assertion, const char* file, unsigned int line, const char* function)
{
    printf("%s:%u: %s: assertion '%s' failed.\n", file, line, function, assertion);
    abort();
}
#ifdef MIGRAPHX_DEBUG
// Evaluates cond; on failure reports the stringized condition, source
// location and function, then aborts. The immediately-invoked lambda keeps
// the whole macro a single expression.
#define MIGRAPHX_ASSERT(cond) \
    ((cond) ? void(0) : [](auto... xs) { \
        assert_fail(xs...); \
    }(#cond, __FILE__, __LINE__, __PRETTY_FUNCTION__))
#else
// Expand to a no-op *expression* rather than nothing, so the macro stays
// valid in expression contexts (e.g. comma expressions, ternaries) and
// `MIGRAPHX_ASSERT(x);` never leaves a bare `;`.
#define MIGRAPHX_ASSERT(cond) ((void)0)
#endif
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_DEBUG_HPP
#ifndef MIGRAPHX_GUARD_KERNELS_FUNCTIONAL_HPP
#define MIGRAPHX_GUARD_KERNELS_FUNCTIONAL_HPP
#include <migraphx/kernels/array.hpp>
namespace migraphx {
// A type constructible from any argument list; used to expand a parameter
// pack purely for its side effects (see each_args).
struct swallow
{
    template <class... Ts>
    constexpr swallow(Ts&&...)
    {
    }
};
// ignore<N> accepts and discards an argument; the index parameter exists
// only so a pack expansion can generate one per element (see detail::args_at).
template <index_int>
using ignore = swallow;
namespace detail {
// Wraps the invocation of f(xs...) so that callers can uniformly read a
// `result` member even when f returns void (see by()).
template <class R>
struct eval_helper
{
    R result;
    template <class F, class... Ts>
    constexpr eval_helper(const F& f, Ts&&... xs) : result(f(static_cast<Ts>(xs)...))
    {
    }
};
// Specialization for void-returning f: evaluate for side effects and store
// a dummy int so `result` still exists.
template <>
struct eval_helper<void>
{
    int result;
    template <class F, class... Ts>
    constexpr eval_helper(const F& f, Ts&&... xs) : result((f(static_cast<Ts>(xs)...), 0))
    {
    }
};
// Compile-time integer sequence; a minimal stand-in for std::index_sequence,
// redefined because kernel code avoids host standard-library headers.
template <index_int...>
struct seq
{
    using type = seq;
};
// Concatenates two sequences, offsetting the second by the length of the
// first; this lets gens build 0..N-1 by halving.
template <class, class>
struct merge_seq;
template <index_int... Xs, index_int... Ys>
struct merge_seq<seq<Xs...>, seq<Ys...>> : seq<Xs..., (sizeof...(Xs) + Ys)...>
{
};
// gens<N> yields seq<0, 1, ..., N-1> with O(log N) instantiation depth.
template <index_int N>
struct gens : merge_seq<typename gens<N / 2>::type, typename gens<N - N / 2>::type>
{
};
template <>
struct gens<0> : seq<>
{
};
template <>
struct gens<1> : seq<0>
{
};
// Invokes f with index_constant<0>...index_constant<N-1> as arguments.
template <class F, index_int... Ns>
constexpr auto sequence_c_impl(F&& f, seq<Ns...>)
{
    return f(index_constant<Ns>{}...);
}
// Returns a callable that selects the argument at position sizeof...(N):
// the leading ignore<N>... parameters swallow the preceding arguments.
template <index_int... N>
constexpr auto args_at(seq<N...>)
{
    return [](ignore<N>..., auto x, auto...) { return x; };
}
} // namespace detail
// always(x) produces a function object that ignores any arguments passed to
// it and returns the captured value x unchanged.
template <class T>
constexpr auto always(T x)
{
    auto constant_fn = [x](auto&&...) { return x; };
    return constant_fn;
}
// Calls f with index_constant<0>..index_constant<N-1>; a compile-time
// "loop" over N positions.
template <index_int N, class F>
constexpr auto sequence_c(F&& f)
{
    return detail::sequence_c_impl(f, detail::gens<N>{});
}
// Same as sequence_c, taking the count as an integral-constant value.
template <class IntegerConstant, class F>
constexpr auto sequence(IntegerConstant ic, F&& f)
{
    return sequence_c<ic>(f);
}
// by(f, g) returns a callable that applies f to each argument and then g to
// the transformed pack; eval_helper lets this compile when g returns void.
template <class F, class G>
constexpr auto by(F f, G g)
{
    return [=](auto... xs) {
        return detail::eval_helper<decltype(g(f(xs)...))>{g, f(xs)...}.result;
    };
}
// by(f) applies f to each argument purely for its side effects.
template <class F>
constexpr auto by(F f)
{
    return by([=](auto x) { return (f(x), 0); }, always(0));
}
// Applies f to each argument in turn (left to right), discarding results.
// Forwarding is done with static_cast<Ts&&> instead of std::forward to stay
// consistent with MIGRAPHX_LIFT below and to avoid relying on <utility>,
// which this kernel header never includes.
template <class F, class... Ts>
constexpr void each_args(F f, Ts&&... xs)
{
    swallow{(f(static_cast<Ts&&>(xs)), 0)...};
}
// Base case: no arguments, nothing to do.
template <class F>
constexpr void each_args(F)
{
}
// Captures xs by value and returns a callable that passes them on to a
// continuation: pack(xs...)(f) evaluates f(xs...).
// Marked constexpr for consistency with every other combinator in this file.
template <class... Ts>
constexpr auto pack(Ts... xs)
{
    return [=](auto f) { return f(xs...); };
}
// arg_c<N>() returns a callable selecting its N-th argument (0-based).
template <index_int N>
constexpr auto arg_c()
{
    return [](auto... xs) { return detail::args_at(detail::gens<N>{})(xs...); };
}
// Same as arg_c, taking the index as an integral-constant value.
template <class IntegralConstant>
constexpr auto arg(IntegralConstant ic)
{
    return arg_c<ic>();
}
// Returns a callable that rotates its argument list right by one — the last
// argument becomes first — before handing the pack to a continuation:
// rotate_last()(x1, ..., xn)(f) -> f(xn, x1, ..., xn-1).
// Used by pointwise, where the output buffer is passed last but consumed first.
inline constexpr auto rotate_last()
{
    return [](auto... xs) {
        return [=](auto&& f) {
            return sequence_c<sizeof...(xs)>([&](auto... is) {
                constexpr auto size = sizeof...(is);
                // Position i takes argument (i + size - 1) % size.
                return f(arg_c<(is + size - 1) % size>()(xs...)...);
            });
        };
    };
}
// Composes argument transformations: transform_args(f)(xs...)(g) applies the
// transformation f to the pack and forwards the transformed pack to g.
template <class F>
constexpr auto transform_args(F f)
{
    return [=](auto... xs) {
        return [=](auto g) { return f(xs...)([&](auto... ys) { return g(ys...); }); };
    };
}
// Chains several transformations, applied left to right.
template <class F, class... Fs>
constexpr auto transform_args(F f, Fs... fs)
{
    return [=](auto... xs) { return transform_args(f)(xs...)(transform_args(fs...)); };
}
// Wraps an overload set or function template in a lambda so it can be
// passed around as an ordinary value.
#define MIGRAPHX_LIFT(...) \
    ([](auto&&... xs) { return (__VA_ARGS__)(static_cast<decltype(xs)>(xs)...); })
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_FUNCTIONAL_HPP
...@@ -12,9 +12,43 @@ struct index ...@@ -12,9 +12,43 @@ struct index
index_int local = 0; index_int local = 0;
index_int group = 0; index_int group = 0;
__device__ index_int nglobal() const { return blockDim.x * gridDim.x; } // NOLINT __device__ index_int nglobal() const
{
#ifdef MIGRAPHX_NGLOBAL
return MIGRAPHX_NGLOBAL;
#else
return blockDim.x * gridDim.x;
#endif
}
__device__ index_int nlocal() const { return blockDim.x; } // NOLINT __device__ index_int nlocal() const
{
#ifdef MIGRAPHX_NLOCAL
return MIGRAPHX_NLOCAL;
#else
return blockDim.x;
#endif
}
template <class F>
__device__ void global_stride(index_int n, F f) const
{
const auto stride = nglobal();
for(index_int i = global; i < n; i += stride)
{
f(i);
}
}
template <class F>
__device__ void local_stride(index_int n, F f) const
{
const auto stride = nlocal();
for(index_int i = local; i < n; i += stride)
{
f(i);
}
}
}; };
inline __device__ index make_index() inline __device__ index make_index()
......
#ifndef MIGRAPHX_GUARD_KERNELS_INTEGRAL_CONSTANT_HPP
#define MIGRAPHX_GUARD_KERNELS_INTEGRAL_CONSTANT_HPP
#include <migraphx/kernels/types.hpp>
namespace migraphx {
// Compile-time constant of type T with value v; mirrors
// std::integral_constant, redefined here because kernel code avoids host
// standard-library headers.
template <class T, T v>
struct integral_constant
{
    static constexpr T value = v;
    using value_type         = T;
    using type               = integral_constant;
    // Implicit conversion lets the constant be used wherever a T is expected.
    constexpr operator value_type() const noexcept { return value; }
    constexpr value_type operator()() const noexcept { return value; }
};
// Defines a binary operator on integral constants: the result is another
// integral constant whose value and type are computed at compile time
// (the operands carry no runtime state, so the body just returns {}).
#define MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(op)                                \
    template <class T, T v, class U, U w>                                       \
    constexpr inline integral_constant<decltype(v op w), (v op w)> operator op( \
        integral_constant<T, v>, integral_constant<U, w>) noexcept              \
    {                                                                           \
        return {};                                                              \
    }
// Same for unary operators.
#define MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP(op)                                 \
    template <class T, T v>                                                     \
    constexpr inline integral_constant<decltype(op v), (op v)> operator op(     \
        integral_constant<T, v>) noexcept                                       \
    {                                                                           \
        return {};                                                              \
    }
// Instantiate the operators for the arithmetic, shift, comparison, bitwise
// and logical operator sets.
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(+)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(-)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(*)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(/)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(%)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(>>)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(<<)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(>)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(<)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(<=)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(>=)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(==)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(!=)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(&)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP (^)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(|)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(&&)
MIGRAPHX_INTEGRAL_CONSTANT_BINARY_OP(||)
MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP(!)
MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP(~)
MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP(+)
MIGRAPHX_INTEGRAL_CONSTANT_UNARY_OP(-)
// Convenience aliases mirroring the standard library's bool_constant family.
template <bool B>
using bool_constant = integral_constant<bool, B>;
using true_type     = bool_constant<true>;
using false_type    = bool_constant<false>;
// Constant holding an index value (index_int is the kernel indexing type).
template <index_int N>
using index_constant = integral_constant<index_int, N>;
// Variable template shorthand: _c<5> is an integral_constant with value 5.
template <auto v>
static constexpr auto _c = integral_constant<decltype(v), v>{};
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_INTEGRAL_CONSTANT_HPP
#ifndef MIGRAPHX_GUARD_KERNELS_POINTWISE_HPP
#define MIGRAPHX_GUARD_KERNELS_POINTWISE_HPP
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/functional.hpp>
#include <migraphx/kernels/preload.hpp>
#include <migraphx/kernels/vectorize.hpp>
#include <migraphx/kernels/args.hpp>
namespace migraphx {
// Applies f element-wise across the input tensors xs and writes the result
// into out. Inputs may first be staged into shared memory by preload; each
// workitem handles elements global, global + nglobal, ... via global_stride.
template <class F, class T, class... Ts>
__device__ void pointwise_tensor(index idx, F f, T out, Ts... xs)
{
    preload<typename T::type>(idx, xs...)([&](auto... ps) {
        idx.global_stride(out.get_shape().elements(), [&](auto i) {
            // Convert the linear element index to a multi-dimensional index
            // shared by the output and all (broadcast-aware) inputs.
            auto multi_idx = out.get_shape().multi(i);
            out[multi_idx] = f(ps[multi_idx]...);
        });
    });
}
// Kernel entry helper for pointwise operations: wraps the raw pointers in
// tensor views (make_tensors), rotates the last argument (the output) to the
// front, applies automatic vectorization, then runs pointwise_tensor.
template <class F, class... Ts>
__device__ void pointwise(F f, Ts*... ps)
{
    auto t = transform_args(make_tensors(), rotate_last(), auto_vectorize());
    t(ps...)([&](auto... xs) {
        auto idx = make_index();
        pointwise_tensor(idx, f, xs...);
    });
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_POINTWISE_HPP
#ifndef MIGRAPHX_GUARD_KERNELS_PRELOAD_HPP
#define MIGRAPHX_GUARD_KERNELS_PRELOAD_HPP
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/functional.hpp>
namespace migraphx {
// Visits each tensor/shape and decides whether it is worth copying into the
// shared-memory buffer. For every argument x it calls f(x, offset, copy)
// where offset is the running element offset into the buffer and copy is
// true_type only for broadcasted tensors whose staging saves enough reads.
// T is the buffer element type used by callers for sizing; it is not
// referenced in this function body.
template <class T, class... Shapes>
constexpr auto traverse_preload(Shapes... ss)
{
    return [=](auto f, auto... g) {
        index_int offset = 0;
        auto each        = [&](auto x) {
            constexpr auto s    = decltype(x.get_shape()){};
            constexpr auto size = _c<s.element_space()>;
            // Non-broadcast tensors are read once anyway; never stage them.
            if constexpr(not s.broadcasted())
                return f(x, offset, false_type{});
            // Staging saves (elements - element_space) reads; skip tiny wins.
            else if constexpr((s.elements() - size) < 64)
                return f(x, offset, false_type{});
            else
            {
                auto pre_offset = offset;
                offset += size;
                // NOTE(review): presumably meant to round offset up to a
                // multiple of 4, but `offset += offset % 4` only achieves
                // that when the remainder is 0 or 2 — verify the intent.
                offset += offset % 4;
                return f(x, pre_offset, true_type{});
            }
        };
        return by(each, g...)(ss...);
    };
}
// Computes the shared-buffer size (in elements of T) by replaying the
// traversal: the last visited shape's offset + element_space wins.
// NOTE(review): shapes that are not staged also update `size`, so this can
// overestimate when the final shape is not copied — confirm against callers.
template <class T, class... Shapes>
constexpr index_int compute_preload_size(Shapes...)
{
    index_int size = 0;
    traverse_preload<T>(Shapes{}...)(
        [&](auto s, auto offset, auto) { size = offset + s.element_space(); });
    return size;
}
// Cooperatively copies each stage-worthy tensor into the shared buffer
// (workitems stride with local_stride), then invokes f with the tensor pack
// where staged tensors are redirected (via with()) into shared memory.
// The __syncthreads in invoke publishes the copies before f runs.
template <class F, class T, class... Ts>
__device__ auto preload_copy(index idx, F f, __shared__ T* buffer, Ts... xs)
{
    auto invoke = [&](auto... ys) {
        __syncthreads();
        f(ys...);
    };
    traverse_preload<T>(xs...)(
        [&](auto x, auto offset, auto copy) {
            if constexpr(copy)
            {
                // Use vectorized loads/stores for the copy when the shape
                // allows it.
                auto v = vectorize(x);
                auto b = as_vec(tensor_vec_size(v), buffer + offset);
                idx.local_stride(v.get_shape().element_space(),
                                 [&](auto i) { b[i] = v.data()[i]; });
                return x.with(buffer + offset);
            }
            else
            {
                return x;
            }
        },
        invoke);
}
// Maps a vector type vec<T, N> back to its scalar element type T; leaves
// non-vector types unchanged.
template <class T>
struct remove_vec
{
    using type = T;
};
template <class T, index_int N>
struct remove_vec<vec<T, N>>
{
    using type = T;
};
// Stages suitable inputs into shared memory and passes the (possibly
// redirected) tensors to the continuation f; falls back to the original
// tensors when nothing qualifies or the buffer would be too large.
template <class T, class... Ts>
__device__ auto preload(index idx, Ts... xs)
{
    using type          = typename remove_vec<T>::type;
    constexpr auto size = compute_preload_size<type>(xs.get_shape()...);
    // NOTE(review): `size` counts elements while max_size is
    // 512 * sizeof(type) (a byte-like quantity) — confirm the intended
    // shared-memory budget for this comparison.
    const index_int max_size = 512 * sizeof(type);
    return [=](auto f) {
        if constexpr(size > 0 and size < max_size)
        {
            __shared__ type buffer[size];
            preload_copy(idx, f, buffer, xs...);
        }
        else
        {
            f(xs...);
        }
    };
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_PRELOAD_HPP
#ifndef MIGRAPHX_GUARD_KERNELS_PRINT_HPP
#define MIGRAPHX_GUARD_KERNELS_PRINT_HPP
#include <hip/hip_runtime.h>
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/functional.hpp>
#include <migraphx/kernels/algorithm.hpp>
namespace migraphx {
// Callable wrapper that forwards invocations to f and, on destruction, runs
// f(g); make_printer uses it to emit a trailing action (e.g. a newline)
// after a whole chain of << calls has completed.
template <class F, class G>
struct on_exit
{
    F f;
    G g;
    template <class T>
    __host__ __device__ auto operator()(T x) const
    {
        return f(x);
    }
    __host__ __device__ ~on_exit() { f(g); }
};
// Extracts the name of the template argument from __PRETTY_FUNCTION__ at
// compile time and returns a callable printing it to a printer stream. The
// deliberately unique template-parameter name is the substring searched for
// inside the function signature text.
template <class PrivateMIGraphXTypeNameProbe>
constexpr auto print_type_name_probe()
{
    constexpr auto name                = __PRETTY_FUNCTION__;
    constexpr auto size                = sizeof(__PRETTY_FUNCTION__);
    constexpr auto parameter_name      = "PrivateMIGraphXTypeNameProbe = ";
    constexpr auto parameter_name_size = sizeof("PrivateMIGraphXTypeNameProbe = ") - 1;
    constexpr auto begin =
        search(name, name + size, parameter_name, parameter_name + parameter_name_size);
    static_assert(begin < name + size, "Type probe not found.");
    constexpr auto start = begin + parameter_name_size;
    // The type name is terminated by ']' or ';' depending on the compiler.
    constexpr auto last = find_if(start, name + size, [](auto c) { return c == ']' or c == ';'; });
    return [=](const auto& s) { s.print_string(start, last - start); };
}
// Streamable tag: inserting a type_printer<T> into a printer prints T's name.
template <class T>
struct type_printer
{
    template <class Stream>
    friend constexpr const Stream& operator<<(const Stream& s, type_printer)
    {
        print_type_name_probe<T>()(s);
        return s;
    }
};
// type_of<T>() / type_of(x): printable name of a type, or of a value's type.
template <class T>
constexpr type_printer<T> type_of()
{
    return {};
}
template <class T>
constexpr type_printer<T> type_of(T)
{
    return {};
}
// sub_type_of: printable name of the nested T::type (traits and wrappers).
template <class T>
constexpr type_printer<typename T::type> sub_type_of()
{
    return {};
}
template <class T>
constexpr type_printer<typename T::type> sub_type_of(T)
{
    return {};
}
// Minimal iostream-like printer for device code: each print_* method queues
// a printf through the policy callable f (which decides when/whether it
// actually runs) and returns *this so calls chain like a stream. The
// operator<< overloads below route every builtin type to the matching
// print_* primitive.
template <class F>
struct basic_printer
{
    F f;
    __host__ __device__ const basic_printer& print_long(long value) const
    {
        f([&] { printf("%li", value); });
        return *this;
    }
    __host__ __device__ const basic_printer& print_ulong(unsigned long value) const
    {
        f([&] { printf("%lu", value); });
        return *this;
    }
    __host__ __device__ const basic_printer& print_char(char value) const
    {
        f([&] { printf("%c", value); });
        return *this;
    }
    __host__ __device__ const basic_printer& print_string(const char* value) const
    {
        f([&] { printf("%s", value); });
        return *this;
    }
    // Prints exactly `size` characters; used for non-null-terminated spans
    // such as the slice produced by print_type_name_probe.
    __host__ __device__ const basic_printer& print_string(const char* value, int size) const
    {
        f([&] { printf("%.*s", size, value); });
        return *this;
    }
    __host__ __device__ const basic_printer& print_double(double value) const
    {
        f([&] { printf("%f", value); });
        return *this;
    }
    __host__ __device__ const basic_printer& print_bool(bool value) const
    {
        f([&] {
            if(value)
                printf("true");
            else
                printf("false");
        });
        return *this;
    }
    // Integral types are widened to long/unsigned long before printing.
    __host__ __device__ const basic_printer& operator<<(short value) const
    {
        return print_long(value);
    }
    __host__ __device__ const basic_printer& operator<<(unsigned short value) const
    {
        return print_ulong(value);
    }
    __host__ __device__ const basic_printer& operator<<(int value) const
    {
        return print_long(value);
    }
    __host__ __device__ const basic_printer& operator<<(unsigned int value) const
    {
        return print_ulong(value);
    }
    __host__ __device__ const basic_printer& operator<<(long value) const
    {
        return print_long(value);
    }
    __host__ __device__ const basic_printer& operator<<(unsigned long value) const
    {
        return print_ulong(value);
    }
    // Floating-point values are printed through the double overload.
    __host__ __device__ const basic_printer& operator<<(float value) const
    {
        return print_double(value);
    }
    __host__ __device__ const basic_printer& operator<<(double value) const
    {
        return print_double(value);
    }
    __host__ __device__ const basic_printer& operator<<(bool value) const
    {
        return print_bool(value);
    }
    __host__ __device__ const basic_printer& operator<<(char value) const
    {
        return print_char(value);
    }
    __host__ __device__ const basic_printer& operator<<(unsigned char value) const
    {
        return print_char(value);
    }
    __host__ __device__ const basic_printer& operator<<(const char* value) const
    {
        return print_string(value);
    }
};
// Builds a printer whose policy callable f controls execution of each
// queued print action.
template <class F>
constexpr basic_printer<F> make_printer(F f)
{
    return {f};
}
// Variant that additionally runs g when the printer is destroyed (used to
// append a newline after the full statement has printed).
template <class F, class G>
constexpr basic_printer<on_exit<F, G>> make_printer(F f, G g)
{
    return {{f, g}};
}
// Printer that executes every print action immediately.
inline __device__ auto cout()
{
    return make_printer([](auto f) { f(); });
}
// Like cout, but prints a newline when the returned printer is destroyed.
inline __device__ auto coutln()
{
    return make_printer([](auto f) { f(); }, [] { printf("\n"); });
}
// Streams each argument into a fresh printer produced by the factory f.
template <class F, class... Ts>
__device__ void print_each(F f, Ts... xs)
{
    each_args([&](auto x) { f() << x; }, xs...);
}
// Same, but only the first global workitem prints, avoiding duplicates.
template <class F, class... Ts>
__device__ void print_each_once(F f, Ts... xs)
{
    auto idx = make_index();
    if(idx.global == 0)
        print_each(f, xs...);
}
// print/println print all arguments (println appends a newline per printer);
// the *_once variants print from a single workitem only.
template <class... Ts>
__device__ void print(Ts... xs)
{
    print_each(&cout, xs...);
}
template <class... Ts>
__device__ void print_once(Ts... xs)
{
    print_each_once(&cout, xs...);
}
template <class... Ts>
__device__ void println(Ts... xs)
{
    print_each(&coutln, xs...);
}
template <class... Ts>
__device__ void println_once(Ts... xs)
{
    print_each_once(&coutln, xs...);
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_PRINT_HPP
...@@ -19,7 +19,7 @@ struct shape ...@@ -19,7 +19,7 @@ struct shape
constexpr index_int elements() const { return lens.product(); } constexpr index_int elements() const { return lens.product(); }
constexpr index_int element_space() const { return strides.dot(lens - 1); } constexpr index_int element_space() const { return strides.dot(lens - 1) + 1; }
constexpr bool packed() const { return elements() == element_space(); } constexpr bool packed() const { return elements() == element_space(); }
constexpr bool broadcasted() const { return strides.product() == 0; } constexpr bool broadcasted() const { return strides.product() == 0; }
...@@ -92,6 +92,15 @@ struct shape ...@@ -92,6 +92,15 @@ struct shape
result[0] = tidx; result[0] = tidx;
return result; return result;
} }
constexpr shape get_shape() const { return *this; }
template <class Stream>
friend constexpr const Stream& operator<<(const Stream& ss, const shape& s)
{
ss << "{" << s.lens << "}, {" << s.strides << "}";
return ss;
}
}; };
template <class Lens, class Strides> template <class Lens, class Strides>
......
...@@ -2,18 +2,22 @@ ...@@ -2,18 +2,22 @@
#define MIGRAPHX_GUARD_KERNELS_TENSOR_VIEW_HPP #define MIGRAPHX_GUARD_KERNELS_TENSOR_VIEW_HPP
#include <migraphx/kernels/shape.hpp> #include <migraphx/kernels/shape.hpp>
#include <migraphx/kernels/debug.hpp>
namespace migraphx { namespace migraphx {
template <class T, class Shape> template <class T, class Shape>
struct tensor_view struct tensor_view
{ {
using type = T;
constexpr Shape get_shape() const { return Shape{}; } constexpr Shape get_shape() const { return Shape{}; }
constexpr index_int size() const { return get_shape().elements(); } constexpr index_int size() const { return get_shape().elements(); }
template <class U> template <class U>
constexpr T& operator[](U i) const constexpr T& operator[](U i) const
{ {
MIGRAPHX_ASSERT(get_shape().index(i) < get_shape().element_space());
return x[get_shape().index(i)]; return x[get_shape().index(i)];
} }
...@@ -22,6 +26,13 @@ struct tensor_view ...@@ -22,6 +26,13 @@ struct tensor_view
constexpr T* begin() const { return data(); } constexpr T* begin() const { return data(); }
constexpr T* end() const { return data() + size(); } constexpr T* end() const { return data() + size(); }
template <class U>
constexpr tensor_view<U, Shape> with(U* y) const
{
static_assert(sizeof(T) == sizeof(U), "Not the same size");
return {y};
}
T* x; T* x;
}; };
......
...@@ -9,6 +9,9 @@ using index_int = std::uint32_t; ...@@ -9,6 +9,9 @@ using index_int = std::uint32_t;
#define MIGRAPHX_DEVICE_CONSTEXPR constexpr __device__ __host__ // NOLINT #define MIGRAPHX_DEVICE_CONSTEXPR constexpr __device__ __host__ // NOLINT
template <class T, index_int N>
using vec = T __attribute__((ext_vector_type(N)));
} // namespace migraphx } // namespace migraphx
#endif #endif
#ifndef MIGRAPHX_GUARD_KERNELS_VEC_HPP
#define MIGRAPHX_GUARD_KERNELS_VEC_HPP
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/integral_constant.hpp>
namespace migraphx {
// vec_size(x): lane count of a vector value as an integral constant; the
// variadic overload is the deliberately worst-ranked fallback that reports 0
// for anything that is not a vec<T, N>.
template <class T, index_int N>
constexpr auto vec_size(vec<T, N>)
{
    return index_constant<N>{};
}
template <class T>
constexpr auto vec_size(T, ...)
{
    return index_constant<0>{};
}
// Type-level query: vec_size<T>() without needing a value of T.
template <class T>
constexpr auto vec_size()
{
    return decltype(vec_size(T{})){};
}
// Reinterprets a scalar pointer as a pointer to N-lane vectors; N == 0 means
// "not vectorized" and returns the pointer unchanged.
template <index_int N, class T>
__device__ __host__ auto as_vec(T* x)
{
    if constexpr(N == 0)
        return x;
    else
        return reinterpret_cast<vec<T, N>*>(x);
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_VEC_HPP
#ifndef MIGRAPHX_GUARD_KERNELS_VECTORIZE_HPP
#define MIGRAPHX_GUARD_KERNELS_VECTORIZE_HPP
#include <migraphx/kernels/tensor_view.hpp>
#include <migraphx/kernels/vec.hpp>
namespace migraphx {
// Lane count of a tensor's element type (0 when the element is scalar).
template <class T>
constexpr auto tensor_vec_size(T)
{
    return vec_size<typename T::type>();
}
// Rewrites a shape for N-lane vectorized access: the unit-stride dimension's
// length shrinks by N, unit strides are kept (the fast axis stays
// contiguous), and every other stride is divided by N.
template <index_int N, class Shape>
constexpr auto as_vec_shape(Shape s)
{
    auto lens = transform(s.lens, s.strides, [](auto len, auto stride) {
        if(stride == 1)
            return len / N;
        else
            return len;
    });
    auto strides = transform(s.strides, [](auto stride) {
        if(stride == 1)
            return stride;
        return stride / N;
    });
    // The vectorized view must cover exactly the same storage.
    MIGRAPHX_ASSERT(make_shape(lens, strides).element_space() * N == s.element_space());
    return make_shape(lens, strides);
}
// Vectorized view of a tensor: both the data pointer and the shape are
// reinterpreted for N lanes; N == 0 returns the tensor unchanged.
template <index_int N, class T>
__device__ __host__ auto as_vec(T x)
{
    if constexpr(N == 0)
        return x;
    else
        return make_tensor_view(as_vec<N>(x.data()), as_vec_shape<N>(x.get_shape()));
}
// For a tensor broadcast along Axis (stride 0 there), shrink that axis's
// length by N so it iterates in lockstep with tensors vectorized along the
// same axis; the underlying data is not reinterpreted.
template <index_int N, class T, class Axis>
constexpr auto tensor_step(T x, Axis)
{
    if constexpr(N == 0)
    {
        return x;
    }
    else
    {
        constexpr auto s = decltype(x.get_shape()){};
        // Only valid on a broadcast axis.
        MIGRAPHX_ASSERT(s.strides[Axis{}] == 0);
        return sequence(x.get_shape().lens.size(), [&](auto... is) {
            // Divide only the Axis dimension's length by N.
            auto lens = transform(s.lens, index_ints<is...>{}, [&](auto i, auto j) {
                constexpr auto axis = Axis{};
                if(j == axis)
                    return i / N;
                else
                    return i;
            });
            return make_tensor_view(x.data(), make_shape(lens, s.strides));
        });
    }
}
// Overload taking the lane count as an integral-constant value.
template <class IntegralConstant, class T>
__device__ __host__ auto as_vec(IntegralConstant ic, T&& x)
{
    return as_vec<ic>(x);
}
// Returns the axis with stride 1 of the first non-broadcasted shape, or 0
// when no shape has a unit-stride axis.
template <class... Shapes>
constexpr index_int find_vector_axis(Shapes... ss)
{
    index_int axis = 0;
    bool b         = false; // set once the axis has been found
    by([&](auto s) {
        if(s.broadcasted() or b)
            return;
        auto it = find(s.strides.begin(), s.strides.end(), 1);
        if(it == s.strides.end())
            return;
        axis = it - s.strides.begin();
        b    = true;
    })(ss...);
    return axis;
}
// The shapes can be vectorized by N along axis when, for every shape, N
// divides the length on that axis and the axis is either contiguous
// (stride 1) or broadcast (stride 0).
template <index_int N, class Axis, class... Shapes>
constexpr auto is_vectorizable(Axis axis, Shapes... ss)
{
    return (((ss.lens[axis] % N) == 0 and (ss.strides[axis] == 1 or ss.strides[axis] == 0)) and
            ...);
}
// Checks each shape for N-lane vectorization along its own unit-stride axis.
template <index_int N, class... Shapes>
constexpr bool is_vectorizable(Shapes... ss)
{
    // Fix: the axis must be the first argument to match the
    // is_vectorizable(Axis axis, Shapes... ss) overload above; previously the
    // shape and axis were passed in the reverse order.
    return (is_vectorizable<N>(find_vector_axis(ss), ss) and ...);
}
// Picks the widest supported vector width accepted by pred, trying 4 then 2;
// returns 0 (no vectorization) when neither is usable.
template <class P>
constexpr auto find_vectorize_size(P pred)
{
    if constexpr(pred(_c<4>))
        return _c<4>;
    else if constexpr(pred(_c<2>))
        return _c<2>;
    else
        return _c<0>;
}
// Returns a vectorized view of x when its element type is scalar and the
// shape permits it; tensors that are already vectorized pass through.
template <class T>
__host__ __device__ auto vectorize(T x)
{
    if constexpr(vec_size<T>() == 0)
    {
        constexpr auto n =
            find_vectorize_size([&](auto i) { return _c<is_vectorizable<i>(x.get_shape())>; });
        return as_vec<n>(x);
    }
    else
    {
        return x;
    }
}
// Argument transformation (see transform_args) that vectorizes every tensor
// along a common axis when all shapes are packed or broadcasted; broadcast
// tensors are stepped (tensor_step) rather than reinterpreted. Falls back
// to passing the tensors through unchanged otherwise.
inline __device__ __host__ auto auto_vectorize()
{
    return [](auto... xs) {
        return [=](auto f) {
            // TODO: Just check there a single axis of 1
            constexpr bool packed_or_broadcasted =
                ((xs.get_shape().packed() or xs.get_shape().broadcasted()) and ...);
            if constexpr(packed_or_broadcasted)
            {
                constexpr auto axis = find_vector_axis(xs.get_shape()...);
                constexpr auto n    = find_vectorize_size(
                    [&](auto i) { return _c<is_vectorizable<i>(axis, xs.get_shape()...)>; });
                by(
                    [&](auto x) {
                        constexpr auto s = x.get_shape();
                        // Broadcast axis: shrink lengths; otherwise make a
                        // true vectorized view.
                        if constexpr(s.strides[axis] == 0)
                            return tensor_step<n>(x, axis);
                        else
                            return as_vec<n>(x);
                    },
                    f)(xs...);
            }
            else
            {
                f(xs...);
            }
        };
    };
}
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_VECTORIZE_HPP
#include <migraphx/run_loop.hpp>
#include <migraphx/gpu/loop.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/fill.hpp>
#include <unordered_map>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
// The gpu loop instruction carries extra arguments (one allocated buffer per
// input, plus the condition and output buffers appended by add_loop_op), so
// only the first (n - 2) / 2 inputs correspond to the reference operator.
// Trim the rest before delegating shape computation.
shape hip_loop::compute_shape(std::vector<shape> inputs, std::vector<module_ref> mods) const
{
    const auto n_ref_inputs = (inputs.size() - 2) / 2;
    inputs.resize(n_ref_inputs);
    return op.compute_shape(inputs, std::move(mods));
}
// Host-side callback object passed to run_loop to drive the loop operator
// on the GPU.
struct gpu_loop
{
    int64_t max_iterations = 0;
    // Copies the device argument src into the host object dst (e.g. the
    // per-iteration condition value).
    template <class T>
    void copy(context& ctx, const argument& src, T& dst) const
    {
        argument arg_dst{src.get_shape(), &dst};
        copy_from_gpu(ctx, src, arg_dst);
    }
    // Copies the host value src into the device argument dst.
    template <class T>
    void copy(context& ctx, T src, const argument& dst) const
    {
        argument arg_src{dst.get_shape(), &src};
        copy_to_gpu(ctx, arg_src, dst);
    }
    // Outputs are written in place on the GPU; nothing to append per iteration.
    void append(const std::vector<argument>&, const std::vector<argument>&, int) const {}
    // Zero-fills the tail slices of each concatenated output corresponding to
    // iterations that never ran (the loop exited after `iter` steps).
    void set_zero(context& ctx, const std::vector<argument>& concatenated_outputs, int iter) const
    {
        if(iter >= max_iterations)
            return;
        auto elem_num = max_iterations - iter;
        for(const auto& out : concatenated_outputs)
        {
            auto s    = out.get_shape();
            auto size = s.bytes() / max_iterations; // bytes per iteration slice
            auto lens = s.lens();
            lens[0]   = elem_num;
            shape ss{s.type(), lens};
            assert(ss.bytes() + iter * size <= out.get_shape().bytes());
            device::fill(ctx.get_stream().get(), argument(ss, out.data() + iter * size), 0);
        }
    }
    // Maps parameter names containing "#output_N" to the output index N;
    // parameters without that marker are skipped.
    std::unordered_map<std::string, int> get_output_params(const module& m) const
    {
        auto get_output_index = [](const std::string& name) {
            std::string out_prefix = "#output_";
            auto loc               = name.find(out_prefix);
            if(loc != std::string::npos)
            {
                int index = std::stoi(name.substr(loc + out_prefix.size()));
                return index;
            }
            return -1;
        };
        const auto& param_names = m.get_parameter_names();
        std::unordered_map<std::string, int> result;
        for(const auto& name : param_names)
        {
            auto index = get_output_index(name);
            if(index == -1)
                continue;
            result[name] = index;
        }
        return result;
    }
};
// Executes the loop on the GPU by delegating to the shared run_loop driver
// with gpu_loop supplying the device-specific copy/zero callbacks.
argument
hip_loop::compute(context& ctx,
                  const shape&,
                  const std::vector<argument>& args,
                  const std::vector<module_ref>& mods,
                  const std::function<std::vector<argument>(
                      module_ref&, const std::unordered_map<std::string, argument>&)>& run) const
{
    return run_loop(gpu_loop{op.max_iterations}, ctx, args, mods, run);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <iterator>
#include <migraphx/gpu/lowering.hpp> #include <migraphx/gpu/lowering.hpp>
#include <migraphx/manage_ptr.hpp> #include <migraphx/manage_ptr.hpp>
#include <migraphx/instruction.hpp> #include <migraphx/instruction.hpp>
...@@ -37,6 +38,7 @@ ...@@ -37,6 +38,7 @@
#include <migraphx/gpu/quant_convolution.hpp> #include <migraphx/gpu/quant_convolution.hpp>
#include <migraphx/gpu/rocblas.hpp> #include <migraphx/gpu/rocblas.hpp>
#include <migraphx/gpu/unary_not.hpp> #include <migraphx/gpu/unary_not.hpp>
#include <migraphx/gpu/where.hpp>
#include <migraphx/iterator_for.hpp> #include <migraphx/iterator_for.hpp>
#include <migraphx/program.hpp> #include <migraphx/program.hpp>
#include <utility> #include <utility>
...@@ -149,6 +151,7 @@ struct miopen_apply ...@@ -149,6 +151,7 @@ struct miopen_apply
add_generic_op("sub"); add_generic_op("sub");
add_generic_op("tan"); add_generic_op("tan");
add_generic_op("tanh"); add_generic_op("tanh");
add_generic_op("where");
add_extend_op("abs"); add_extend_op("abs");
add_extend_op("argmax"); add_extend_op("argmax");
...@@ -175,6 +178,7 @@ struct miopen_apply ...@@ -175,6 +178,7 @@ struct miopen_apply
add_extend_op("rnn_var_sl_shift_sequence"); add_extend_op("rnn_var_sl_shift_sequence");
add_extend_op("scatter"); add_extend_op("scatter");
add_extend_op("softmax"); add_extend_op("softmax");
add_extend_op("topk");
add_gemm_op<op::dot>("dot"); add_gemm_op<op::dot>("dot");
add_gemm_op<op::quant_dot>("quant_dot"); add_gemm_op<op::quant_dot>("quant_dot");
...@@ -184,6 +188,7 @@ struct miopen_apply ...@@ -184,6 +188,7 @@ struct miopen_apply
add_batch_norm_inference_op(); add_batch_norm_inference_op();
add_neg_op(); add_neg_op();
add_if_op(); add_if_op();
add_loop_op();
} }
void copy_params() void copy_params()
...@@ -196,6 +201,10 @@ struct miopen_apply ...@@ -196,6 +201,10 @@ struct miopen_apply
if(ins->name() != "@param") if(ins->name() != "@param")
continue; continue;
// parameter no outputs, no need to insert copy to gpu
if(ins->outputs().empty())
continue;
auto pos = std::next(ins); auto pos = std::next(ins);
auto a = insert_allocation(pos, ins->get_shape()); auto a = insert_allocation(pos, ins->get_shape());
auto c = mod->insert_instruction(pos, make_op("hip::copy_to_gpu"), ins, a); auto c = mod->insert_instruction(pos, make_op("hip::copy_to_gpu"), ins, a);
...@@ -422,7 +431,7 @@ struct miopen_apply ...@@ -422,7 +431,7 @@ struct miopen_apply
}); });
} }
// replace the if operator with gpu_if operator // add input and output argument for the if operator
void add_if_op() void add_if_op()
{ {
apply_map.emplace("if", [=](instruction_ref ins) { apply_map.emplace("if", [=](instruction_ref ins) {
...@@ -461,9 +470,48 @@ struct miopen_apply ...@@ -461,9 +470,48 @@ struct miopen_apply
return mod->replace_instruction(ins, ins->get_operator(), inputs, mod_args); return mod->replace_instruction(ins, ins->get_operator(), inputs, mod_args);
}); });
} }
// replace the loop operator with gpu_loop operator
void add_loop_op()
{
    apply_map.emplace("loop", [=](instruction_ref ins) {
        std::vector<instruction_ref> inputs = ins->inputs();
        // copy max_iter from gpu to cpu
        auto cpu_max_iter =
            mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), inputs.at(0));
        auto cpu_cond =
            mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), inputs.at(1));
        // sync so the host-visible values are ready before the loop runs
        auto synced_max_iter =
            mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_max_iter, cpu_cond);
        inputs.at(0) = synced_max_iter;
        inputs.at(1) = cpu_cond;
        // append one device allocation matching each input's shape
        auto copy_inputs = inputs;
        std::transform(
            copy_inputs.begin(), copy_inputs.end(), std::back_inserter(inputs), [&](auto in) {
                return mod->insert_instruction(
                    ins, make_op("hip::allocate", {{"shape", to_value(in->get_shape())}}));
            });
        auto mod_args = ins->module_inputs();
        auto output   = insert_allocation(ins, ins->get_shape());
        // buffer sized for the sub-module's first output (the condition)
        const auto* sub_mod = mod_args.front();
        auto cond_out       = mod->insert_instruction(
            ins,
            make_op("hip::allocate",
                    {{"shape", to_value(sub_mod->get_output_shapes().front())}}));
        // add cond and mod outputs to the argument list
        inputs.push_back(cond_out);
        inputs.push_back(output);
        return mod->replace_instruction(
            ins, make_op("gpu::loop", ins->get_operator().to_value()), inputs, mod_args);
    });
}
}; };
void lowering::apply(module& m) const { miopen_apply{&m, this}.apply(); } void lowering::apply(module& m) const { miopen_apply{&m, this}.apply(); }
} // namespace gpu } // namespace gpu
} // namespace MIGRAPHX_INLINE_NS } // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx } // namespace migraphx
...@@ -55,7 +55,8 @@ static std::vector<instruction_ref> pad_inputs(module& m, instruction_ref ins) ...@@ -55,7 +55,8 @@ static std::vector<instruction_ref> pad_inputs(module& m, instruction_ref ins)
auto t_in = in0->inputs().front(); auto t_in = in0->inputs().front();
auto p_in = pad_ins(m, t_in, offset); auto p_in = pad_ins(m, t_in, offset);
auto dims = val.at("dims").to_vector<int64_t>(); auto dims = val.at("dims").to_vector<int64_t>();
auto r_in = m.insert_instruction(ins, make_op("transpose", {{"dims", dims}}), p_in); auto r_in =
m.insert_instruction(ins, make_op("transpose", {{"permutation", dims}}), p_in);
ret_inputs.push_back(r_in); ret_inputs.push_back(r_in);
} }
else else
...@@ -85,7 +86,8 @@ static std::vector<instruction_ref> pad_inputs(module& m, instruction_ref ins) ...@@ -85,7 +86,8 @@ static std::vector<instruction_ref> pad_inputs(module& m, instruction_ref ins)
auto t_in = in1->inputs().front(); auto t_in = in1->inputs().front();
auto p_in = pad_ins(m, t_in, offset); auto p_in = pad_ins(m, t_in, offset);
auto dims = val.at("dims").to_vector<int64_t>(); auto dims = val.at("dims").to_vector<int64_t>();
auto r_in = m.insert_instruction(ins, make_op("transpose", {{"dims", dims}}), p_in); auto r_in =
m.insert_instruction(ins, make_op("transpose", {{"permutation", dims}}), p_in);
ret_inputs.push_back(r_in); ret_inputs.push_back(r_in);
} }
else else
......
File mode changed from 100644 to 100755
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include <migraphx/rewrite_rnn.hpp> #include <migraphx/rewrite_rnn.hpp>
#include <migraphx/schedule.hpp> #include <migraphx/schedule.hpp>
#include <migraphx/simplify_algebra.hpp> #include <migraphx/simplify_algebra.hpp>
#include <migraphx/simplify_qdq.hpp>
#include <migraphx/simplify_reshapes.hpp> #include <migraphx/simplify_reshapes.hpp>
#include <migraphx/gpu/allocation_model.hpp> #include <migraphx/gpu/allocation_model.hpp>
#include <migraphx/gpu/concat_gpu_opt.hpp> #include <migraphx/gpu/concat_gpu_opt.hpp>
...@@ -60,6 +61,7 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti ...@@ -60,6 +61,7 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
normalize_ops{}, normalize_ops{},
decompose{}, decompose{},
dead_code_elimination{}, dead_code_elimination{},
simplify_qdq{},
rewrite_quantization{}, rewrite_quantization{},
dead_code_elimination{}, dead_code_elimination{},
eliminate_data_type{unsupported_types, shape::type_t::float_type}, eliminate_data_type{unsupported_types, shape::type_t::float_type},
...@@ -91,10 +93,10 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti ...@@ -91,10 +93,10 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
dead_code_elimination{}, dead_code_elimination{},
eliminate_concat{concat_gpu_optimization{}}, eliminate_concat{concat_gpu_optimization{}},
dead_code_elimination{}, dead_code_elimination{},
adjust_allocation{gpu_allocation_model{}},
dead_code_elimination{},
pack_int8_args{}, pack_int8_args{},
dead_code_elimination{}, dead_code_elimination{},
adjust_allocation{gpu_allocation_model{}},
dead_code_elimination{},
fuse_ops{&ctx, options.fast_math}, fuse_ops{&ctx, options.fast_math},
dead_code_elimination{}, dead_code_elimination{},
write_literals{&ctx}, write_literals{&ctx},
......
#include <migraphx/gpu/topk.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/topk.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
// Compute the output shape for topk: only the data input (the first
// argument) participates; delegate to the operator's own normalization.
shape hip_topk::compute_shape(std::vector<shape> inputs) const
{
    std::vector<shape> data_only = {inputs.front()};
    return op.normalize_compute_shape(data_only);
}
// Run the topk device kernel. The last element of `args` is the
// pre-allocated output, a tuple whose sub-objects are the selected
// values (front) and their indices (back); `args[0]` is the data input.
argument hip_topk::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
    auto outputs = args.back().get_sub_objects();
    auto stream  = ctx.get_stream().get();
    auto& values  = outputs.front();
    auto& indices = outputs.back();
    // Dispatch to the largest- or smallest-k kernel based on the op attribute.
    if(op.largest)
        return device::topk_largest(stream, values, indices, args[0], op.k, op.axis);
    return device::topk_smallest(stream, values, indices, args[0], op.k, op.axis);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment