Commit 0369e974 authored by Khalique Ahmed

Merge branch 'batch_report' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX into mi100_opts

parents 3a474fca d70fd0df
#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_FLOAT_EQUAL_HPP
#define MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_FLOAT_EQUAL_HPP
#include <migraphx/requires.hpp>
#include <migraphx/config.hpp>
#include <migraphx/gpu/device/types.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
template <class... Ts>
using common_type = typename std::common_type<Ts...>::type;
template <class T, MIGRAPHX_REQUIRES(is_floating_point<T>{})>
__device__ bool float_equal_device(T x, T y)
{
return std::isfinite(x) and std::isfinite(y) and
std::nextafter(x, std::numeric_limits<T>::lowest()) <= y and
std::nextafter(x, std::numeric_limits<T>::max()) >= y;
}
template <class T, MIGRAPHX_REQUIRES(not is_floating_point<T>{})>
__device__ bool float_equal_device(T x, T y)
{
return x == y;
}
template <class T, class U>
__device__ bool float_equal(T x, U y)
{
return float_equal_device<common_type<T, U>>(x, y);
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
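For reference, the same nextafter-bracketing check can be sketched on the host: two values compare equal when one lies within a single representable step of the other. float_equal_host below is a hypothetical standalone analogue, not part of this commit, and it omits the __fp16 trait handling added in types.hpp further down.

// Hypothetical host-side analogue of float_equal_device above.
#include <cmath>
#include <iostream>
#include <limits>

template <class T>
bool float_equal_host(T x, T y)
{
    // Equal when y lies between the two values adjacent to x (one ULP tolerance).
    return std::isfinite(x) and std::isfinite(y) and
           std::nextafter(x, std::numeric_limits<T>::lowest()) <= y and
           std::nextafter(x, std::numeric_limits<T>::max()) >= y;
}

int main()
{
    std::cout << std::boolalpha;
    std::cout << (0.1 + 0.2 == 0.3) << "\n";                // false: one ULP apart
    std::cout << float_equal_host(0.1 + 0.2, 0.3) << "\n";  // true
    std::cout << float_equal_host(1.0, 1.001) << "\n";      // false
}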
@@ -129,6 +129,21 @@ __device__ __host__ T to_hip_type(T x)
// Hip doesn't support __fp16
inline __device__ __host__ float to_hip_type(gpu_half x) { return x; }
#define MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(trait, T) \
template <class X> \
struct trait : std::trait<X> \
{ \
}; \
\
template <> \
struct trait<T> : std::true_type \
{ \
};
MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_floating_point, __fp16)
MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_signed, __fp16)
MIGRAPHX_DETAIL_EXTEND_TRAIT_FOR(is_arithmetic, __fp16)
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
...
#include <migraphx/gpu/device/nonzero.hpp>
#include <migraphx/gpu/device/float_equal.hpp>
#include <migraphx/gpu/device/scan.hpp>
#include <migraphx/gpu/device/reduce_ops.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
argument nonzero(hipStream_t stream, const argument& result, const argument& arg_data)
{
auto s = arg_data.get_shape();
auto elem_num = s.elements();
auto out_elem_num = result.get_shape().elements();
// Run a prefix sum over the is-nonzero mask to compute each nonzero
// element's index in the output. Only one block can be used since the
// whole input shares a single prefix sum.
const index_int block_size = 256;
hip_visit_all(arg_data, s)([&](auto input, auto si) {
const auto* in_ptr = device_cast(input.data());
auto* ptr = result.cast<int64_t>();
gs_launch(stream, block_size, block_size)([=](auto, auto idx) __device__ {
// zero-initialize the entire output first
idx.local_stride(out_elem_num, [&](auto j) { ptr[j] = 0; });
block_scan<block_size>(idx,
sum{},
0,
elem_num,
[&](auto j) { return (float_equal(in_ptr[j], 0)) ? 0 : 1; },
[&](auto j, auto x) {
auto out_loc = x - 1;
if(float_equal(in_ptr[j], 0))
return;
auto index = si.multi(j);
for(size_t k = 0; k < index.size(); ++k)
{
ptr[k * elem_num + out_loc] = index[k];
}
});
});
});
return result;
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
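In the kernel above, block_scan produces an inclusive scan of the is-nonzero mask, and x - 1 is the output column for element j, so the multi-index of every nonzero element lands in a rank x elem_num layout. A host-side sketch of that indexing scheme, with a serial running counter standing in for the block scan (illustrative only, not part of the commit):

// Hypothetical host-side illustration of the nonzero indexing above.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
    std::vector<int> in = {0, 3, 0, 5, 7, 0};  // flat input
    std::vector<std::size_t> dims = {2, 3};    // logical shape 2x3

    std::vector<std::int64_t> out(dims.size() * in.size(), 0);
    std::size_t scan = 0;                      // running count of nonzeros
    for(std::size_t j = 0; j < in.size(); ++j)
    {
        if(in[j] == 0)
            continue;
        std::size_t out_loc = scan++;          // column for this nonzero
        // convert the flat index j to a row-major multi-index
        std::size_t rem = j;
        for(std::size_t k = dims.size(); k > 0; --k)
        {
            out[(k - 1) * in.size() + out_loc] = rem % dims[k - 1];
            rem /= dims[k - 1];
        }
    }
    // print the first `scan` columns of each dimension row: (0,1) (1,0) (1,1)
    for(std::size_t k = 0; k < dims.size(); ++k)
    {
        for(std::size_t c = 0; c < scan; ++c)
            std::cout << out[k * in.size() + c] << ' ';
        std::cout << '\n';
    }
}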
@@ -5,6 +5,7 @@
#include <migraphx/gpu/miopen.hpp>
#include <migraphx/gpu/clip.hpp>
#include <migraphx/gpu/convolution.hpp>
#include <migraphx/gpu/device_name.hpp>
#include <migraphx/gpu/oper.hpp>
#include <migraphx/gpu/add.hpp>
#include <migraphx/gpu/mul.hpp>
@@ -26,6 +27,7 @@
#include <migraphx/array.hpp>
#include <migraphx/op/clip.hpp>
#include <cmath>
#include <set>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
@@ -152,6 +154,12 @@ struct fusion
}
};
const std::unordered_set<std::string>& get_supported_archs()
{
static std::unordered_set<std::string> supported_archs{"gfx900", "gfx906", "gfx908", "gfx1030"};
return supported_archs;
}
MIGRAPHX_PRED_MATCHER(bias_shape, instruction_ref ins)
{
auto&& s = ins->get_shape();
@@ -161,6 +169,9 @@ MIGRAPHX_PRED_MATCHER(bias_shape, instruction_ref ins)
MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins)
{
const auto device_name = split_string(get_device_name(), ':').front();
if(not contains(get_supported_archs(), device_name))
return false;
if(enabled(MIGRAPHX_DISABLE_MIOPEN_FUSION{}))
return false;
if(ins->name() != "gpu::convolution")
@@ -717,7 +728,7 @@ struct find_gemm_add
auto gemm = any_cast<rocblas_gemm<op::dot>>(gemm_ins->get_operator());
// Already fused gemm
-if(not float_equal(gemm.op.beta, 0))
+if(not float_equal(gemm.beta, 0))
return;
if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [](auto i) {
@@ -738,7 +749,7 @@ struct find_gemm_add
inputs.push_back(copy_ins);
inputs.push_back(copy_ins);
-gemm.op.beta = 1;
+gemm.beta = 1;
p.replace_instruction(ins, gemm, inputs);
}
};
...
@@ -15,6 +15,10 @@ namespace gpu {
std::vector<std::vector<char>>
compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std::string& arch);
std::string enum_params(std::size_t count, std::string param);
std::size_t compute_global(std::size_t n, std::size_t local = 1024);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
...
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP
#include <migraphx/config.hpp>
#include <migraphx/operation.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
operation compile_roialign(context& ctx, const std::vector<shape>& io_shapes, const value& val);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP
-#ifndef MIGRAPHX_GUARD_RTGLIB_REMAP_HPP
-#define MIGRAPHX_GUARD_RTGLIB_REMAP_HPP
+#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_NONZERO_HPP
+#define MIGRAPHX_GUARD_RTGLIB_DEVICE_NONZERO_HPP
-#include <string>
-#include <migraphx/instruction_ref.hpp>
+#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
+#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
-struct module;
-/**
-* Decompose operators.
-*/
-struct remap
-{
-std::string name() const { return "remap"; }
-void apply(module& p) const;
-};
+argument nonzero(hipStream_t stream, const argument& result, const argument& arg_data);
+} // namespace device
+} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
...
#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_GEMM_HPP
#define MIGRAPHX_GUARD_RTGLIB_GPU_GEMM_HPP
#include <migraphx/errors.hpp>
#include <migraphx/operation.hpp>
#include <migraphx/value.hpp>
#include <migraphx/shape.hpp>
#include <migraphx/reflect.hpp>
#include <migraphx/gpu/context.hpp>
@@ -19,13 +22,17 @@ template <class Op>
struct rocblas_gemm
{
Op op;
float alpha = 1;
float beta = 0;
bool int8_x4_format = true;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack_join(migraphx::reflect(self.op, f),
-pack(f(self.int8_x4_format, "int8_x4_format")));
+pack(f(self.alpha, "alpha"),
f(self.beta, "beta"),
f(self.int8_x4_format, "int8_x4_format")));
}
std::string name() const
@@ -44,6 +51,26 @@ struct rocblas_gemm
check_shapes{in_shapes, *this}.not_broadcasted();
batch_not_transposed(inputs[0].strides());
batch_not_transposed(inputs[1].strides());
// if gemm and add are fused
if(not float_equal(beta, 0))
{
auto cmat_shape = in_shapes.back();
in_shapes.pop_back();
auto op_out_shape = op.compute_shape(in_shapes);
if(cmat_shape.lens() != op_out_shape.lens())
{
MIGRAPHX_THROW(this->name() + " : dimension mismatch, operand C: {" +
to_string_range(cmat_shape.lens()) +
"}, cannot add to operand A * B: {" +
to_string_range(op_out_shape.lens()) + "}");
}
if(cmat_shape.type() != op_out_shape.type())
{
MIGRAPHX_THROW(this->name() + " : operand C type mismatch, operand C is of type: " +
to_string(cmat_shape.type()) +
", it must be: " + to_string(op_out_shape.type()));
}
}
return op.compute_shape(in_shapes);
}
@@ -51,7 +78,14 @@
argument
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
{
-gemm(ctx, output_shape, args, op.alpha, op.beta, int8_x4_format);
+if(this->name() == "gpu::gemm")
{
gemm(ctx, output_shape, args, alpha, beta, int8_x4_format);
}
else
{
gemm(ctx, output_shape, args, int32_t(alpha), int32_t(beta), int8_x4_format);
}
return args.back();
}
...
#ifndef MIGRAPHX_GUARD_RTGLIB_NONZERO_HPP
#define MIGRAPHX_GUARD_RTGLIB_NONZERO_HPP
#include <migraphx/argument.hpp>
#include <migraphx/reflect.hpp>
#include <migraphx/op/nonzero.hpp>
#include <migraphx/gpu/miopen.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
struct hip_nonzero
{
op::nonzero op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "gpu::nonzero"; }
shape compute_shape(std::vector<shape> inputs) const;
argument
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
@@ -2,40 +2,51 @@
#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_ARRAY_HPP
#include <migraphx/kernels/types.hpp>
+#include <migraphx/kernels/type_traits.hpp>
#include <migraphx/kernels/integral_constant.hpp>
#include <migraphx/kernels/debug.hpp>
namespace migraphx {
// NOLINTNEXTLINE
#define MIGRAPHX_DEVICE_ARRAY_OP(op, binary_op) \
-constexpr array& operator op(const array& x) \
-{ \
-for(index_int i = 0; i < N; i++) \
-d[i] op x[i]; \
-return *this; \
-} \
-constexpr array& operator op(const T& x) \
-{ \
-for(index_int i = 0; i < N; i++) \
-d[i] op x; \
-return *this; \
-} \
-friend constexpr array operator binary_op(const array& x, const array& y) \
-{ \
-auto z = x; \
-return z op y; \
-} \
-friend constexpr array operator binary_op(const array& x, const T& y) \
-{ \
-auto z = x; \
-return z op y; \
-} \
-friend constexpr array operator binary_op(const T& x, const array& y) \
-{ \
-for(index_int i = 0; i < N; i++) \
-y[i] = x op y[i]; \
-return y; \
+template <class U> \
+constexpr array& operator op(const array<U, N>& x) \
+{ \
+for(index_int i = 0; i < N; i++) \
+d[i] op x[i]; \
+return *this; \
+} \
+template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
+constexpr array& operator op(const U& x) \
+{ \
+for(index_int i = 0; i < N; i++) \
+d[i] op x; \
+return *this; \
+} \
+template <class U> \
+friend constexpr auto operator binary_op(const array& x, const array<U, N>& y) \
+{ \
+array<decltype(T {} binary_op U{}), N> z{}; \
+for(index_int i = 0; i < N; i++) \
+z[i] = x[i] binary_op y[i]; \
+return z; \
+} \
+template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
+friend constexpr auto operator binary_op(const array& x, const U& y) \
+{ \
+array<decltype(T {} binary_op U{}), N> z{}; \
+for(index_int i = 0; i < N; i++) \
+z[i] = x[i] binary_op y; \
+return z; \
+} \
+template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
+friend constexpr auto operator binary_op(const U& x, const array& y) \
+{ \
+array<decltype(T {} binary_op U{}), N> z{}; \
+for(index_int i = 0; i < N; i++) \
+z[i] = x binary_op y[i]; \
+return z; \
}
template <class T, index_int N>
...
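The reworked macro above widens the element type of mixed-type arithmetic to decltype(T {} binary_op U{}) instead of forcing the left operand's type. A standalone sketch of that promotion rule with a simplified vec type (hypothetical, not migraphx::array):

// Minimal standalone analogue of the widened array operators.
#include <cstddef>
#include <iostream>

template <class T, std::size_t N>
struct vec
{
    T d[N];
    constexpr T& operator[](std::size_t i) { return d[i]; }
    constexpr const T& operator[](std::size_t i) const { return d[i]; }
};

template <class T, class U, std::size_t N>
constexpr auto operator+(const vec<T, N>& x, const vec<U, N>& y)
{
    // Result element type follows the usual arithmetic promotion, e.g. float + int -> float.
    vec<decltype(T{} + U{}), N> z{};
    for(std::size_t i = 0; i < N; i++)
        z[i] = x[i] + y[i];
    return z;
}

int main()
{
    vec<float, 2> a{{1.5f, 2.5f}};
    vec<int, 2> b{{1, 2}};
    auto c = a + b;                               // vec<float, 2>
    std::cout << c[0] << ' ' << c[1] << '\n';     // 2.5 4.5
}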
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_BASIC_OPS_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_BASIC_OPS_HPP
#include <migraphx/kernels/types.hpp>
namespace migraphx {
struct sum
{
template <class T, class U>
constexpr auto operator()(T x, U y) const
{
return x + y;
}
};
struct product
{
template <class T, class U>
constexpr auto operator()(T x, U y) const
{
return x * y;
}
};
struct id
{
template <class T>
constexpr auto operator()(T x) const
{
return x;
}
};
struct mean
{
size_t item_num = 1;
template <class T>
constexpr auto operator()(T x) const
{
return x / static_cast<T>(item_num);
}
};
struct max_f
{
template <class T, class U>
constexpr auto operator()(T x, U y) const
{
return (x > y) ? x : y;
}
};
inline constexpr auto max = max_f{};
struct min_f
{
template <class T, class U>
constexpr auto operator()(T x, U y) const
{
return (x < y) ? x : y;
}
};
inline constexpr auto min = min_f{};
struct lowest
{
template <class T>
constexpr operator T() const
{
return std::numeric_limits<T>::lowest();
}
};
struct highest
{
template <class T>
constexpr operator T() const
{
return std::numeric_limits<T>::max();
}
};
} // namespace migraphx
#endif // MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_BASIC_OPS_HPP
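lowest and highest above rely on a templated conversion operator so a single object can serve as the init value for any element type. A standalone host-side sketch of the same trick (lowest_t and running_max are made-up names for illustration; the kernel version avoids std::numeric_limits on device):

// One object that converts to the "lowest" value of whatever type it is assigned to.
#include <iostream>
#include <limits>

struct lowest_t
{
    template <class T>
    constexpr operator T() const
    {
        return std::numeric_limits<T>::lowest();
    }
};

template <class T>
T running_max(const T* p, int n)
{
    T m = lowest_t{};  // converts to the right T automatically
    for(int i = 0; i < n; ++i)
        m = p[i] > m ? p[i] : m;
    return m;
}

int main()
{
    float f[] = {1.5f, -2.0f, 3.25f};
    int i[] = {4, -7, 2};
    std::cout << running_max(f, 3) << ' ' << running_max(i, 3) << '\n'; // 3.25 4
}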
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_DFOR_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_DFOR_HPP
namespace migraphx {
// Multidimensional for loop
inline constexpr auto dfor()
{
return [](auto f) { f(); };
}
template <class T, class... Ts>
constexpr auto dfor(T x, Ts... xs)
{
return [=](auto f) {
for(T i = 0; i < x; i++)
{
dfor(xs...)([&](Ts... is) { f(i, is...); });
}
};
}
} // namespace migraphx
#endif
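dfor builds a nested loop from a variadic pack of extents and hands every index tuple to a callback. A usage sketch, with the helper reproduced so the snippet compiles on its own (host-side, C++17):

// dfor(2, 3) behaves like two nested for loops over 0..1 and 0..2.
#include <iostream>

inline constexpr auto dfor()
{
    return [](auto f) { f(); };
}

template <class T, class... Ts>
constexpr auto dfor(T x, Ts... xs)
{
    return [=](auto f) {
        for(T i = 0; i < x; i++)
        {
            dfor(xs...)([&](Ts... is) { f(i, is...); });
        }
    };
}

int main()
{
    // Visits (0,0) (0,1) (0,2) (1,0) (1,1) (1,2).
    dfor(2, 3)([](int i, int j) { std::cout << i << ',' << j << ' '; });
    std::cout << '\n';
}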
#ifndef MIGRAPHX_GUARD_KERNELS_ROIALIGN_HPP
#define MIGRAPHX_GUARD_KERNELS_ROIALIGN_HPP
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/dfor.hpp>
#include <migraphx/kernels/basic_ops.hpp>
#include <args.hpp>
namespace migraphx {
struct max_pool
{
MIGRAPHX_DEVICE_CONSTEXPR auto init() { return lowest(); }
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T operator()(T x, T y)
{
return max(x, y);
}
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T final(T x, std::size_t)
{
return (x);
}
};
struct avg_pool
{
MIGRAPHX_DEVICE_CONSTEXPR auto init() { return 0.0; }
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T operator()(T x, T y)
{
return x + y;
}
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T final(T x, std::size_t y)
{
return (y == 0) ? 0.0 : (x / y);
}
};
template <class T, class Op>
MIGRAPHX_DEVICE_CONSTEXPR T bilinear_interpolate(const T* data,
const array<std::size_t, 2>& dims,
array<float, 2> xy,
Op pooling)
{
array<int, 2> low{};
array<int, 2> high{};
for(std::size_t ii = 0; ii < xy.size(); ++ii)
{
if(xy[ii] < -1.0f or xy[ii] > dims[ii])
{
return 0;
}
xy[ii] = max(xy[ii], 0.0f);
low[ii] = xy[ii];
high[ii] = low[ii] + 1;
if(low[ii] >= dims[ii] - 1)
{
xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
}
}
array<std::size_t, 4> locs = {low[0] * dims[1] + low[1],
low[0] * dims[1] + high[1],
high[0] * dims[1] + low[1],
high[0] * dims[1] + high[1]};
float ly = xy[0] - low[0];
float lx = xy[1] - low[1];
float hy = 1.0f - ly;
float hx = 1.0f - lx;
array<T, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx};
auto v01 = pooling(data[locs[0]] * ws[0], data[locs[1]] * ws[1]);
auto v23 = pooling(data[locs[2]] * ws[2], data[locs[3]] * ws[3]);
return pooling(v01, v23);
}
template <class T, class Op>
MIGRAPHX_DEVICE_CONSTEXPR T calc_pooling(const T*& data,
const array<float, 2>& roi_starts,
const array<float, 2>& bin_size,
const array<int, 2>& idx,
const array<std::size_t, 2>& bin_grid_size,
const array<std::size_t, 2>& dims,
float roi_offset,
Op op)
{
T output_val = op.init();
const int64_t count = bin_grid_size[0] * bin_grid_size[1];
dfor(bin_grid_size[0], bin_grid_size[1])([&](auto iy, auto ix) {
array<std::size_t, 2> id = {iy, ix};
array<float, 2> locs =
roi_starts + idx * bin_size + bin_size * (id + 0.5f) / bin_grid_size + roi_offset;
auto val = bilinear_interpolate(data, dims, locs, op);
output_val = op(output_val, val);
});
return op.final(output_val, count);
}
template <class T, class U, class V, class W>
__device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, const W& y_t)
{
const float roi_offset = ROIS_OFFSET;
const bool is_avg_pooling = IS_AVG_POOLING;
const int64_t sampling_ratio = SAMPLING_RATIO;
const float spatial_scale = SPATIAL_SCALE;
auto index = make_index();
const auto* x = x_t.data();
const auto* rois = rois_t.data();
const auto* ind = ind_t.data();
auto* out_ptr = y_t.data();
// input shape
auto x_lens = x_t.get_shape().lens;
auto channel_num = x_lens[1];
// input height and width; in every 2-element array here, the first entry
// is the height and the second is the width
array<std::size_t, 2> in_dims = {x_lens[2], x_lens[3]};
const auto stride = index.nglobal();
auto out_s = y_t.get_shape();
auto roi_column_num = rois_t.get_shape().lens[1];
// output height and width, using the same (height, width) ordering
const auto& out_lens = out_s.lens;
array<std::size_t, 2> out_dims = {out_lens[2], out_lens[3]};
for(index_int i = index.global; i < out_s.elements(); i += stride)
{
auto idx = out_s.multi(i);
int n = idx[0];
int c = idx[1];
int ph = idx[2];
int pw = idx[3];
const auto* offset_rois = rois + (n * roi_column_num);
const int batch_ind = ind[n];
array<float, 2> roi_starts = {offset_rois[1] * spatial_scale,
offset_rois[0] * spatial_scale};
array<float, 2> roi_ends = {offset_rois[3] * spatial_scale, offset_rois[2] * spatial_scale};
array<float, 2> roi_size{};
array<float, 2> bin_size{};
array<std::size_t, 2> bin_grid_size{};
for(std::size_t ii = 0; ii < roi_size.size(); ++ii)
{
roi_size[ii] = roi_ends[ii] - roi_starts[ii];
roi_size[ii] = max(roi_size[ii], 1.0f);
bin_size[ii] = roi_size[ii] / out_dims[ii];
bin_grid_size[ii] =
(sampling_ratio > 0) ? sampling_ratio : std::ceil(roi_size[ii] / out_dims[ii]);
}
const auto* offset_x = x + ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]);
if constexpr(is_avg_pooling)
{
out_ptr[i] = calc_pooling(offset_x,
roi_starts,
bin_size,
{ph, pw},
bin_grid_size,
in_dims,
roi_offset,
avg_pool{});
}
else
{
out_ptr[i] = calc_pooling(offset_x,
roi_starts,
bin_size,
{ph, pw},
bin_grid_size,
in_dims,
roi_offset,
max_pool{});
}
}
}
} // namespace migraphx
#endif
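bilinear_interpolate above blends the four grid cells around a fractional (y, x) sample with weights hy*hx, hy*lx, ly*hx and ly*lx before pooling. A simplified host-side sketch of that weighting (boundary handling and the pooling hook are omitted; bilinear is a hypothetical helper, not the kernel's function):

// Plain bilinear sample of a row-major h x w grid at fractional (y, x).
#include <cstddef>
#include <iostream>

float bilinear(const float* data, std::size_t h, std::size_t w, float y, float x)
{
    int y0 = static_cast<int>(y), x0 = static_cast<int>(x);
    int y1 = y0 + 1 < static_cast<int>(h) ? y0 + 1 : y0;
    int x1 = x0 + 1 < static_cast<int>(w) ? x0 + 1 : x0;
    float ly = y - y0, lx = x - x0, hy = 1.0f - ly, hx = 1.0f - lx;
    return hy * hx * data[y0 * w + x0] + hy * lx * data[y0 * w + x1] +
           ly * hx * data[y1 * w + x0] + ly * lx * data[y1 * w + x1];
}

int main()
{
    // 2x2 grid; sampling at the centre averages all four values.
    float grid[] = {0.f, 1.f, 2.f, 3.f};
    std::cout << bilinear(grid, 2, 2, 0.5f, 0.5f) << '\n'; // 1.5
}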
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPE_TRAITS_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPE_TRAITS_HPP
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/integral_constant.hpp>
namespace migraphx {
template <bool B, class T = void>
struct enable_if
{
};
template <class T>
struct enable_if<true, T>
{
using type = T;
};
template <bool B, class T = void>
using enable_if_t = typename enable_if<B, T>::type;
template <class From, class To>
struct is_convertible : bool_constant<__is_convertible(From, To)>
{
};
#define MIGRAPHX_REQUIRES(...) class = enable_if_t<__VA_ARGS__>
} // namespace migraphx
#endif
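MIGRAPHX_REQUIRES above is a hand-rolled enable_if constraint, needed because the standard type-traits header is not available inside the device kernels. A host-side sketch of the same SFINAE pattern built on the standard library (REQUIRES and twice are made-up names for this sketch):

// Constrain a template with enable_if supplied as a defaulted template parameter.
#include <iostream>
#include <type_traits>

#define REQUIRES(...) class = std::enable_if_t<(__VA_ARGS__)>

// Only instantiable for integral T; twice(1.5) would fail to compile.
template <class T, REQUIRES(std::is_integral<T>{})>
T twice(T x)
{
    return x + x;
}

int main() { std::cout << twice(21) << "\n"; } // prints 42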
@@ -20,6 +20,7 @@
#include <migraphx/gpu/abs.hpp>
#include <migraphx/gpu/batch_norm_inference.hpp>
#include <migraphx/gpu/compile_roialign.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/convolution.hpp>
#include <migraphx/gpu/deconvolution.hpp>
@@ -165,6 +166,7 @@ struct miopen_apply
add_extend_op("logsoftmax");
add_extend_op("lrn");
add_extend_op("multinomial");
add_extend_op("nonzero");
add_extend_op("pad");
add_extend_op("pooling");
add_extend_op("prefix_scan_sum");
@@ -181,15 +183,17 @@
add_extend_op("softmax");
add_extend_op("topk");
-add_gemm_op<op::dot>("dot");
-add_gemm_op<op::quant_dot>("quant_dot");
+add_batch_norm_inference_op();
add_convolution_op();
add_deconvolution_op();
-add_quant_convolution_op();
-add_batch_norm_inference_op();
-add_neg_op();
+add_gemm_op<op::dot>("dot");
+add_gemm_op<op::quant_dot>("quant_dot");
add_if_op();
add_loop_op();
+add_neg_op();
+add_nms_op();
+add_quant_convolution_op();
+add_roialign();
}
void copy_params()
@@ -304,17 +308,14 @@
});
}
-template <class Op>
-void add_gemm_op(std::string name)
+template <typename Op>
+void add_gemm_op(const std::string& name)
{
apply_map.emplace(name, [=](instruction_ref ins) {
-auto&& op = any_cast<Op>(ins->get_operator());
-auto beta = op.beta;
std::vector<instruction_ref> refs = ins->inputs();
if(refs.size() == 2)
{
auto output = insert_allocation(ins, ins->get_shape());
-beta = 0;
refs.push_back(output);
}
else
@@ -333,9 +334,8 @@ struct miopen_apply
refs.push_back(refs.back());
}
}
return mod->replace_instruction(
-ins, rocblas_gemm<Op>{Op{op.alpha, beta}, int8_x4_format}, refs);
+ins, rocblas_gemm<Op>{Op{}, 1, 0, int8_x4_format}, refs);
});
}
@@ -472,6 +472,22 @@ struct miopen_apply
});
}
void add_roialign()
{
apply_map.emplace("roialign", [=](instruction_ref ins) {
auto s = ins->get_shape();
auto op_val = ins->get_operator().to_value();
auto output = insert_allocation(ins, s);
auto args = ins->inputs();
args.push_back(output);
auto io_shapes = to_shapes(args);
auto co = compile_roialign(get_context(), io_shapes, op_val);
return mod->replace_instruction(ins, co, args);
});
}
// replace the loop operator with gpu_loop operator
void add_loop_op()
{
@@ -509,6 +525,26 @@
ins, make_op("gpu::loop", ins->get_operator().to_value()), inputs, mod_args);
});
}
void add_nms_op()
{
apply_map.emplace("nonmaxsuppression", [=](instruction_ref ins) {
auto s = ins->get_shape();
auto output = insert_allocation(ins, s);
std::vector<instruction_ref> cpu_inputs;
auto inputs = ins->inputs();
std::transform(
inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) {
return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in);
});
cpu_inputs.front() =
mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs);
auto cpu_out = mod->insert_instruction(ins, ins->get_operator(), cpu_inputs);
auto gpu_out =
mod->insert_instruction(ins, make_op("hip::copy_to_gpu"), cpu_out, output);
return mod->replace_instruction(ins, gpu_out);
});
}
};
void lowering::apply(module& m) const { miopen_apply{&m, this}.apply(); }
...
#include <migraphx/gpu/nonzero.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/nonzero.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_nonzero::compute_shape(std::vector<shape> inputs) const
{
return op.compute_shape({inputs.front()});
}
argument hip_nonzero::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
return device::nonzero(ctx.get_stream().get(), args.back(), args.front());
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
@@ -2,7 +2,6 @@
#include <migraphx/auto_contiguous.hpp>
#include <migraphx/check_context.hpp>
#include <migraphx/dead_code_elimination.hpp>
-#include <migraphx/decompose.hpp>
#include <migraphx/eliminate_allocation.hpp>
#include <migraphx/eliminate_common_subexpression.hpp>
#include <migraphx/eliminate_concat.hpp>
@@ -17,7 +16,6 @@
#include <migraphx/preallocate_param.hpp>
#include <migraphx/propagate_constant.hpp>
#include <migraphx/register_target.hpp>
-#include <migraphx/remap.hpp>
#include <migraphx/rewrite_batchnorm.hpp>
#include <migraphx/rewrite_pooling.hpp>
#include <migraphx/rewrite_quantization.hpp>
@@ -59,7 +57,6 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
return
{
normalize_ops{},
-decompose{},
dead_code_elimination{},
simplify_qdq{},
rewrite_quantization{},
...
@@ -518,42 +518,12 @@ struct ref_gemm
return migraphx::reflect(self.op, f);
}
std::string name() const { return "ref::dot"; }
-shape compute_shape(const std::vector<shape>& inputs) const
-{
-if(inputs.size() == 3)
-{
-auto c_shape = inputs.at(2);
-check_shapes{{c_shape}, *this}.not_broadcasted();
-}
-return op.compute_shape(inputs);
-}
+shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
-// 3 inputs, it is alpha * A * B + beta * C, then
-// A and B are matrices, and C is of the same shape as A * B
-if(args.size() == 3)
-{
-// no need to consider the value of args[2]
-if(op.beta == 0.0f)
-{
-result.visit([&](auto output) { std::fill(output.begin(), output.end(), 0); });
-}
-else
-{
-visit_all(result, args[2])([&](auto output, auto input) {
-std::copy(input.begin(), input.end(), output.begin());
-});
-}
-migemm(result, args[0], args[1], op.alpha, op.beta);
-return result;
-}
-// 2 input arguments
-migemm(result, args[0], args[1], op.alpha, 0.0f);
+migemm(result, args[0], args[1], 1.0f, 0.0f);
return result;
}
@@ -571,22 +541,11 @@ struct ref_quant_gemm
}
std::string name() const { return "ref::quant_dot"; }
-shape compute_shape(const std::vector<shape>& inputs) const
-{
-if(inputs.size() == 3)
-{
-auto c_shape = inputs.at(2);
-check_shapes{{c_shape}, *this}.not_broadcasted();
-}
-return op.compute_shape(inputs);
-}
+shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
-// 3 inputs, it is alpha * A * B + beta * C, then
-// A and B are matrices, and C is of the same shape to A * B
// first, convert the args[0] and args[1] from int8_t to int32_t
argument arg_0{{shape::int32_type, {args.at(0).get_shape().lens()}}};
argument arg_1{{shape::int32_type, {args.at(1).get_shape().lens()}}};
@@ -600,27 +559,7 @@
[&](auto input) { std::copy(input.begin(), input.end(), output.begin()); });
});
-if(args.size() == 3)
-{
-// no need to consider the value of args[2]
-if(op.beta == 0)
-{
-result.visit([&](auto output) { std::fill(output.begin(), output.end(), 0); });
-}
-else
-{
-visit_all(result, args[2])([&](auto output, auto input) {
-std::copy(input.begin(), input.end(), output.begin());
-});
-}
-migemm(result, arg_0, arg_1, op.alpha, op.beta);
-return result;
-}
-// 2 input arguments
-migemm(result, arg_0, arg_1, op.alpha, int32_t{0});
+migemm(result, arg_0, arg_1, int32_t{1}, int32_t{0});
return result;
}
...
@@ -45,7 +45,7 @@ TEST_CASE(if_pl_test)
auto ys = param_shapes["y"];
std::vector<float> yd(ys.bytes() / sizeof(float), 2.0);
pp.add("y", migraphx::argument(ys, yd.data()));
-char ccond = static_cast<char>(cond);
+char ccond = cond;
pp.add("cond", migraphx::argument(param_shapes["cond"], &ccond));
auto outputs = p.eval(pp);
...
@@ -8,16 +8,22 @@ TEST_CASE(add_op)
EXPECT(add_op.name() == "add");
}
-TEST_CASE(reduce_mean)
+TEST_CASE(reduce_mean_without_quotes)
{
auto rm = migraphx::operation("reduce_mean", "{axes : [1, 2, 3, 4]}");
EXPECT(rm.name() == "reduce_mean");
}
-TEST_CASE(reduce_mean1)
+TEST_CASE(reduce_mean)
{
auto rm = migraphx::operation("reduce_mean", "{\"axes\" : [1, 2, 3, 4]}");
EXPECT(rm.name() == "reduce_mean");
}
TEST_CASE(reduce_mean_with_format)
{
auto rm = migraphx::operation("reduce_mean", "{axes : [%i, %i, %i, %i]}", 1, 2, 3, 4);
EXPECT(rm.name() == "reduce_mean");
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }