"docs/en_US/git@developer.sourcefind.cn:OpenDAS/nni.git" did not exist on "7752f614a9d21dcdce81c10638834855d1c9548d"
Commit df032e06 authored by Paul's avatar Paul
Browse files

Merge branch 'develop' into mlir-c

parents cf4642cd 19f65e7e
@@ -3,6 +3,7 @@
#include <migraphx/config.hpp>
#include <migraphx/operation.hpp>
#include <migraphx/instruction_ref.hpp>
#include <string>
namespace migraphx {
...
@@ -15,6 +15,10 @@ namespace gpu {
std::vector<std::vector<char>>
compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std::string& arch);
std::string enum_params(std::size_t count, std::string param);
std::size_t compute_global(std::size_t n, std::size_t local = 1024);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
...
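The two helpers above are only declared here; their definitions are not part of this hunk. A plausible sketch of what they compute, inferred from the signatures alone (an assumption, not taken from this commit):

    // Hypothetical bodies for illustration; only the declarations are in the diff.
    std::string enum_params(std::size_t count, std::string param)
    {
        // e.g. enum_params(3, "x") -> "x0,x1,x2"
        std::string result;
        for(std::size_t i = 0; i < count; i++)
            result += (i == 0 ? "" : ",") + param + std::to_string(i);
        return result;
    }

    std::size_t compute_global(std::size_t n, std::size_t local)
    {
        // Round n up to a whole number of workgroups of size local
        return ((n + local - 1) / local) * local;
    }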
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_OPS_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_OPS_HPP
#include <migraphx/config.hpp>
#include <string>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct module;
namespace gpu {
struct context;
struct compile_ops
{
context* ctx = nullptr;
std::string name() const { return "gpu::compile_ops"; }
void apply(module& m) const;
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_OPS_HPP
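compile_ops carries a context pointer and runs as an ordinary pass; this commit registers it in target.cpp after lowering (see the get_passes hunks below). Its apply body is not shown in this diff; a minimal sketch of the shape such a pass typically takes, assuming the existing migraphx iterator_for helper (illustrative only, not the actual implementation):

    void compile_ops::apply(module& m) const
    {
        for(auto ins : iterator_for(m))
        {
            if(ins->name() != "gpu::precompile_op")
                continue;
            // JIT-compile the wrapped operator for *ctx and swap it into m here
        }
    }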
@@ -6,11 +6,17 @@
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct module;
namespace gpu {
struct context;
-operation
-compile_pointwise(context& ctx, const std::vector<shape>& inputs, const std::string& lambda);
+operation compile_pointwise(context& ctx,
+                            const std::vector<shape>& inputs,
+                            const std::string& lambda,
+                            const std::string& preamble = "");
operation compile_pointwise(context& ctx, const std::vector<shape>& inputs, module m);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
...
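The string overload takes the pointwise body as C++ source that is JIT-compiled into a kernel, with the new preamble argument presumably prepended to that source. A usage sketch under those assumptions (the shapes, the context object, and the input/output layout are illustrative, not taken from this diff):

    // Assumed: ctx is a live migraphx::gpu::context and the last shape is the output.
    migraphx::shape s{migraphx::shape::float_type, {2, 3}};
    std::vector<migraphx::shape> io = {s, s, s}; // two inputs, then the output
    auto add_kernel = migraphx::gpu::compile_pointwise(
        ctx, io, "[](auto x, auto y) { return x + y; }");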
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP
#include <migraphx/config.hpp>
#include <migraphx/operation.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
operation compile_roialign(context& ctx, const std::vector<shape>& io_shapes, const value& val);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP
@@ -2,40 +2,51 @@
#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_ARRAY_HPP
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/type_traits.hpp>
#include <migraphx/kernels/integral_constant.hpp>
#include <migraphx/kernels/debug.hpp>
namespace migraphx {
// NOLINTNEXTLINE
#define MIGRAPHX_DEVICE_ARRAY_OP(op, binary_op) \
-constexpr array& operator op(const array& x) \
-{ \
-for(index_int i = 0; i < N; i++) \
-d[i] op x[i]; \
-return *this; \
-} \
-constexpr array& operator op(const T& x) \
-{ \
-for(index_int i = 0; i < N; i++) \
-d[i] op x; \
-return *this; \
-} \
-friend constexpr array operator binary_op(const array& x, const array& y) \
-{ \
-auto z = x; \
-return z op y; \
-} \
-friend constexpr array operator binary_op(const array& x, const T& y) \
-{ \
-auto z = x; \
-return z op y; \
-} \
-friend constexpr array operator binary_op(const T& x, const array& y) \
-{ \
-for(index_int i = 0; i < N; i++) \
-y[i] = x op y[i]; \
-return y; \
-}
+template <class U> \
+constexpr array& operator op(const array<U, N>& x) \
+{ \
+for(index_int i = 0; i < N; i++) \
+d[i] op x[i]; \
+return *this; \
+} \
+template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
+constexpr array& operator op(const U& x) \
+{ \
+for(index_int i = 0; i < N; i++) \
+d[i] op x; \
+return *this; \
+} \
+template <class U> \
+friend constexpr auto operator binary_op(const array& x, const array<U, N>& y) \
+{ \
+array<decltype(T{} binary_op U{}), N> z{}; \
+for(index_int i = 0; i < N; i++) \
+z[i] = x[i] binary_op y[i]; \
+return z; \
+} \
+template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
+friend constexpr auto operator binary_op(const array& x, const U& y) \
+{ \
+array<decltype(T{} binary_op U{}), N> z{}; \
+for(index_int i = 0; i < N; i++) \
+z[i] = x[i] binary_op y; \
+return z; \
+} \
+template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
+friend constexpr auto operator binary_op(const U& x, const array& y) \
+{ \
+array<decltype(T{} binary_op U{}), N> z{}; \
+for(index_int i = 0; i < N; i++) \
+z[i] = x binary_op y[i]; \
+return z; \
+}
template <class T, index_int N>
...
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_BASIC_OPS_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_BASIC_OPS_HPP
#include <migraphx/kernels/types.hpp>
namespace migraphx {
struct sum
{
template <class T, class U>
constexpr auto operator()(T x, U y) const
{
return x + y;
}
};
struct product
{
template <class T, class U>
constexpr auto operator()(T x, U y) const
{
return x * y;
}
};
struct id
{
template <class T>
constexpr auto operator()(T x) const
{
return x;
}
};
struct mean
{
size_t item_num = 1;
template <class T>
constexpr auto operator()(T x) const
{
return x / static_cast<T>(item_num);
}
};
struct max_f
{
template <class T, class U>
constexpr auto operator()(T x, U y) const
{
return (x > y) ? x : y;
}
};
inline constexpr auto max = max_f{};
struct min_f
{
template <class T, class U>
constexpr auto operator()(T x, U y) const
{
return (x < y) ? x : y;
}
};
inline constexpr auto min = min_f{};
struct lowest
{
template <class T>
constexpr operator T() const
{
return std::numeric_limits<T>::lowest();
}
};
struct highest
{
template <class T>
constexpr operator T() const
{
return std::numeric_limits<T>::max();
}
};
} // namespace migraphx
#endif // MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_BASIC_OPS_HPP
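lowest and highest carry no state; the templated conversion operator means the destination type picks the limit at the use site, which is what lets max_pool::init() in the roialign kernel below return lowest() without naming T. For illustration:

    float f = lowest{};  // becomes std::numeric_limits<float>::lowest()
    int n = highest{};   // becomes std::numeric_limits<int>::max()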
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_DFOR_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_DFOR_HPP
namespace migraphx {
// Multidimensional for loop
inline constexpr auto dfor()
{
return [](auto f) { f(); };
}
template <class T, class... Ts>
constexpr auto dfor(T x, Ts... xs)
{
return [=](auto f) {
for(T i = 0; i < x; i++)
{
dfor(xs...)([&](Ts... is) { f(i, is...); });
}
};
}
} // namespace migraphx
#endif
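dfor expands to one nested loop per extent and invokes the callback with one index per loop, the last index varying fastest. For example:

    // Visits (0,0), (0,1), (0,2), (1,0), (1,1), (1,2) in that order
    dfor(2, 3)([&](auto i, auto j) { f(i, j); });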
@@ -23,7 +23,7 @@ __device__ void pointwise_tensor(index idx, F f, T out, Ts... xs)
template <class F, class... Ts>
__device__ void pointwise(F f, Ts*... ps)
{
-auto t = transform_args(make_tensors(), rotate_last(), auto_vectorize());
+auto t = transform_args(make_tensors(), rotate_last());
t(ps...)([&](auto... xs) {
auto idx = make_index();
pointwise_tensor(idx, f, xs...);
...
#ifndef MIGRAPHX_GUARD_KERNELS_ROIALIGN_HPP
#define MIGRAPHX_GUARD_KERNELS_ROIALIGN_HPP
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/dfor.hpp>
#include <migraphx/kernels/basic_ops.hpp>
#include <args.hpp>
namespace migraphx {
struct max_pool
{
MIGRAPHX_DEVICE_CONSTEXPR auto init() { return lowest(); }
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T operator()(T x, T y)
{
return max(x, y);
}
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T final(T x, std::size_t)
{
return (x);
}
};
struct avg_pool
{
MIGRAPHX_DEVICE_CONSTEXPR auto init() { return 0.0; }
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T operator()(T x, T y)
{
return x + y;
}
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T final(T x, std::size_t y)
{
return (y == 0) ? 0.0 : (x / y);
}
};
template <class T, class Op>
MIGRAPHX_DEVICE_CONSTEXPR T bilinear_interpolate(const T* data,
const array<std::size_t, 2>& dims,
array<float, 2> xy,
Op pooling)
{
array<int, 2> low{};
array<int, 2> high{};
for(std::size_t ii = 0; ii < xy.size(); ++ii)
{
if(xy[ii] < -1.0f or xy[ii] > dims[ii])
{
return 0;
}
xy[ii] = max(xy[ii], 0.0f);
low[ii] = xy[ii];
high[ii] = low[ii] + 1;
if(low[ii] >= dims[ii] - 1)
{
xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
}
}
array<std::size_t, 4> locs = {low[0] * dims[1] + low[1],
low[0] * dims[1] + high[1],
high[0] * dims[1] + low[1],
high[0] * dims[1] + high[1]};
float ly = xy[0] - low[0];
float lx = xy[1] - low[1];
float hy = 1.0f - ly;
float hx = 1.0f - lx;
array<T, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx};
auto v01 = pooling(data[locs[0]] * ws[0], data[locs[1]] * ws[1]);
auto v23 = pooling(data[locs[2]] * ws[2], data[locs[3]] * ws[3]);
return pooling(v01, v23);
}
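// Note on bilinear_interpolate above: with avg_pool the nested pooling calls
// sum the four weighted corners, i.e. hy*hx*v00 + hy*lx*v01 + ly*hx*v10 +
// ly*lx*v11, which is exactly standard bilinear interpolation; with max_pool
// the result is the largest weighted corner term instead.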
template <class T, class Op>
MIGRAPHX_DEVICE_CONSTEXPR T calc_pooling(const T*& data,
const array<float, 2>& roi_starts,
const array<float, 2>& bin_size,
const array<int, 2>& idx,
const array<std::size_t, 2>& bin_grid_size,
const array<std::size_t, 2>& dims,
float roi_offset,
Op op)
{
T output_val = op.init();
const int64_t count = bin_grid_size[0] * bin_grid_size[1];
dfor(bin_grid_size[0], bin_grid_size[1])([&](auto iy, auto ix) {
array<std::size_t, 2> id = {iy, ix};
array<float, 2> locs =
roi_starts + idx * bin_size + bin_size * (id + 0.5f) / bin_grid_size + roi_offset;
auto val = bilinear_interpolate(data, dims, locs, op);
output_val = op(output_val, val);
});
return op.final(output_val, count);
}
template <class T, class U, class V, class W>
__device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, const W& y_t)
{
const float roi_offset = ROIS_OFFSET;
const bool is_avg_pooling = IS_AVG_POOLING;
const int64_t sampling_ratio = SAMPLING_RATIO;
const float spatial_scale = SPATIAL_SCALE;
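// Note: ROIS_OFFSET, IS_AVG_POOLING, SAMPLING_RATIO and SPATIAL_SCALE are not
// defined in this header; presumably compile_roialign injects them as
// compile-time definitions when the kernel is JIT-compiled.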
auto index = make_index();
const auto* x = x_t.data();
const auto* rois = rois_t.data();
const auto* ind = ind_t.data();
auto* out_ptr = y_t.data();
// input shape
auto x_lens = x_t.get_shape().lens;
auto channel_num = x_lens[1];
// input dims of height and width, in all 2-dim arrays, the first dim
// is for height and second dim is for width
array<std::size_t, 2> in_dims = {x_lens[2], x_lens[3]};
const auto stride = index.nglobal();
auto out_s = y_t.get_shape();
auto roi_column_num = rois_t.get_shape().lens[1];
// output dims of height and width, in all 2-dim arrays, the first dim
// is for height and second dim is for width
const auto& out_lens = out_s.lens;
array<std::size_t, 2> out_dims = {out_lens[2], out_lens[3]};
for(index_int i = index.global; i < out_s.elements(); i += stride)
{
auto idx = out_s.multi(i);
int n = idx[0];
int c = idx[1];
int ph = idx[2];
int pw = idx[3];
const auto* offset_rois = rois + (n * roi_column_num);
const int batch_ind = ind[n];
array<float, 2> roi_starts = {offset_rois[1] * spatial_scale,
offset_rois[0] * spatial_scale};
array<float, 2> roi_ends = {offset_rois[3] * spatial_scale, offset_rois[2] * spatial_scale};
array<float, 2> roi_size{};
array<float, 2> bin_size{};
array<std::size_t, 2> bin_grid_size{};
for(std::size_t ii = 0; ii < roi_size.size(); ++ii)
{
roi_size[ii] = roi_ends[ii] - roi_starts[ii];
roi_size[ii] = max(roi_size[ii], 1.0f);
bin_size[ii] = roi_size[ii] / out_dims[ii];
bin_grid_size[ii] =
(sampling_ratio > 0) ? sampling_ratio : std::ceil(roi_size[ii] / out_dims[ii]);
}
const auto* offset_x = x + ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]);
if constexpr(is_avg_pooling)
{
out_ptr[i] = calc_pooling(offset_x,
roi_starts,
bin_size,
{ph, pw},
bin_grid_size,
in_dims,
roi_offset,
avg_pool{});
}
else
{
out_ptr[i] = calc_pooling(offset_x,
roi_starts,
bin_size,
{ph, pw},
bin_grid_size,
in_dims,
roi_offset,
max_pool{});
}
}
}
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPE_TRAITS_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPE_TRAITS_HPP
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/integral_constant.hpp>
namespace migraphx {
template <bool B, class T = void>
struct enable_if
{
};
template <class T>
struct enable_if<true, T>
{
using type = T;
};
template <bool B, class T = void>
using enable_if_t = typename enable_if<B, T>::type;
template <class From, class To>
struct is_convertible : bool_constant<__is_convertible(From, To)>
{
};
#define MIGRAPHX_REQUIRES(...) class = enable_if_t<__VA_ARGS__>
} // namespace migraphx
#endif
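MIGRAPHX_REQUIRES provides a SFINAE constraint usable in a template parameter list, as the array.hpp operators above use it. A minimal standalone illustration (the function is hypothetical):

    // Participates in overload resolution only when U is convertible to float
    template <class U, MIGRAPHX_REQUIRES(is_convertible<U, float>{})>
    constexpr float as_float(U x)
    {
        return x;
    }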
@@ -12,6 +12,8 @@ using index_int = std::uint32_t;
template <class T, index_int N>
using vec = T __attribute__((ext_vector_type(N)));
using half = _Float16;
} // namespace migraphx
#endif
@@ -20,6 +20,7 @@
#include <migraphx/gpu/abs.hpp>
#include <migraphx/gpu/batch_norm_inference.hpp>
#include <migraphx/gpu/compile_roialign.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/convolution.hpp>
#include <migraphx/gpu/deconvolution.hpp>
@@ -182,6 +183,8 @@ struct miopen_apply
add_extend_op("softmax");
add_extend_op("topk");
add_precompile_op("pointwise");
add_batch_norm_inference_op();
add_convolution_op();
add_deconvolution_op();
@@ -190,7 +193,9 @@ struct miopen_apply
add_if_op();
add_loop_op();
add_neg_op();
add_nms_op();
add_quant_convolution_op();
add_roialign();
}
void copy_params()
@@ -378,6 +383,21 @@ struct miopen_apply
});
}
void add_precompile_op(const std::string& name)
{
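// Wrap the original op in gpu::precompile_op so the compile_ops pass added
// to target.cpp in this commit can JIT-compile it after lowering.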
apply_map.emplace(name, [=](instruction_ref ins) {
auto output = insert_allocation(ins, ins->get_shape());
std::vector<instruction_ref> refs = ins->inputs();
refs.push_back(output);
return mod->replace_instruction(
ins,
make_op("gpu::precompile_op", {{"op", to_value(ins->get_operator())}}),
refs,
ins->module_inputs());
});
}
void add_batch_norm_inference_op()
{
apply_map.emplace("batch_norm_inference", [=](instruction_ref ins) {
@@ -469,6 +489,22 @@
});
}
void add_roialign()
{
apply_map.emplace("roialign", [=](instruction_ref ins) {
auto s = ins->get_shape();
auto op_val = ins->get_operator().to_value();
auto output = insert_allocation(ins, s);
auto args = ins->inputs();
args.push_back(output);
auto io_shapes = to_shapes(args);
auto co = compile_roialign(get_context(), io_shapes, op_val);
return mod->replace_instruction(ins, co, args);
});
}
// replace the loop operator with gpu_loop operator
void add_loop_op()
{
@@ -506,6 +542,26 @@
ins, make_op("gpu::loop", ins->get_operator().to_value()), inputs, mod_args);
});
}
void add_nms_op()
{
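// There is no GPU implementation of nonmaxsuppression here: copy the inputs
// back to the host, run the reference op, then copy the result to the GPU.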
apply_map.emplace("nonmaxsuppression", [=](instruction_ref ins) {
auto s = ins->get_shape();
auto output = insert_allocation(ins, s);
std::vector<instruction_ref> cpu_inputs;
auto inputs = ins->inputs();
std::transform(
inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) {
return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in);
});
cpu_inputs.front() =
mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs);
auto cpu_out = mod->insert_instruction(ins, ins->get_operator(), cpu_inputs);
auto gpu_out =
mod->insert_instruction(ins, make_op("hip::copy_to_gpu"), cpu_out, output);
return mod->replace_instruction(ins, gpu_out);
});
}
};
void lowering::apply(module& m) const { miopen_apply{&m, this}.apply(); }
...
@@ -9,6 +9,7 @@
#include <migraphx/eliminate_data_type.hpp>
#include <migraphx/eliminate_identity.hpp>
#include <migraphx/eliminate_pad.hpp>
#include <migraphx/fuse_pointwise.hpp>
#include <migraphx/inline_module.hpp>
#include <migraphx/insert_pad.hpp>
#include <migraphx/memory_coloring.hpp>
@@ -25,6 +26,7 @@
#include <migraphx/simplify_qdq.hpp>
#include <migraphx/simplify_reshapes.hpp>
#include <migraphx/gpu/allocation_model.hpp>
#include <migraphx/gpu/compile_ops.hpp>
#include <migraphx/gpu/concat_gpu_opt.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/eliminate_workspace.hpp>
@@ -42,6 +44,20 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_SCHEDULE_PASS)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_POINTWISE_FUSION)
struct id_pass
{
std::string name() const { return "id"; }
void apply(const module&) const {}
};
pass enable_pass(bool enabled, pass p)
{
if(enabled)
return p;
return id_pass{};
}
std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_options& options) const
{
@@ -84,6 +100,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_options& options) const
simplify_reshapes{},
propagate_constant{},
dead_code_elimination{},
enable_pass(enabled(MIGRAPHX_ENABLE_POINTWISE_FUSION{}), fuse_pointwise{}),
dead_code_elimination{},
mlir_conv{&ctx},
lowering{&ctx, options.offload_copy},
eliminate_contiguous{"gpu::contiguous"},
@@ -96,6 +114,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_options& options) const
dead_code_elimination{},
fuse_ops{&ctx, options.fast_math},
dead_code_elimination{},
compile_ops{&ctx},
dead_code_elimination{},
write_literals{&ctx},
schedule{gpu::schedule_model{ctx.get_current_device().nstreams()}, not enabled(MIGRAPHX_DISABLE_SCHEDULE_PASS{})},
memory_coloring{"hip::allocate"},
...
@@ -60,9 +60,9 @@ TEST_CASE(single)
auto x = mm->add_parameter("x", s);
auto y = mm->add_parameter("y", s);
auto z = mm->add_parameter("z", s);
-auto add1 = add_pointwise(p2, "pointwise0", {x, y}, single_pointwise("add"));
+auto add1 = add_pointwise(p2, "main:pointwise0", {x, y}, single_pointwise("add"));
auto pass = mm->add_instruction(pass_op{}, add1);
-auto add2 = add_pointwise(p2, "pointwise1", {pass, z}, single_pointwise("add"));
+auto add2 = add_pointwise(p2, "main:pointwise1", {pass, z}, single_pointwise("add"));
mm->add_return({add2});
}
EXPECT(p1 == p2);
@@ -84,14 +84,15 @@ TEST_CASE(double_add)
run_pass(p1);
migraphx::program p2;
{
auto* mm = p2.get_main_module();
auto x = mm->add_parameter("x", s);
auto y = mm->add_parameter("y", s);
auto z = mm->add_parameter("z", s);
-auto fadd = add_pointwise(p2, "pointwise0", {x, y, z}, [=](auto* pm, const auto& inputs) {
-auto add1 = pm->add_instruction(migraphx::make_op("add"), inputs[0], inputs[1]);
-return pm->add_instruction(migraphx::make_op("add"), add1, inputs[2]);
-});
+auto fadd =
+add_pointwise(p2, "main:pointwise0", {x, y, z}, [=](auto* pm, const auto& inputs) {
+auto add1 = pm->add_instruction(migraphx::make_op("add"), inputs[0], inputs[1]);
+return pm->add_instruction(migraphx::make_op("add"), add1, inputs[2]);
+});
mm->add_return({fadd});
}
EXPECT(p1.sort() == p2.sort());
@@ -117,10 +118,10 @@ TEST_CASE(used_twice_not_fused)
auto* mm = p2.get_main_module();
auto x = mm->add_parameter("x", s);
auto y = mm->add_parameter("y", s);
-auto add1 = add_pointwise(p2, "pointwise0", {x, y}, single_pointwise("add"));
+auto add1 = add_pointwise(p2, "main:pointwise0", {x, y}, single_pointwise("add"));
auto pass = mm->add_instruction(pass_op{}, add1);
-auto fadd =
-add_pointwise(p2, "pointwise1", {add1, y, pass}, [=](auto* pm, const auto& inputs) {
+auto fadd = add_pointwise(
+p2, "main:pointwise1", {add1, y, pass}, [=](auto* pm, const auto& inputs) {
auto add2 = pm->add_instruction(migraphx::make_op("add"), inputs[0], inputs[1]);
return pm->add_instruction(migraphx::make_op("add"), inputs[2], add2);
});
@@ -149,7 +150,7 @@ TEST_CASE(used_twice_fused)
auto* mm = p2.get_main_module();
auto x = mm->add_parameter("x", s);
auto y = mm->add_parameter("y", s);
-auto fadd = add_pointwise(p2, "pointwise0", {x, y}, [=](auto* pm, const auto& inputs) {
+auto fadd = add_pointwise(p2, "main:pointwise0", {x, y}, [=](auto* pm, const auto& inputs) {
auto add1 = pm->add_instruction(migraphx::make_op("add"), inputs[0], inputs[1]);
auto add2 = pm->add_instruction(migraphx::make_op("add"), add1, inputs[0]);
auto add3 = pm->add_instruction(migraphx::make_op("add"), add1, inputs[1]);
@@ -179,11 +180,11 @@ TEST_CASE(duplicate_inputs)
auto* mm = p2.get_main_module();
auto x = mm->add_parameter("x", s);
auto y = mm->add_parameter("y", s);
-auto add1 = add_pointwise(p2, "pointwise0", {x}, [=](auto* pm, const auto& inputs) {
+auto add1 = add_pointwise(p2, "main:pointwise0", {x}, [=](auto* pm, const auto& inputs) {
return pm->add_instruction(migraphx::make_op("add"), inputs[0], inputs[0]);
});
auto pass = mm->add_instruction(pass_op{}, add1);
-auto add2 = add_pointwise(p2, "pointwise1", {pass, y}, single_pointwise("add"));
+auto add2 = add_pointwise(p2, "main:pointwise1", {pass, y}, single_pointwise("add"));
mm->add_return({add2});
}
EXPECT(p1.sort() == p2.sort());
@@ -207,7 +208,35 @@ TEST_CASE(scalar_input)
{
auto* mm = p2.get_main_module();
auto x = mm->add_parameter("x", s);
-auto add1 = add_pointwise(p2, "pointwise0", {x}, [=](auto* pm, const auto& inputs) {
+auto add1 = add_pointwise(p2, "main:pointwise0", {x}, [=](auto* pm, const auto& inputs) {
auto y = pm->add_literal(1.0f);
return pm->add_instruction(migraphx::make_op("add"), inputs[0], y);
});
mm->add_return({add1});
}
EXPECT(p1 == p2);
}
TEST_CASE(contiguous_input)
{
migraphx::shape s{migraphx::shape::float_type, {2, 3}};
migraphx::program p1;
{
auto* mm = p1.get_main_module();
auto x = mm->add_parameter("x", s);
auto one = mm->add_literal(1.0f);
auto yb =
mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", s.lens()}}), one);
auto y = mm->add_instruction(migraphx::make_op("contiguous"), yb);
auto add1 = mm->add_instruction(migraphx::make_op("add"), x, y);
mm->add_return({add1});
}
run_pass(p1);
migraphx::program p2;
{
auto* mm = p2.get_main_module();
auto x = mm->add_parameter("x", s);
auto add1 = add_pointwise(p2, "main:pointwise0", {x}, [=](auto* pm, const auto& inputs) {
auto y = pm->add_literal(1.0f);
return pm->add_instruction(migraphx::make_op("add"), inputs[0], y);
});
@@ -216,4 +245,32 @@ TEST_CASE(scalar_input)
EXPECT(p1 == p2);
}
TEST_CASE(all_scalar_input)
{
migraphx::shape s{migraphx::shape::float_type};
migraphx::program p1;
{
auto* mm = p1.get_main_module();
auto x = mm->add_parameter("x", s);
auto y = mm->add_parameter("y", s);
auto add1 = mm->add_instruction(migraphx::make_op("add"), x, y);
mm->add_return({add1});
}
run_pass(p1);
migraphx::program p2;
{
auto* mm = p2.get_main_module();
auto x = mm->add_parameter("x", s);
auto y = mm->add_parameter("y", s);
auto add1 = add_pointwise(p2, "main:pointwise0", {x, y}, [=](auto* pm, const auto& inputs) {
return pm->add_instruction(migraphx::make_op("add"), inputs[0], inputs[1]);
});
mm->add_return({add1});
}
EXPECT(p1.get_output_shapes().size() == 1);
EXPECT(p1.get_output_shapes().front().scalar());
EXPECT(p1.get_output_shapes() == p2.get_output_shapes());
EXPECT(p1 == p2);
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }
@@ -11,7 +11,7 @@ std::vector<char> msgpack_buffer(const T& src)
msgpack::pack(buffer, src);
buffer.seekg(0);
std::string str = buffer.str();
-return std::vector<char>(str.data(), str.data() + str.size());
+return std::vector<char>(str.data(), str.data() + str.size()); // NOLINT
}
TEST_CASE(test_msgpack_empty_value)
...
@@ -2771,6 +2771,31 @@ def neg_test():
    return ([node], [x], [y])
@onnx_test
def nms_test():
b = helper.make_tensor_value_info('boxes', TensorProto.FLOAT, [1, 6, 4])
s = helper.make_tensor_value_info('scores', TensorProto.FLOAT, [1, 1, 6])
mo = helper.make_tensor_value_info('max_output_boxes_per_class',
TensorProto.INT64, [1])
iou = helper.make_tensor_value_info('iou_threshold', TensorProto.FLOAT,
[1])
st = helper.make_tensor_value_info('score_threshold', TensorProto.FLOAT,
[1])
out = helper.make_tensor_value_info('selected_indices', TensorProto.INT64,
[6, 3])
node = onnx.helper.make_node('NonMaxSuppression',
inputs=[
'boxes', 'scores',
'max_output_boxes_per_class',
'iou_threshold', 'score_threshold'
],
outputs=['selected_indices'],
center_point_box=1)
return ([node], [b, s, mo, iou, st], [out])
@onnx_test
def not_test():
    x = helper.make_tensor_value_info('0', TensorProto.INT32, [4])
@@ -3835,6 +3860,7 @@ def resize_upsample_pf_test():
    return ([node], [X], [Y], [scale_tensor])
@onnx_test
def resize_upsample_pc_test():
    scales = np.array([1.0, 1.0, 2.0, 1.5], dtype=np.float32)
    scale_tensor = helper.make_tensor(name='scales',
@@ -3857,6 +3883,41 @@ def resize_upsample_pc_test():
    return ([node], [X], [Y], [scale_tensor])
@onnx_test
def roialign_default_test():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [10, 4, 7, 8])
roi = helper.make_tensor_value_info('rois', TensorProto.FLOAT, [8, 4])
bi = helper.make_tensor_value_info('batch_ind', TensorProto.INT64, [8])
y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [8, 4, 1, 1])
node = onnx.helper.make_node('RoiAlign',
inputs=['x', 'rois', 'batch_ind'],
outputs=['y'])
return ([node], [x, roi, bi], [y])
@onnx_test
def roialign_test():
x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [10, 5, 4, 7])
roi = helper.make_tensor_value_info('rois', TensorProto.FLOAT, [8, 4])
bi = helper.make_tensor_value_info('batch_ind', TensorProto.INT64, [8])
y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [8, 4, 5, 5])
node = onnx.helper.make_node(
'RoiAlign',
inputs=['x', 'rois', 'batch_ind'],
outputs=['y'],
spatial_scale=2.0,
output_height=5,
output_width=5,
sampling_ratio=3,
mode="avg",
coordinate_transformation_mode="output_half_pixel")
return ([node], [x, roi, bi], [y])
@onnx_test
def scatter_test():
    x = helper.make_tensor_value_info('data', TensorProto.FLOAT, [3, 4, 5, 6])
...
nms_test.onnx (new binary file): ONNX protobuf generated by nms_test() above, containing a NonMaxSuppression node (center_point_box=1) with inputs boxes [1,6,4], scores [1,1,6], max_output_boxes_per_class [1], iou_threshold [1], score_threshold [1], and output selected_indices [6,3]. Binary content not reproduced.
@@ -2444,6 +2444,33 @@ TEST_CASE(neg_test)
EXPECT(p == prog);
}
TEST_CASE(nms_test)
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape sb{migraphx::shape::float_type, {1, 6, 4}};
auto b = mm->add_parameter("boxes", sb);
migraphx::shape ss{migraphx::shape::float_type, {1, 1, 6}};
auto s = mm->add_parameter("scores", ss);
migraphx::shape smo{migraphx::shape::int64_type, {1}};
auto mo = mm->add_parameter("max_output_boxes_per_class", smo);
migraphx::shape siou{migraphx::shape::float_type, {1}};
auto iou = mm->add_parameter("iou_threshold", siou);
migraphx::shape sst{migraphx::shape::float_type, {1}};
auto st = mm->add_parameter("score_threshold", sst);
auto ret = mm->add_instruction(
migraphx::make_op("nonmaxsuppression", {{"center_point_box", 1}}), b, s, mo, iou, st);
mm->add_return({ret});
auto prog = migraphx::parse_onnx("nms_test.onnx");
EXPECT(p == prog);
}
TEST_CASE(nonzero_dynamic_test)
{
migraphx::program p;
@@ -3640,6 +3667,55 @@ TEST_CASE(resize_upsample_pf_test)
EXPECT(p == prog);
}
TEST_CASE(roialign_default_test)
{
migraphx::shape sx{migraphx::shape::float_type, {10, 4, 7, 8}};
migraphx::shape srois{migraphx::shape::float_type, {8, 4}};
migraphx::shape sbi{migraphx::shape::int64_type, {8}};
migraphx::program p;
auto* mm = p.get_main_module();
auto x = mm->add_parameter("x", sx);
auto rois = mm->add_parameter("rois", srois);
auto bi = mm->add_parameter("batch_ind", sbi);
auto r = mm->add_instruction(migraphx::make_op("roialign"), x, rois, bi);
mm->add_return({r});
auto prog = migraphx::parse_onnx("roialign_default_test.onnx");
EXPECT(p == prog);
}
TEST_CASE(roialign_test)
{
migraphx::shape sx{migraphx::shape::float_type, {10, 5, 4, 7}};
migraphx::shape srois{migraphx::shape::float_type, {8, 4}};
migraphx::shape sbi{migraphx::shape::int64_type, {8}};
migraphx::program p;
auto* mm = p.get_main_module();
auto x = mm->add_parameter("x", sx);
auto rois = mm->add_parameter("rois", srois);
auto bi = mm->add_parameter("batch_ind", sbi);
auto r = mm->add_instruction(
migraphx::make_op("roialign",
{{"coordinate_transformation_mode", "output_half_pixel"},
{"spatial_scale", 2.0f},
{"output_height", 5},
{"output_width", 5},
{"sampling_ratio", 3}}),
x,
rois,
bi);
mm->add_return({r});
auto prog = migraphx::parse_onnx("roialign_test.onnx");
EXPECT(p == prog);
}
TEST_CASE(round_test)
{
migraphx::program p;
...
roialign_default_test.onnx (new binary file): ONNX protobuf generated by roialign_default_test() above, containing a RoiAlign node with default attributes, inputs x [10,4,7,8], rois [8,4], batch_ind [8], and output y [8,4,1,1]. Binary content not reproduced.