Commit 0369e974 authored by Khalique Ahmed's avatar Khalique Ahmed
Browse files

Merge branch 'batch_report' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX into mi100_opts

parents 3a474fca d70fd0df
......@@ -32,6 +32,7 @@ struct parse_generic_op : op_parser<parse_generic_op>
{"Log", "log"},
{"LRN", "lrn"},
{"Neg", "neg"},
{"NonMaxSuppression", "nonmaxsuppression"},
{"Reciprocal", "recip"},
{"Relu", "relu"},
{"Round", "round"},
......@@ -49,7 +50,7 @@ struct parse_generic_op : op_parser<parse_generic_op>
bool needs_contiguous(const std::string& op_name) const
{
return contains({"flatten", "gather", "scatter"}, op_name);
return contains({"flatten", "gather", "nonmaxsuppression", "scatter"}, op_name);
}
instruction_ref parse(const op_desc& opd,
......
......@@ -66,10 +66,8 @@ struct parse_matmul : op_parser<parse_matmul>
make_op("multibroadcast", {{"out_lens", l1_broadcasted_lens}}), l1);
}
}
auto dot_res =
info.add_instruction(make_op(opd.op_name, {{"alpha", 1}, {"beta", 0}}), bl0, bl1);
int64_t num_axis = static_cast<int64_t>(dot_res->get_shape().lens().size());
instruction_ref dot_res = info.add_instruction(make_op(opd.op_name), bl0, bl1);
int64_t num_axis = static_cast<int64_t>(dot_res->get_shape().lens().size());
if(is_a_prepended)
{
dot_res = info.add_instruction(make_op("squeeze", {{"axes", {num_axis - 2}}}), dot_res);
......
......@@ -9,7 +9,7 @@ inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {
template <class T>
std::vector<std::size_t> nonzero_indices(const std::vector<T>& data)
static std::vector<std::size_t> nonzero_indices(const std::vector<T>& data)
{
std::vector<std::size_t> indices;
for(std::size_t i = 0; i < data.size(); ++i)
......@@ -31,30 +31,35 @@ struct parse_nonzero : op_parser<parse_nonzero>
std::vector<instruction_ref> args) const
{
migraphx::argument data_arg = args.back()->eval();
check_arg_empty(data_arg, "PARSE_NONZERO: cannot support non-constant input!");
std::vector<std::size_t> indices;
data_arg.visit([&](auto val) {
using val_type = std::remove_cv_t<typename decltype(val)::value_type>;
std::vector<val_type> vec_data;
vec_data.assign(val.begin(), val.end());
indices = nonzero_indices(vec_data);
});
if(data_arg.empty())
{
return info.add_instruction(make_op("nonzero"), args);
}
else
{
std::vector<std::size_t> indices;
data_arg.visit([&](auto val) {
using val_type = std::remove_cv_t<typename decltype(val)::value_type>;
std::vector<val_type> vec_data;
vec_data.assign(val.begin(), val.end());
indices = nonzero_indices(vec_data);
});
shape in_s = args[0]->get_shape();
shape out_s{shape::int64_type, {in_s.lens().size(), indices.size()}};
shape in_s = args[0]->get_shape();
shape out_s{shape::int64_type, {in_s.lens().size(), indices.size()}};
std::vector<int64_t> out_data(out_s.elements());
for(std::size_t i = 0; i < indices.size(); ++i)
{
auto idx = in_s.multi(indices[i]);
for(std::size_t j = 0; j < in_s.lens().size(); ++j)
std::vector<int64_t> out_data(out_s.elements());
for(std::size_t i = 0; i < indices.size(); ++i)
{
out_data[out_s.index({j, i})] = idx[j];
auto idx = in_s.multi(indices[i]);
for(std::size_t j = 0; j < in_s.lens().size(); ++j)
{
out_data[out_s.index({j, i})] = idx[j];
}
}
}
return info.add_literal(literal(out_s, out_data));
return info.add_literal(literal(out_s, out_data));
}
}
};
......
......@@ -9,21 +9,20 @@ namespace onnx {
auto compute_type(shape::type_t t1, shape::type_t t2)
{
const static std::unordered_map<int, int> op_order = {
{static_cast<int>(shape::int8_type), 1},
{static_cast<int>(shape::uint8_type), 2},
{static_cast<int>(shape::int16_type), 3},
{static_cast<int>(shape::uint16_type), 4},
{static_cast<int>(shape::int32_type), 5},
{static_cast<int>(shape::uint32_type), 6},
{static_cast<int>(shape::int64_type), 7},
{static_cast<int>(shape::uint64_type), 8},
{static_cast<int>(shape::half_type), 9},
{static_cast<int>(shape::float_type), 10},
{static_cast<int>(shape::double_type), 11}};
const static std::unordered_map<int, int> op_order = {{shape::int8_type, 1},
{shape::uint8_type, 2},
{shape::int16_type, 3},
{shape::uint16_type, 4},
{shape::int32_type, 5},
{shape::uint32_type, 6},
{shape::int64_type, 7},
{shape::uint64_type, 8},
{shape::half_type, 9},
{shape::float_type, 10},
{shape::double_type, 11}};
int it1 = static_cast<int>(t1);
int it2 = static_cast<int>(t2);
int it1 = t1;
int it2 = t2;
if(!contains(op_order, it1) or !contains(op_order, it2))
{
MIGRAPHX_THROW("PARSE_POW: Input data type not supported!");
......
......@@ -334,7 +334,7 @@ struct parse_resize : op_parser<parse_resize>
auto ins_delta = info.add_literal(dim_s, delta_data);
// slice the data
int64_t slc_stride = static_cast<int64_t>(dim_lens[0]);
int64_t slc_stride = dim_lens[0];
auto low = info.add_instruction(
make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {slc_stride}}}),
data);
......
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/onnx/checks.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {
struct parse_roialign : op_parser<parse_roialign>
{
    std::vector<op_desc> operators() const { return {{"RoiAlign"}}; }

    /// Parses the ONNX RoiAlign node into a migraphx "roialign" instruction.
    /// Every attribute is optional; unspecified attributes fall back to the
    /// defaults below. Only the "half_pixel" and "output_half_pixel"
    /// coordinate transformation modes are accepted.
    instruction_ref parse(const op_desc& /*opd*/,
                          const onnx_parser& /*parser*/,
                          onnx_parser::node_info info,
                          const std::vector<instruction_ref>& args) const
    {
        const auto& attrs = info.attributes;

        // attribute defaults
        std::string coord_trans_mode = "half_pixel";
        std::string mode             = "avg";
        int64_t output_height        = 1;
        int64_t output_width         = 1;
        int64_t sampling_ratio       = 0;
        float spatial_scale          = 1.0f;

        if(contains(attrs, "coordinate_transformation_mode"))
            coord_trans_mode = attrs.at("coordinate_transformation_mode").s();
        // reject anything other than the two supported modes
        if(not contains({"half_pixel", "output_half_pixel"}, coord_trans_mode))
        {
            MIGRAPHX_THROW("coordinate_transformation_mode \"" + coord_trans_mode +
                           "\": invalid value!");
        }

        if(contains(attrs, "mode"))
            mode = attrs.at("mode").s();
        if(contains(attrs, "output_height"))
            output_height = attrs.at("output_height").i();
        if(contains(attrs, "output_width"))
            output_width = attrs.at("output_width").i();
        if(contains(attrs, "sampling_ratio"))
            sampling_ratio = attrs.at("sampling_ratio").i();
        if(contains(attrs, "spatial_scale"))
            spatial_scale = attrs.at("spatial_scale").f();

        return info.add_instruction(make_op("roialign",
                                            {{"coordinate_transformation_mode", coord_trans_mode},
                                             {"mode", mode},
                                             {"output_height", output_height},
                                             {"output_width", output_width},
                                             {"sampling_ratio", sampling_ratio},
                                             {"spatial_scale", spatial_scale}}),
                                    args);
    }
};
} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {
struct parse_spacetodepth : op_parser<parse_spacetodepth>
{
    std::vector<op_desc> operators() const { return {{"SpaceToDepth"}}; }

    /// Parses the ONNX SpaceToDepth node: rearranges spatial blocks of size
    /// blocksize x blocksize into the channel dimension. Implemented as
    /// reshape -> transpose -> contiguous -> reshape on an NCHW input.
    instruction_ref parse(const op_desc& /*opd*/,
                          const onnx_parser& /*parser*/,
                          const onnx_parser::node_info& info,
                          std::vector<instruction_ref> args) const
    {
        auto input_shape = args[0]->get_shape();

        // blocksize attribute of SpaceToDepth; a blocksize of 1 makes this a no-op
        int blocksize = 1;
        if(contains(info.attributes, "blocksize"))
            blocksize = info.attributes.at("blocksize").i();
        // a blocksize below 1 would describe DepthToSpace rather than SpaceToDepth
        if(blocksize < 1)
            MIGRAPHX_THROW("SpaceToDepth: blocksize is less than 1");

        // output dims: {N, C * blocksize^2, H / blocksize, W / blocksize};
        // H and W must both be divisible by blocksize
        auto res_lens = input_shape.lens(); // {N, C, H, W}
        if((res_lens[2] % blocksize) != 0 or (res_lens[3] % blocksize) != 0)
            MIGRAPHX_THROW("SpaceToDepth: div by blocksize quotient not int ");
        res_lens[1] = res_lens[1] * blocksize * blocksize; // Co = C * blocksize^2
        res_lens[2] = res_lens[2] / blocksize;             // Ho = H / blocksize
        res_lens[3] = res_lens[3] / blocksize;             // Wo = W / blocksize

        // intermediate 6-d view that splits each spatial dim by blocksize
        auto split_lens = input_shape.lens(); // {N, C, H, W}
        split_lens[2]   = res_lens[2];
        split_lens[3]   = blocksize;
        split_lens.push_back(res_lens[3]);
        split_lens.push_back(blocksize); // {N, C, Ho, blocksize, Wo, blocksize}

        // move the two blocksize axes in front of C: {N, blocksize, blocksize, C, Ho, Wo}
        std::vector<int64_t> perm = {0, 3, 5, 1, 2, 4};

        auto split_view = info.add_instruction(make_op("reshape", {{"dims", split_lens}}), args[0]);
        auto permuted =
            info.add_instruction(make_op("transpose", {{"permutation", perm}}), split_view);
        return info.add_instruction(make_op("reshape", {{"dims", res_lens}}),
                                    info.make_contiguous(permuted));
    }
};
} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -24,7 +24,7 @@ struct parse_split : op_parser<parse_split>
}
auto lens = args[0]->get_shape().lens();
int64_t n_rank = static_cast<int64_t>(lens.size());
int64_t n_rank = lens.size();
int64_t tuned_axis = tune_axis(n_rank, axis, opd.op_name);
std::vector<int64_t> vec_splits;
......
......@@ -13,6 +13,7 @@
#include <migraphx/algorithm.hpp>
#include <migraphx/output_iterator.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/marker.hpp>
#include <iostream>
#include <sstream>
#include <algorithm>
......@@ -309,8 +310,11 @@ std::vector<argument> program::eval(parameter_map params) const
double t2 = t.record<milliseconds>();
std::cout << "Time: " << t1 << "ms, " << t2 << "ms" << std::endl;
if(trace_level > 1 and ins->name().front() != '@' and
ins->name() != "load")
std::cout << "Output: " << result << std::endl;
ins->name() != "load" and not result.empty())
{
target tgt = make_target(this->impl->target_name);
std::cout << "Output: " << tgt.copy_from(result) << std::endl;
}
return result;
}));
}
......@@ -504,7 +508,28 @@ std::string perf_group(const operation& op)
return op.name();
}
void program::perf_report(std::ostream& os, std::size_t n, parameter_map params) const
// Evaluates the program while invoking the marker's start/stop callbacks
// around the whole program and around every instruction, so an external
// marker implementation can annotate the execution timeline.
// NOTE(review): presumably used for profiler instrumentation — confirm
// against marker.hpp.
void program::mark(const parameter_map& params, marker&& m)
{
    auto& ctx = this->impl->ctx;
    // Run once by itself so the marked run below is not skewed by first-run
    // work; wait for that run to drain before marking starts.
    eval(params);
    ctx.finish();
    // Start marking
    m.mark_start(*this);
    generic_eval(*this, ctx, params, always([&](auto ins, auto f) {
                     argument result;
                     m.mark_start(ins); // per-instruction begin
                     result = f();      // execute the instruction
                     m.mark_stop(ins);  // per-instruction end
                     return result;
                 }));
    m.mark_stop(*this);
}
void program::perf_report(std::ostream& os,
std::size_t n,
parameter_map params,
std::size_t batch) const
{
auto& ctx = this->impl->ctx;
// Run once by itself
......@@ -597,7 +622,8 @@ void program::perf_report(std::ostream& os, std::size_t n, parameter_map params)
os << std::endl;
os << "Rate: " << rate << "/sec" << std::endl;
os << "Batch size: " << batch << std::endl;
os << "Rate: " << rate * batch << "/sec" << std::endl;
os << "Total time: " << total_time << "ms" << std::endl;
os << "Total instructions time: " << total_instruction_time << "ms" << std::endl;
os << "Overhead time: " << overhead_time << "ms"
......
#include <migraphx/remap.hpp>
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/functional.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/float_equal.hpp>
#include <migraphx/matcher.hpp>
#include <migraphx/op/dot.hpp>
#include <migraphx/op/add.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace {
// Matches an "add" where one operand is a two-input "dot" (in either operand
// order), binding the dot instruction as "dot" and the other operand as "a".
struct find_dot_add
{
    auto matcher() const
    {
        return match::name("add")(match::any_of(
            // dot is the first operand; "a" may be anything
            match::args(match::name("dot")(match::nargs(2)).bind("dot"), match::any().bind("a")),
            // "a" is the first operand; it must have a single use
            match::args(match::used_once().bind("a"),
                        match::name("dot")(match::nargs(2)).bind("dot"))));
    }

    // Replaces the matched add with a three-input dot whose beta is 1, so the
    // "a" operand is carried as the dot's third input.
    // NOTE(review): assumes op::dot with beta accumulates the third input
    // (gemm-style C = alpha*A*B + beta*C) — confirm in op/dot.hpp.
    void apply(module& p, match::matcher_result r) const
    {
        auto ins     = r.result; // the matched "add"
        auto dot_ins = r.instructions["dot"];
        auto a_ins   = r.instructions["a"];

        auto dot = any_cast<op::dot>(dot_ins->get_operator());
        dot.beta = 1;
        p.replace_instruction(ins, dot, dot_ins->inputs()[0], dot_ins->inputs()[1], a_ins);
    }
};
} // namespace
void remap::apply(module& p) const { match::find_matches(p, find_dot_add{}); }
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -269,7 +269,7 @@ std::vector<instruction_ref> rewrite_rnn::vanilla_rnn_cell(bool is_forward,
instruction_ref hidden_out = prog.end();
instruction_ref last_out{};
last_out = prog.insert_instruction(ins, make_op("unsqueeze", {{"axes", {0, 1}}}), sih);
long seq_len = static_cast<long>(get_seq_len(prog, seq, seq_lens));
long seq_len = get_seq_len(prog, seq, seq_lens);
for(long i = 0; i < seq_len; i++)
{
long seq_index = is_forward ? i : (seq_len - 1 - i);
......@@ -556,7 +556,7 @@ std::vector<instruction_ref> rewrite_rnn::gru_cell(bool is_forward,
instruction_ref last_output{};
migraphx::shape seq_shape = seq->get_shape();
migraphx::shape r_shape = r->get_shape();
long hs = static_cast<long>(r_shape.lens()[2]);
long hs = r_shape.lens()[2];
migraphx::shape ss(seq_shape.type(), {seq_shape.lens()[1], r_shape.lens()[2]});
std::vector<float> data(ss.elements(), 1.0f);
......@@ -613,7 +613,7 @@ std::vector<instruction_ref> rewrite_rnn::gru_cell(bool is_forward,
rb_h);
}
long seq_len = static_cast<long>(get_seq_len(prog, seq, seq_lens));
long seq_len = get_seq_len(prog, seq, seq_lens);
for(long i = 0; i < seq_len; i++)
{
long seq_index = is_forward ? i : (seq_len - 1 - i);
......@@ -1032,7 +1032,7 @@ std::vector<instruction_ref> rewrite_rnn::lstm_cell(bool is_forward,
instruction_ref last_cell_output{};
migraphx::shape r_shape = r->get_shape();
long hs = static_cast<long>(r_shape.lens()[2]);
long hs = r_shape.lens()[2];
auto bs = ih->get_shape().lens()[1];
std::vector<int64_t> perm{1, 0};
......@@ -1094,7 +1094,7 @@ std::vector<instruction_ref> rewrite_rnn::lstm_cell(bool is_forward,
ins, make_op("broadcast", {{"axis", 1}, {"out_lens", ic_lens}}), pphf);
}
long seq_len = static_cast<long>(get_seq_len(prog, seq, seq_lens));
long seq_len = get_seq_len(prog, seq, seq_lens);
for(long i = 0; i < seq_len; ++i)
{
long seq_index = is_forward ? i : (seq_len - 1 - i);
......
......@@ -84,13 +84,7 @@ struct match_find_quantizable_ops
}
else if(qop->name() == "dot")
{
auto dot_op = any_cast<op::dot>(qop->get_operator());
if(!(float_equal(dot_op.alpha, 1.0f) and float_equal(dot_op.beta, 0.0f)))
return;
if(qop_args.size() == 3)
qop_args.pop_back();
dq = m.insert_instruction(
qop, migraphx::make_op("quant_dot", {{"alpha", 1}, {"beta", 0}}), qop_args);
dq = m.insert_instruction(qop, migraphx::make_op("quant_dot"), qop_args);
}
auto ins_type = qop->get_shape().type();
dq_scale = m.add_literal(literal({ins_type}, {scale}));
......
......@@ -539,6 +539,46 @@ struct find_reshape_cont
}
};
// match sequence of transpose --> contiguous --> reshaper_op
// Binds "trans_ins" (an instruction with a transposed shape), "cont_ins"
// (the contiguous), and "reshaper_ins" (the reshape/squeeze/unsqueeze).
// Both the contiguous and the reshaper must be used exactly once so the
// chain can be rewritten safely.
auto match_transpose_contiguous_reshaper()
{
    return match::name({"reshape", "squeeze", "unsqueeze"})(
               match::used_once(),
               match::args(
                   match::name("contiguous")(
                       match::used_once(), match::args(match::transpose_shape().bind("trans_ins")))
                       .bind("cont_ins")))
        .bind("reshaper_ins");
};
// finds the pattern of transpose --> contiguous --> reshaper_op --> unary
// application of this matcher moves the unary operation before the contiguous so it becomes
// transpose --> unary --> contiguous --> reshaper_op. later pointwise sub-module can be created out
// of unary --> contiguous --> reshaper_op. Such pattern appears in depthToSpace or spaceToDepth
// operator.
struct find_transpose_contiguous_reshaper_unary
{
    auto matcher() const
    {
        // a single-input pointwise op, used once, whose argument is the
        // transpose --> contiguous --> reshaper chain matched above
        return pointwise(match::used_once(),
                         match::nargs(1),
                         match::args(match_transpose_contiguous_reshaper()));
    }

    // Moves the unary op before the contiguous: the unary is re-created
    // directly on the transpose output, a fresh contiguous is inserted after
    // it, and the reshaper op is re-applied on top of that new chain.
    void apply(module& p, match::matcher_result r) const
    {
        auto ins          = r.result; // the matched unary op
        auto reshaper_ins = r.instructions["reshaper_ins"];
        auto trans_ins    = r.instructions["trans_ins"];
        auto cont_ins     = r.instructions["cont_ins"];

        auto unary_op_name = ins->get_operator().name();
        auto unary_ins     = p.insert_instruction(cont_ins, make_op(unary_op_name), trans_ins);
        auto new_cont_ins  = p.insert_instruction(cont_ins, make_op("contiguous"), unary_ins);
        // older cont and reshape are removed by deadcode elimination
        p.replace_instruction(ins, reshaper_ins->get_operator(), new_cont_ins);
    }
};
void simplify_reshapes::apply(module& p) const
{
for(int i = 0; i < 2; i++)
......@@ -553,7 +593,8 @@ void simplify_reshapes::apply(module& p) const
find_concat_transpose{},
find_nested_convert{},
find_nested_slice{},
find_nested_concat{});
find_nested_concat{},
find_transpose_contiguous_reshaper_unary{});
dead_code_elimination{}.apply(p);
}
}
......
......@@ -33,8 +33,6 @@ rocm_set_soversion(migraphx_cpu ${MIGRAPHX_SO_VERSION})
set(MIGRAPHX_ENABLE_ZENDNN Off CACHE BOOL "")
find_package(Threads)
if(MIGRAPHX_ENABLE_ZENDNN)
find_path(ZENDNN_INC_PATH zendnn.hpp)
find_library(ZENDNN_LIB amdZenDNN)
......@@ -53,7 +51,7 @@ if(MIGRAPHX_ENABLE_ZENDNN)
else()
target_link_libraries(migraphx_cpu PRIVATE DNNL::dnnl)
endif()
target_link_libraries(migraphx_cpu PRIVATE migraphx Threads::Threads)
target_link_libraries(migraphx_cpu PRIVATE migraphx)
find_package(OpenMP)
target_link_libraries(migraphx_cpu PUBLIC OpenMP::OpenMP_CXX)
......
......@@ -73,7 +73,7 @@ dnnl::memory::desc to_dnnl_memory_desc(const shape& s)
dnnl::memory to_dnnl_memory(const dnnl::memory::desc& desc, const argument& a)
{
return dnnl::memory(desc, get_dnnl_context().engine, a.data());
return {desc, get_dnnl_context().engine, a.data()};
}
dnnl::memory to_dnnl_memory(const argument& a)
......
......@@ -3,7 +3,6 @@
#include <migraphx/check_context.hpp>
#include <migraphx/adjust_allocation.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/decompose.hpp>
#include <migraphx/eliminate_allocation.hpp>
#include <migraphx/eliminate_common_subexpression.hpp>
#include <migraphx/eliminate_concat.hpp>
......@@ -14,7 +13,6 @@
#include <migraphx/memory_coloring.hpp>
#include <migraphx/propagate_constant.hpp>
#include <migraphx/register_target.hpp>
#include <migraphx/remap.hpp>
#include <migraphx/rewrite_batchnorm.hpp>
#include <migraphx/rewrite_pooling.hpp>
#include <migraphx/rewrite_quantization.hpp>
......@@ -52,8 +50,6 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
dead_code_elimination{},
eliminate_data_type{unsupported_types, shape::type_t::float_type},
dead_code_elimination{},
decompose{},
dead_code_elimination{},
simplify_reshapes{},
eliminate_identity{},
eliminate_pad{},
......
......@@ -60,6 +60,7 @@ add_library(migraphx_device
device/mul_add.cpp
device/mul_add_relu.cpp
device/multinomial.cpp
device/nonzero.cpp
device/pad.cpp
device/pow.cpp
device/prelu.cpp
......@@ -124,6 +125,7 @@ add_library(migraphx_gpu
compile_hip.cpp
compile_hip_code_object.cpp
compile_pointwise.cpp
compile_roialign.cpp
concat.cpp
convert.cpp
convolution.cpp
......@@ -145,6 +147,7 @@ add_library(migraphx_gpu
leaky_relu.cpp
mlir_conv.cpp
multinomial.cpp
nonzero.cpp
pack_args.cpp
pack_int8_args.cpp
pad.cpp
......@@ -202,6 +205,7 @@ register_migraphx_gpu_ops(hip_
min
mul
multinomial
nonzero
pad
pow
prelu
......
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/errors.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/env.hpp>
#include <cassert>
#include <iostream>
......@@ -230,6 +231,20 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
return {compiler.compile(srcs)};
}
/// Builds a comma-separated list of numbered names derived from `param`:
/// enum_params(3, "x") -> "x0,x1,x2"; a count of 0 yields "".
///
/// Improvements over the previous version: `param` is taken by const
/// reference instead of by value (it is only read), and the result is
/// accumulated directly into one string instead of materializing an
/// intermediate std::vector<std::string> just to join it.
std::string enum_params(std::size_t count, const std::string& param)
{
    std::string result;
    for(std::size_t i = 0; i < count; ++i)
    {
        if(i != 0)
            result += ',';
        result += param;
        result += std::to_string(i);
    }
    return result;
}
/// Computes the number of global workitems for a kernel launch: enough
/// workgroups of size `local` to cover `n` elements, capped at 256 groups.
std::size_t compute_global(std::size_t n, std::size_t local)
{
    // round n up to whole workgroups
    std::size_t full_groups = n / local;
    if(n % local != 0)
        ++full_groups;
    // never launch more than 256 workgroups
    if(full_groups > 256)
        full_groups = 256;
    return full_groups * local;
}
#endif // MIGRAPHX_USE_HIPRTC
} // namespace gpu
......
#include <migraphx/gpu/compile_pointwise.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/reduce_dims.hpp>
......@@ -28,20 +29,6 @@ int main() {}
)__migraphx__";
std::string enum_params(std::size_t count, std::string param)
{
std::vector<std::string> items(count);
transform(range(count), items.begin(), [&](auto i) { return param + std::to_string(i); });
return join_strings(items, ",");
}
std::size_t compute_global(std::size_t n, std::size_t local = 1024)
{
std::size_t groups = (n + local - 1) / local;
std::size_t nglobal = std::min<std::size_t>(256, groups) * local;
return nglobal;
}
operation compile_pointwise(context&, const std::vector<shape>& inputs, const std::string& lambda)
{
hip_compile_options options;
......
#include <migraphx/gpu/compile_roialign.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/stringutils.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
// HIP source for the roialign kernel, compiled at runtime through
// compile_hip_code_object (see compile_roialign below); its tuning
// constants are injected as -D macros in the compile options.
// NOLINTNEXTLINE
static const char* const roialign_kernel = R"__migraphx__(
#include <migraphx/kernels/roialign.hpp>
#include <migraphx/kernels/basic_ops.hpp>
#include <args.hpp>
using namespace migraphx;
extern "C" {
__global__ void roialign_kernel(void* in_x, void* in_rois, void* in_ind, void* y)
{
make_tensors()(in_x, in_rois, in_ind, y)([](auto&&... xs) { roialign(xs...); });
}
}
int main() {}
)__migraphx__";
/// Compiles the roialign HIP kernel for the given input/output shapes.
/// `val` must carry the roialign operator's attributes; each one is passed
/// to the kernel source as a -D preprocessor macro.
operation compile_roialign(context&, const std::vector<shape>& io_shapes, const value& val)
{
    hip_compile_options options;
    const auto& output_shape = io_shapes.back();

    options.local          = 128;
    options.global         = compute_global(output_shape.elements(), options.local);
    options.inputs         = io_shapes;
    options.output         = output_shape;
    options.kernel_name    = "roialign_kernel";
    options.reduced_inputs = io_shapes;

    // sampling_ratio
    assert(val.contains("sampling_ratio"));
    options.params +=
        " -DSAMPLING_RATIO=" + std::to_string(val.at("sampling_ratio").to<int64_t>());

    // pooling_mode
    assert(val.contains("mode"));
    const bool avg_pooling = val.at("mode").to<std::string>() == "avg";
    options.params += " -DIS_AVG_POOLING=" + std::to_string(avg_pooling ? 1 : 0);

    // coord_trans_mode: "output_half_pixel" shifts roi coordinates by -0.5
    assert(val.contains("coordinate_transformation_mode"));
    const auto trans_mode = val.at("coordinate_transformation_mode").to<std::string>();
    options.params +=
        " -DROIS_OFFSET=" + std::to_string(trans_mode == "output_half_pixel" ? -0.5f : 0.0f);

    // spatial_scale
    assert(val.contains("spatial_scale"));
    options.params += " -DSPATIAL_SCALE=" + std::to_string(val.at("spatial_scale").to<float>());

    return compile_hip_code_object(roialign_kernel, options);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.