Commit 712f6134 authored by Shucai Xiao's avatar Shucai Xiao
Browse files

merge changes from develop branch and resolve merge conflicts

parents 4a39a0f7 b20e3d4d
......@@ -42,11 +42,6 @@ struct parse_randomnormal_ops : op_parser<parse_randomnormal_ops>
if(contains(info.attributes, "scale"))
scale = info.attributes.at("scale").f();
float seed = static_cast<float>(
std::chrono::high_resolution_clock::now().time_since_epoch().count());
if(contains(info.attributes, "seed"))
seed = info.attributes.at("seed").f();
shape out_shape;
if(contains(info.attributes, "shape"))
{
......@@ -75,7 +70,10 @@ struct parse_randomnormal_ops : op_parser<parse_randomnormal_ops>
": cannot deduce shape without shape attribute or argument.");
}
std::mt19937 gen(seed);
std::mt19937 gen(std::chrono::high_resolution_clock::now().time_since_epoch().count());
if(contains(info.attributes, "seed"))
gen.seed(info.attributes.at("seed").f());
std::normal_distribution<> d(mean, scale);
std::vector<double> rand_vals(out_shape.elements());
std::generate(rand_vals.begin(), rand_vals.end(), [&]() { return d(gen); });
......
......@@ -42,11 +42,6 @@ struct parse_randomuniform_ops : op_parser<parse_randomuniform_ops>
if(contains(info.attributes, "low"))
low = info.attributes.at("low").f();
float seed = static_cast<float>(
std::chrono::high_resolution_clock::now().time_since_epoch().count());
if(contains(info.attributes, "seed"))
seed = info.attributes.at("seed").f();
shape out_shape;
if(contains(info.attributes, "shape"))
{
......@@ -75,7 +70,10 @@ struct parse_randomuniform_ops : op_parser<parse_randomuniform_ops>
": cannot deduce shape without shape attribute or argument.");
}
std::mt19937 gen(seed);
std::mt19937 gen(std::chrono::high_resolution_clock::now().time_since_epoch().count());
if(contains(info.attributes, "seed"))
gen.seed(info.attributes.at("seed").f());
std::uniform_real_distribution<> d(high, low);
std::vector<double> rand_vals(out_shape.elements());
std::generate(rand_vals.begin(), rand_vals.end(), [&]() { return d(gen); });
......
......@@ -163,9 +163,9 @@ static std::string get_nearest_mode(const onnx_parser::attribute_map& attr)
struct parse_resize : op_parser<parse_resize>
{
std::vector<op_desc> operators() const { return {{"Resize"}}; }
std::vector<op_desc> operators() const { return {{"Resize"}, {"Upsample"}}; }
instruction_ref parse(const op_desc& /*opd*/,
instruction_ref parse(const op_desc& opd,
const onnx_parser& /*parser*/,
onnx_parser::node_info info,
std::vector<instruction_ref> args) const
......@@ -183,7 +183,7 @@ struct parse_resize : op_parser<parse_resize>
if(contains(info.attributes, "exclude_outside") and
info.attributes.at("exclude_outside").i() == 1)
{
MIGRAPHX_THROW("PARSE_RESIZE: exclude_outside 1 is not supported!");
MIGRAPHX_THROW("PARSE_" + opd.op_name + ": exclude_outside 1 is not supported!");
}
// input data shape info
......@@ -215,12 +215,14 @@ struct parse_resize : op_parser<parse_resize>
if(type == shape::int64_type)
{
auto arg_out_s = arg->eval();
check_arg_empty(arg_out_s, "PARSE_RESIZE: dynamic output size is not supported!");
check_arg_empty(arg_out_s,
"PARSE_" + opd.op_name + ": dynamic output size is not supported!");
arg_out_s.visit([&](auto ol) { out_lens.assign(ol.begin(), ol.end()); });
if(out_lens.size() != in_lens.size())
{
MIGRAPHX_THROW("PARSE_RESIZE: specified output size does not match input size");
MIGRAPHX_THROW("PARSE_" + opd.op_name +
": specified output size does not match input size");
}
// compute the scale
......@@ -239,12 +241,14 @@ struct parse_resize : op_parser<parse_resize>
{
auto arg_scale = arg->eval();
check_arg_empty(arg_scale,
"PARSE_RESIZE: dynamic input scale is not supported!");
"PARSE_" + opd.op_name +
": dynamic input scale is not supported!");
arg_scale.visit([&](auto v) { vec_scale.assign(v.begin(), v.end()); });
if(in_lens.size() != vec_scale.size())
{
MIGRAPHX_THROW("PARSE_RESIZE: ranks of input and scale are different!");
MIGRAPHX_THROW("PARSE_" + opd.op_name +
": ranks of input and scale are different!");
}
std::transform(in_lens.begin(),
......@@ -334,7 +338,7 @@ struct parse_resize : op_parser<parse_resize>
auto ins_delta = info.add_literal(dim_s, delta_data);
// slice the data
int64_t slc_stride = static_cast<int64_t>(dim_lens[0]);
int64_t slc_stride = dim_lens[0];
auto low = info.add_instruction(
make_op("slice", {{"axes", {0}}, {"starts", {0}}, {"ends", {slc_stride}}}),
data);
......
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/onnx/checks.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {
// Parses the ONNX RoiAlign operator into the migraphx "roialign" op,
// forwarding each attribute (or its ONNX default) as an op field.
struct parse_roialign : op_parser<parse_roialign>
{
    std::vector<op_desc> operators() const { return {{"RoiAlign"}}; }

    instruction_ref parse(const op_desc& /*opd*/,
                          const onnx_parser& /*parser*/,
                          onnx_parser::node_info info,
                          const std::vector<instruction_ref>& args) const
    {
        const auto& attrs = info.attributes;

        // coordinate_transformation_mode: defaults to "half_pixel"
        std::string coord_trans_mode = contains(attrs, "coordinate_transformation_mode")
                                           ? std::string{attrs.at("coordinate_transformation_mode").s()}
                                           : std::string{"half_pixel"};
        // Only these two modes are supported by the roialign op.
        if(not contains({"half_pixel", "output_half_pixel"}, coord_trans_mode))
        {
            MIGRAPHX_THROW("coordinate_transformation_mode \"" + coord_trans_mode +
                           "\": invalid value!");
        }

        // Pooling mode: defaults to average pooling.
        std::string mode =
            contains(attrs, "mode") ? std::string{attrs.at("mode").s()} : std::string{"avg"};

        // Output spatial dims and sampling parameters with their ONNX defaults.
        int64_t output_height =
            contains(attrs, "output_height") ? attrs.at("output_height").i() : 1;
        int64_t output_width = contains(attrs, "output_width") ? attrs.at("output_width").i() : 1;
        int64_t sampling_ratio =
            contains(attrs, "sampling_ratio") ? attrs.at("sampling_ratio").i() : 0;
        float spatial_scale =
            contains(attrs, "spatial_scale") ? attrs.at("spatial_scale").f() : 1.0f;

        return info.add_instruction(make_op("roialign",
                                            {{"coordinate_transformation_mode", coord_trans_mode},
                                             {"mode", mode},
                                             {"output_height", output_height},
                                             {"output_width", output_width},
                                             {"sampling_ratio", sampling_ratio},
                                             {"spatial_scale", spatial_scale}}),
                                    args);
    }
};
} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/onnx/checks.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {
// Parses the ONNX Softplus operator: y = log(exp(x) + 1), assembled from
// elementwise primitives since there is no dedicated softplus op.
struct parse_softplus : op_parser<parse_softplus>
{
    std::vector<op_desc> operators() const { return {{"Softplus"}}; }

    instruction_ref parse(const op_desc& /*opd*/,
                          const onnx_parser& /*parser*/,
                          const onnx_parser::node_info& info,
                          std::vector<instruction_ref> args) const
    {
        const auto& x      = args[0];
        const auto x_shape = x->get_shape();
        // Scalar literal 1 broadcast to the input shape.
        auto one  = info.add_literal(migraphx::literal{migraphx::shape{x_shape.type()}, {1}});
        auto ones = info.add_instruction(
            migraphx::make_op("multibroadcast", {{"out_lens", x_shape.lens()}}), one);
        auto exp_x = info.add_instruction(migraphx::make_op("exp"), x);
        auto sum   = info.add_instruction(migraphx::make_op("add"), exp_x, ones);
        return info.add_instruction(migraphx::make_op("log"), sum);
    }
};
} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/onnx/checks.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {
// Parses the ONNX Softsign operator: y = x / (1 + |x|), assembled from
// elementwise primitives since there is no dedicated softsign op.
struct parse_softsign : op_parser<parse_softsign>
{
    std::vector<op_desc> operators() const { return {{"Softsign"}}; }

    instruction_ref parse(const op_desc& /*opd*/,
                          const onnx_parser& /*parser*/,
                          const onnx_parser::node_info& info,
                          std::vector<instruction_ref> args) const
    {
        const auto& x      = args[0];
        const auto x_shape = x->get_shape();
        // Scalar literal 1 broadcast to the input shape.
        auto one  = info.add_literal(migraphx::literal{migraphx::shape{x_shape.type()}, {1}});
        auto ones = info.add_instruction(
            migraphx::make_op("multibroadcast", {{"out_lens", x_shape.lens()}}), one);
        auto abs_x = info.add_instruction(migraphx::make_op("abs"), x);
        auto denom = info.add_instruction(migraphx::make_op("add"), abs_x, ones);
        return info.add_instruction(migraphx::make_op("div"), x, denom);
    }
};
} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {
// Parses the ONNX SpaceToDepth operator: rearranges blocks of spatial data
// into depth, i.e. (N, C, H, W) -> (N, C*b*b, H/b, W/b) for blocksize b,
// implemented as reshape -> transpose -> reshape.
struct parse_spacetodepth : op_parser<parse_spacetodepth>
{
    std::vector<op_desc> operators() const { return {{"SpaceToDepth"}}; }

    instruction_ref parse(const op_desc& /*opd*/,
                          const onnx_parser& /*parser*/,
                          const onnx_parser::node_info& info,
                          std::vector<instruction_ref> args) const
    {
        auto s = args[0]->get_shape();
        // SpaceToDepth is only defined for 4-D NCHW input; guard before
        // indexing dims 2 and 3 below (previously out-of-range on other ranks).
        if(s.lens().size() != 4)
        {
            MIGRAPHX_THROW("SpaceToDepth: input tensor is not rank 4 (NCHW)");
        }
        // blocksize attribute of SpaceToDepth; int64_t to match the attribute's
        // integer type. A blocksize of 1 makes this a no-op.
        int64_t blocksize = 1;
        if(contains(info.attributes, "blocksize"))
        {
            blocksize = info.attributes.at("blocksize").i();
        }
        if(blocksize < 1)
        {
            // blocksize < 1 would describe DepthToSpace rather than SpaceToDepth
            MIGRAPHX_THROW("SpaceToDepth: blocksize is less than 1");
        }
        // calculate output dimensions; H and W must be divisible by blocksize
        auto res_lens = s.lens(); // {N, C, H, W}
        if(((res_lens[2] % blocksize) == 0) and ((res_lens[3] % blocksize) == 0))
        {
            // Co = C * (blocksize ^ 2)
            res_lens[1] = res_lens[1] * blocksize * blocksize;
            // Ho = (H / blocksize)
            res_lens[2] = res_lens[2] / blocksize;
            // Wo = (W / blocksize)
            res_lens[3] = res_lens[3] / blocksize;
        } // res_shape = (N, Co, Ho, Wo)
        else
            MIGRAPHX_THROW("SpaceToDepth: div by blocksize quotient not int ");
        // Intermediate view splitting H and W into (Ho, b) and (Wo, b).
        auto trans_lens = s.lens(); // {N, C, H, W}
        trans_lens[2]   = res_lens[2];
        trans_lens[3]   = blocksize;
        trans_lens.push_back(res_lens[3]);
        trans_lens.push_back(blocksize); // {N, C, Ho, blocksize, Wo, blocksize}
        // Move the two block axes ahead of channel, spatial axes to the back.
        std::vector<int64_t> perm = {0, 3, 5, 1, 2, 4};
        auto temp1 = info.add_instruction(make_op("reshape", {{"dims", trans_lens}}), args[0]);
        auto temp2 = info.add_instruction(make_op("transpose", {{"permutation", perm}}), temp1);
        return info.add_instruction(make_op("reshape", {{"dims", res_lens}}),
                                    info.make_contiguous(temp2));
    }
};
} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -24,7 +24,7 @@ struct parse_split : op_parser<parse_split>
}
auto lens = args[0]->get_shape().lens();
int64_t n_rank = static_cast<int64_t>(lens.size());
int64_t n_rank = lens.size();
int64_t tuned_axis = tune_axis(n_rank, axis, opd.op_name);
std::vector<int64_t> vec_splits;
......
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/onnx/checks.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/make_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {
// Parses the ONNX Upsample operator (nearest mode only). Builds a static
// index map from output coordinates to nearest input coordinates and lowers
// the op to a gather over the flattened input.
struct parse_upsample : op_parser<parse_upsample>
{
    std::vector<op_desc> operators() const { return {{"Upsample"}}; }

    instruction_ref parse(const op_desc& /*opd*/,
                          const onnx_parser& /*parser*/,
                          onnx_parser::node_info info,
                          std::vector<instruction_ref> args) const
    {
        // Only nearest-neighbor upsampling is implemented.
        if(contains(info.attributes, "mode"))
        {
            auto mode = info.attributes.at("mode").s();
            if(mode != "nearest")
            {
                MIGRAPHX_THROW("PARSE_UPSAMPLE: only nearest mode is supported!");
            }
        }

        // Scales (args[1]) must be a compile-time constant.
        auto arg_scale = args[1]->eval();
        check_arg_empty(arg_scale, "PARSE_UPSAMPLE: only constant scale is supported!");
        std::vector<float> vec_scale;
        arg_scale.visit([&](auto v) { vec_scale.assign(v.begin(), v.end()); });

        auto in_s    = args[0]->get_shape();
        auto in_lens = in_s.lens();
        if(in_lens.size() != vec_scale.size())
        {
            MIGRAPHX_THROW("PARSE_UPSAMPLE: ranks of input and scale are different!");
        }

        // Output dims: truncate(in_dim * scale).
        std::vector<std::size_t> out_lens(in_lens.size());
        std::transform(in_lens.begin(),
                       in_lens.end(),
                       vec_scale.begin(),
                       out_lens.begin(),
                       [&](auto idx, auto scale) { return static_cast<std::size_t>(idx * scale); });

        // Per-axis factor mapping an output index back to an input index:
        // (id - 1) / (od - 1), or 1.0 when the axis size is unchanged.
        std::vector<float> idx_scale(in_lens.size());
        std::transform(
            out_lens.begin(),
            out_lens.end(),
            in_lens.begin(),
            idx_scale.begin(),
            [](auto od, auto id) { return (od == id) ? 1.0f : (id - 1.0f) / (od - 1.0f); });

        shape out_s{in_s.type(), out_lens};
        std::vector<int> ind(out_s.elements());

        // For each output element record the flat index of its nearest input
        // element (map out_idx to in_idx).
        shape_for_each(out_s, [&](auto idx) {
            auto in_idx = idx;
            std::transform(idx.begin(),
                           idx.end(),
                           idx_scale.begin(),
                           in_idx.begin(),
                           // nearest mode
                           [](auto index, auto scale) {
                               return static_cast<std::size_t>(std::round(index * scale));
                           });
            // Fix: the index table is int32 (ind_s below is int32_type), so cast
            // to int directly instead of via int64_t to avoid silent narrowing.
            ind[out_s.index(idx)] = static_cast<int>(in_s.index(in_idx));
        });

        // reshape input to one dimension, then gather the mapped elements
        std::vector<int64_t> rsp_lens = {static_cast<int64_t>(in_s.elements())};
        shape ind_s{shape::int32_type, out_lens};
        auto rsp     = info.add_instruction(make_op("reshape", {{"dims", rsp_lens}}), args[0]);
        auto ins_ind = info.add_literal(literal(ind_s, ind));
        return info.add_instruction(make_op("gather", {{"axis", 0}}), rsp, ins_ind);
    }
};
} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -15,6 +15,8 @@
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_PASSES);
void validate_pass(module& mod, const pass& p, tracer trace)
{
(void)mod;
......@@ -82,6 +84,8 @@ module& get_module(module_pass_manager& mpm) { return mpm.get_module(); }
void run_passes(module& mod, const std::vector<pass>& passes, tracer trace)
{
if(enabled(MIGRAPHX_TRACE_PASSES{}))
trace = tracer{std::cout};
for(const auto& p : passes)
{
module_pm{&mod, nullptr, &trace}.run_pass(p);
......@@ -90,6 +94,8 @@ void run_passes(module& mod, const std::vector<pass>& passes, tracer trace)
void run_passes(program& prog, const std::vector<pass>& passes, tracer trace)
{
if(enabled(MIGRAPHX_TRACE_PASSES{}))
trace = tracer{std::cout};
for(const auto& p : passes)
{
auto mods = prog.get_modules();
......
......@@ -13,6 +13,7 @@
#include <migraphx/algorithm.hpp>
#include <migraphx/output_iterator.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/marker.hpp>
#include <iostream>
#include <sstream>
#include <algorithm>
......@@ -179,6 +180,63 @@ void program::finalize()
mm->finalize(this->impl->ctx);
}
// Returns a short textual label for the floating-point class of `x`:
// "inf", "nan", "normal", "subnormal", "zero", or "unknown".
template <class T>
std::string classify(T x)
{
    const int fp_class = std::fpclassify(x);
    if(fp_class == FP_INFINITE)
        return "inf";
    if(fp_class == FP_NAN)
        return "nan";
    if(fp_class == FP_NORMAL)
        return "normal";
    if(fp_class == FP_SUBNORMAL)
        return "subnormal";
    if(fp_class == FP_ZERO)
        return "zero";
    return "unknown";
}
// Collects the set of floating-point class labels (see classify()) present in
// an argument's data. Tuple arguments are handled by recursing into each
// sub-argument and merging the results.
std::unordered_set<std::string> classify_argument(const argument& a)
{
std::unordered_set<std::string> result;
a.visit(
// Tensor case: classify every element.
[&](auto t) {
for(const auto& x : t)
result.insert(classify(x));
},
// Tuple case: recurse into each sub-argument and union the label sets.
[&](const auto& xs) {
for(const auto& x : xs)
{
auto r = classify_argument(x);
result.insert(r.begin(), r.end());
}
});
return result;
}
// Prints an abbreviated view of an argument's contents to `os`: small tensors
// (<= 10 elements) are printed in full, larger ones as the first and last five
// elements separated by an ellipsis. Tuple arguments are printed recursively,
// each sub-argument wrapped in braces.
void preview_argument(std::ostream& os, const argument& a)
{
a.visit(
// Tensor case: full print or head/tail preview depending on size.
[&](auto t) {
if(t.size() <= 10)
{
os << t;
}
else
{
os << to_string_range(t.begin(), t.begin() + 5);
os << ", ..., ";
os << to_string_range(t.end() - 5, t.end());
}
},
// Tuple case: recurse, wrapping each sub-argument in '{' '}'.
[&](const auto& xs) {
for(const auto& x : xs)
{
os << '{';
preview_argument(os, x);
os << '}';
}
});
}
template <class F>
std::vector<argument> generic_eval(const module* mod,
context& ctx,
......@@ -309,8 +367,24 @@ std::vector<argument> program::eval(parameter_map params) const
double t2 = t.record<milliseconds>();
std::cout << "Time: " << t1 << "ms, " << t2 << "ms" << std::endl;
if(trace_level > 1 and ins->name().front() != '@' and
ins->name() != "load")
std::cout << "Output: " << result << std::endl;
ins->name() != "load" and not result.empty())
{
target tgt = make_target(this->impl->target_name);
auto buffer = tgt.copy_from(result);
if(trace_level == 2)
{
std::cout << "Output has "
<< to_string_range(classify_argument(buffer))
<< std::endl;
std::cout << "Output: ";
preview_argument(std::cout, buffer);
std::cout << std::endl;
}
else
{
std::cout << "Output: " << buffer << std::endl;
}
}
return result;
}));
}
......@@ -504,7 +578,28 @@ std::string perf_group(const operation& op)
return op.name();
}
void program::perf_report(std::ostream& os, std::size_t n, parameter_map params) const
void program::mark(const parameter_map& params, marker&& m)
{
auto& ctx = this->impl->ctx;
// Run once by itself
eval(params);
ctx.finish();
// Start marking
m.mark_start(*this);
generic_eval(*this, ctx, params, always([&](auto ins, auto f) {
argument result;
m.mark_start(ins);
result = f();
m.mark_stop(ins);
return result;
}));
m.mark_stop(*this);
}
void program::perf_report(std::ostream& os,
std::size_t n,
parameter_map params,
std::size_t batch) const
{
auto& ctx = this->impl->ctx;
// Run once by itself
......@@ -597,7 +692,8 @@ void program::perf_report(std::ostream& os, std::size_t n, parameter_map params)
os << std::endl;
os << "Rate: " << rate << "/sec" << std::endl;
os << "Batch size: " << batch << std::endl;
os << "Rate: " << rate * batch << "/sec" << std::endl;
os << "Total time: " << total_time << "ms" << std::endl;
os << "Total instructions time: " << total_instruction_time << "ms" << std::endl;
os << "Overhead time: " << overhead_time << "ms"
......
......@@ -269,7 +269,7 @@ std::vector<instruction_ref> rewrite_rnn::vanilla_rnn_cell(bool is_forward,
instruction_ref hidden_out = prog.end();
instruction_ref last_out{};
last_out = prog.insert_instruction(ins, make_op("unsqueeze", {{"axes", {0, 1}}}), sih);
long seq_len = static_cast<long>(get_seq_len(prog, seq, seq_lens));
long seq_len = get_seq_len(prog, seq, seq_lens);
for(long i = 0; i < seq_len; i++)
{
long seq_index = is_forward ? i : (seq_len - 1 - i);
......@@ -556,7 +556,7 @@ std::vector<instruction_ref> rewrite_rnn::gru_cell(bool is_forward,
instruction_ref last_output{};
migraphx::shape seq_shape = seq->get_shape();
migraphx::shape r_shape = r->get_shape();
long hs = static_cast<long>(r_shape.lens()[2]);
long hs = r_shape.lens()[2];
migraphx::shape ss(seq_shape.type(), {seq_shape.lens()[1], r_shape.lens()[2]});
std::vector<float> data(ss.elements(), 1.0f);
......@@ -613,7 +613,7 @@ std::vector<instruction_ref> rewrite_rnn::gru_cell(bool is_forward,
rb_h);
}
long seq_len = static_cast<long>(get_seq_len(prog, seq, seq_lens));
long seq_len = get_seq_len(prog, seq, seq_lens);
for(long i = 0; i < seq_len; i++)
{
long seq_index = is_forward ? i : (seq_len - 1 - i);
......@@ -1032,7 +1032,7 @@ std::vector<instruction_ref> rewrite_rnn::lstm_cell(bool is_forward,
instruction_ref last_cell_output{};
migraphx::shape r_shape = r->get_shape();
long hs = static_cast<long>(r_shape.lens()[2]);
long hs = r_shape.lens()[2];
auto bs = ih->get_shape().lens()[1];
std::vector<int64_t> perm{1, 0};
......@@ -1094,7 +1094,7 @@ std::vector<instruction_ref> rewrite_rnn::lstm_cell(bool is_forward,
ins, make_op("broadcast", {{"axis", 1}, {"out_lens", ic_lens}}), pphf);
}
long seq_len = static_cast<long>(get_seq_len(prog, seq, seq_lens));
long seq_len = get_seq_len(prog, seq, seq_lens);
for(long i = 0; i < seq_len; ++i)
{
long seq_index = is_forward ? i : (seq_len - 1 - i);
......
......@@ -539,6 +539,46 @@ struct find_reshape_cont
}
};
// match sequence of transpose --> contiguous --> reshaper_op
// Builds a matcher for a reshape-like instruction (reshape/squeeze/unsqueeze)
// whose single-use input is a contiguous whose single-use input has a
// transposed shape. Binds the three instructions as "reshaper_ins",
// "cont_ins", and "trans_ins" for use in the apply() of the pass below.
auto match_transpose_contiguous_reshaper()
{
return match::name({"reshape", "squeeze", "unsqueeze"})(
match::used_once(),
match::args(
match::name("contiguous")(
match::used_once(), match::args(match::transpose_shape().bind("trans_ins")))
.bind("cont_ins")))
.bind("reshaper_ins");
};
// finds the pattern of transpose --> contiguous --> reshaper_op --> unary
// application of this matcher moves the unary operation before the contiguous so it becomes
// transpose --> unary --> contiguous --> reshaper_op. later pointwise sub-module can be created out
// of unary --> contiguous --> reshaper_op. Such pattern appears in depthToSpace or spaceToDepth
// operator.
struct find_transpose_contiguous_reshaper_unary
{
// Matches a single-input pointwise (unary) instruction, used once, whose
// argument is the transpose->contiguous->reshaper chain bound above.
auto matcher() const
{
return pointwise(match::used_once(),
match::nargs(1),
match::args(match_transpose_contiguous_reshaper()));
}
void apply(module& p, match::matcher_result r) const
{
auto ins = r.result;
auto reshaper_ins = r.instructions["reshaper_ins"];
auto trans_ins = r.instructions["trans_ins"];
auto cont_ins = r.instructions["cont_ins"];
// Rebuild the chain with the unary op applied directly to the transpose
// output, i.e. transpose --> unary --> contiguous --> reshaper_op.
auto unary_op_name = ins->get_operator().name();
auto unary_ins = p.insert_instruction(cont_ins, make_op(unary_op_name), trans_ins);
auto new_cont_ins = p.insert_instruction(cont_ins, make_op("contiguous"), unary_ins);
// older cont and reshape are removed by deadcode elimination
p.replace_instruction(ins, reshaper_ins->get_operator(), new_cont_ins);
}
};
void simplify_reshapes::apply(module& p) const
{
for(int i = 0; i < 2; i++)
......@@ -553,7 +593,8 @@ void simplify_reshapes::apply(module& p) const
find_concat_transpose{},
find_nested_convert{},
find_nested_slice{},
find_nested_concat{});
find_nested_concat{},
find_transpose_contiguous_reshaper_unary{});
dead_code_elimination{}.apply(p);
}
}
......
......@@ -33,8 +33,6 @@ rocm_set_soversion(migraphx_cpu ${MIGRAPHX_SO_VERSION})
set(MIGRAPHX_ENABLE_ZENDNN Off CACHE BOOL "")
find_package(Threads)
if(MIGRAPHX_ENABLE_ZENDNN)
find_path(ZENDNN_INC_PATH zendnn.hpp)
find_library(ZENDNN_LIB amdZenDNN)
......@@ -53,7 +51,7 @@ if(MIGRAPHX_ENABLE_ZENDNN)
else()
target_link_libraries(migraphx_cpu PRIVATE DNNL::dnnl)
endif()
target_link_libraries(migraphx_cpu PRIVATE migraphx Threads::Threads)
target_link_libraries(migraphx_cpu PRIVATE migraphx)
find_package(OpenMP)
target_link_libraries(migraphx_cpu PUBLIC OpenMP::OpenMP_CXX)
......
......@@ -73,7 +73,7 @@ dnnl::memory::desc to_dnnl_memory_desc(const shape& s)
dnnl::memory to_dnnl_memory(const dnnl::memory::desc& desc, const argument& a)
{
return dnnl::memory(desc, get_dnnl_context().engine, a.data());
return {desc, get_dnnl_context().engine, a.data()};
}
dnnl::memory to_dnnl_memory(const argument& a)
......
......@@ -91,28 +91,34 @@ add_library(migraphx_device
device/unary_not.cpp
device/where.cpp
)
set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
rocm_set_soversion(migraphx_device ${MIGRAPHX_SO_VERSION})
rocm_clang_tidy_check(migraphx_device)
target_compile_options(migraphx_device PRIVATE -std=c++17 -fno-gpu-rdc -Wno-unused-command-line-argument -Xclang -fallow-half-arguments-and-returns)
target_link_libraries(migraphx_device migraphx hip::device -fno-gpu-rdc -Wno-invalid-command-line-argument -Wno-unused-command-line-argument)
if(CMAKE_CXX_COMPILER MATCHES ".*hcc")
set(AMDGPU_TARGETS "gfx803;gfx900;gfx906" CACHE STRING "")
foreach(AMDGPU_TARGET ${AMDGPU_TARGETS})
target_compile_options(migraphx_device PRIVATE -amdgpu-target=${AMDGPU_TARGET})
target_link_libraries(migraphx_device -amdgpu-target=${AMDGPU_TARGET})
endforeach()
else()
target_compile_options(migraphx_device PRIVATE -Wno-cuda-compat)
endif()
add_library(compile_for_gpu INTERFACE)
target_compile_options(compile_for_gpu INTERFACE -std=c++17 -fno-gpu-rdc -Wno-cuda-compat -Wno-unused-command-line-argument -Xclang -fallow-half-arguments-and-returns)
target_link_libraries(compile_for_gpu INTERFACE hip::device -fno-gpu-rdc -Wno-invalid-command-line-argument -Wno-unused-command-line-argument)
check_cxx_compiler_flag("--cuda-host-only -fhip-lambda-host-device -x hip" HAS_HIP_LAMBDA_HOST_DEVICE)
if(HAS_HIP_LAMBDA_HOST_DEVICE)
message(STATUS "Enable -fhip-lambda-host-device")
target_compile_options(migraphx_device PRIVATE -fhip-lambda-host-device)
target_compile_options(compile_for_gpu INTERFACE -fhip-lambda-host-device)
endif()
set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
rocm_set_soversion(migraphx_device ${MIGRAPHX_SO_VERSION})
rocm_clang_tidy_check(migraphx_device)
target_link_libraries(migraphx_device PUBLIC migraphx)
target_link_libraries(migraphx_device PRIVATE compile_for_gpu)
target_include_directories(migraphx_device PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
target_include_directories(migraphx_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/device/include>)
add_library(kernel_file_check EXCLUDE_FROM_ALL)
foreach(KERNEL_FILE ${KERNEL_FILES})
get_filename_component(KERNEL_BASE_FILE ${KERNEL_FILE} NAME_WE)
file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/kernels/include/migraphx/kernels/${KERNEL_BASE_FILE}.cpp "#include <migraphx/kernels/${KERNEL_BASE_FILE}.hpp>\n")
target_sources(kernel_file_check PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/kernels/include/migraphx/kernels/${KERNEL_BASE_FILE}.cpp)
endforeach()
target_include_directories(kernel_file_check PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/kernels/include/>)
target_link_libraries(kernel_file_check compile_for_gpu)
rocm_clang_tidy_check(kernel_file_check)
add_library(migraphx_gpu
abs.cpp
analyze_streams.cpp
......@@ -122,9 +128,11 @@ add_library(migraphx_gpu
batch_norm_inference.cpp
clip.cpp
code_object_op.cpp
compile_ops.cpp
compile_hip.cpp
compile_hip_code_object.cpp
compile_pointwise.cpp
compile_roialign.cpp
concat.cpp
convert.cpp
convolution.cpp
......@@ -308,8 +316,12 @@ target_flags(HIP_COMPILER_FLAGS hip::device)
# Remove cuda arch flags
string(REGEX REPLACE --cuda-gpu-arch=[a-z0-9]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
string(REGEX REPLACE --offload-arch=[a-z0-9:+-]+ "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
string(REPLACE "$<LINK_LANGUAGE:CXX>" "1" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
string(REPLACE "SHELL:" "" HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
# Skip library paths since hip will incorrectly treat it as a source file
string(APPEND HIP_COMPILER_FLAGS " ")
foreach(_unused RANGE 2)
string(REGEX REPLACE " /[^ ]+\\.(a|so) " " " HIP_COMPILER_FLAGS "${HIP_COMPILER_FLAGS}")
endforeach()
message(STATUS "Hip compiler flags: ${HIP_COMPILER_FLAGS}")
target_compile_definitions(migraphx_gpu PRIVATE
"-DMIGRAPHX_HIP_COMPILER=${CMAKE_CXX_COMPILER}"
......@@ -339,7 +351,7 @@ target_link_libraries(migraphx_gpu PRIVATE migraphx_device migraphx_kernels)
add_subdirectory(driver)
rocm_install_targets(
TARGETS migraphx_gpu migraphx_device
TARGETS migraphx_gpu migraphx_device compile_for_gpu
INCLUDE
${CMAKE_CURRENT_SOURCE_DIR}/include
)
......
#include <migraphx/gpu/allocation_model.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/module.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......
......@@ -9,7 +9,7 @@ namespace gpu {
shape hip_argmax::compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(2).standard();
check_shapes{inputs, *this}.has(2);
return op.normalize_compute_shape({inputs.at(0)});
}
......
......@@ -9,7 +9,7 @@ namespace gpu {
shape hip_argmin::compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(2).standard();
check_shapes{inputs, *this}.has(2);
return op.normalize_compute_shape({inputs.at(0)});
}
......
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/errors.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/env.hpp>
#include <cassert>
#include <iostream>
......@@ -230,6 +231,20 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
return {compiler.compile(srcs)};
}
// Builds a comma-separated list of numbered parameter names:
// enum_params(3, "x") -> "x0,x1,x2"; an empty string when count is 0.
std::string enum_params(std::size_t count, std::string param)
{
    std::string result;
    for(std::size_t i = 0; i < count; ++i)
    {
        if(i != 0)
            result += ",";
        result += param + std::to_string(i);
    }
    return result;
}
// Computes the global launch size for a kernel: ceil(n / local) workgroups,
// capped at 256 groups, multiplied by the workgroup (local) size.
std::size_t compute_global(std::size_t n, std::size_t local)
{
    const std::size_t group_count = (n + local - 1) / local;
    const std::size_t capped      = group_count < 256 ? group_count : 256;
    return capped * local;
}
#endif // MIGRAPHX_USE_HIPRTC
} // namespace gpu
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment