Commit 712f6134 authored by Shucai Xiao

merge changes from develop branch and resolve merge conflicts

parents 4a39a0f7 b20e3d4d
#ifndef MIGRAPHX_GUARD_OPERATORS_ROIALIGN_HPP
#define MIGRAPHX_GUARD_OPERATORS_ROIALIGN_HPP
#include <limits>
#include <migraphx/check_shapes.hpp>
#include <migraphx/config.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/shape_for_each.hpp>
#include <cmath>
#include <numeric>
#include <utility>
#include <vector>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
struct roialign
{
std::string coord_trans_mode = "half_pixel";
std::string mode = "avg";
int64_t output_height = 1;
int64_t output_width = 1;
int64_t sampling_ratio = 0;
float spatial_scale = 1.0f;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.coord_trans_mode, "coordinate_transformation_mode"),
f(self.mode, "mode"),
f(self.output_height, "output_height"),
f(self.output_width, "output_width"),
f(self.sampling_ratio, "sampling_ratio"),
f(self.spatial_scale, "spatial_scale"));
}
std::string name() const { return "roialign"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(3).standard();
auto x_lens = inputs.at(0).lens();
auto roi_lens = inputs.at(1).lens();
auto bi_lens = inputs.at(2).lens();
auto type = inputs.at(0).type();
// check that the inputs are correct
if(bi_lens.size() != 1)
{
MIGRAPHX_THROW("ROIALIGN: batch indices should be 1 dimension!");
}
if(roi_lens.size() != 2 or roi_lens.at(1) != 4)
{
MIGRAPHX_THROW(
"ROIALIGN: rois should have 2 dimensions, and the second dim should be 4!");
}
if(roi_lens.front() != bi_lens.front())
{
MIGRAPHX_THROW("ROIALIGN: rois and batch indices inputs should have the same number!");
}
std::vector<std::size_t> out_lens = x_lens;
out_lens[0] = roi_lens[0];
out_lens[2] = output_height;
out_lens[3] = output_width;
return {type, out_lens};
}
struct pos_weight
{
// neighbor indices for the bilinear interpolation
std::array<std::size_t, 4> pos = {0, 0, 0, 0};
// neighbor weights for the bilinear interpolation
std::array<float, 4> w = {0.0f, 0.0f, 0.0f, 0.0f};
};
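// Precompute, for each (output bin, grid sample) pair, the four neighbor
// indices and bilinear weights; these are shared across all channels.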
auto calc_pos_weight(const std::array<std::size_t, 2>& dims,
const shape& comp_s,
const std::array<float, 2>& roi_start,
const std::array<float, 2>& bin_size,
const std::array<std::size_t, 2>& bin_grid_size) const
{
std::vector<pos_weight> results(bin_grid_size[0] * bin_grid_size[1] * output_height *
output_width);
shape_for_each(comp_s, [&](auto idx) {
std::array<std::size_t, 2> p = {idx[0], idx[1]};
std::array<std::size_t, 2> i = {idx[2], idx[3]};
auto index = comp_s.index(idx);
std::array<float, 2> xy{};
std::array<int64_t, 2> low{};
std::array<int64_t, 2> high{};
for(auto ii : range(p.size()))
{
xy[ii] = roi_start[ii] + p[ii] * bin_size[ii] +
(i[ii] + .5f) * bin_size[ii] / bin_grid_size[ii];
xy[ii] = (coord_trans_mode == "output_half_pixel") ? (xy[ii] - 0.5f) : xy[ii];
if(xy[ii] < -1.0 or xy[ii] > dims[ii])
{
results[index] = pos_weight{};
return;
}
xy[ii] = std::max(xy[ii], 0.0f);
low[ii] = xy[ii];
high[ii] = low[ii] + 1;
if(low[ii] >= dims[ii] - 1)
{
xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
}
}
results[index].pos = {low[0] * dims[1] + low[1],
low[0] * dims[1] + high[1],
high[0] * dims[1] + low[1],
high[0] * dims[1] + high[1]};
float ly = xy[0] - low[0];
float lx = xy[1] - low[1];
float hy = 1.0f - ly;
float hx = 1.0f - lx;
// save weights and indices
results[index].w = {hy * hx, hy * lx, ly * hx, ly * lx};
});
return results;
}
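// Pooling functors used by calc_pooling: init() supplies the identity
// value, operator() accumulates one weighted sample, and final()
// normalizes the result (avg divides by the sample count; max does not).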
struct max_pool
{
double init() { return std::numeric_limits<double>::lowest(); }
double operator()(double x, double y) { return std::max(x, y); }
double final(double x, std::size_t) { return (x); }
};
struct avg_pool
{
double init() { return 0.0; }
double operator()(double x, double y) { return x + y; }
double final(double x, std::size_t y) { return (y == 0) ? 0.0 : (x / y); }
};
template <class T, class Op>
std::tuple<double, int64_t> calc_pooling(const T& data,
const std::array<std::size_t, 2>& bin_grid_size,
const std::vector<pos_weight>& pos_weights,
int64_t index,
Op op) const
{
double output_val = op.init();
const int64_t count = bin_grid_size[0] * bin_grid_size[1];
dfor(bin_grid_size[0], bin_grid_size[1])([&](auto, auto) {
const auto& pc = pos_weights[index];
std::array<double, 4> wv;
std::transform(
pc.w.begin(), pc.w.end(), pc.pos.begin(), wv.begin(), [&](auto w, auto pos) {
return *(data + pos) * w;
});
output_val = std::accumulate(wv.begin(), wv.end(), output_val, op);
index += 1;
});
output_val = op.final(output_val, count);
return {output_val, index};
}
argument compute(const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
const auto& out_lens = output_shape.lens();
int64_t n_rois = out_lens[0];
std::size_t channels = out_lens[1];
// output dims of height and width; in all 2-dim arrays, the first dim
// is height and the second is width
std::array<std::size_t, 2> out_dims = {out_lens[2], out_lens[3]};
const auto& x_lens = args.at(0).get_shape().lens();
// input dims of height and width
std::array<std::size_t, 2> in_dims = {x_lens[2], x_lens[3]};
auto roi_s = args.at(1).get_shape();
visit_all(result, args.at(0), args.at(1))([&](auto output, auto x, auto roi) {
const auto* batch_indices = args.at(2).cast<int64_t>();
par_for(n_rois, [&](auto n) {
const auto bottom_data = x.begin();
const auto roi_batch_ind = batch_indices[n];
// Do not use rounding; this implementation detail is critical
std::array<float, 2> roi_starts = {
static_cast<float>(roi[roi_s.index({n, 1})] * spatial_scale),
static_cast<float>(roi[roi_s.index({n, 0})] * spatial_scale)};
std::array<float, 2> roi_ends = {
static_cast<float>(roi[roi_s.index({n, 3})] * spatial_scale),
static_cast<float>(roi[roi_s.index({n, 2})] * spatial_scale)};
// Force malformed ROIs to be 1x1
std::array<float, 2> roi_size{};
std::array<float, 2> bin_size{};
std::array<std::size_t, 2> bin_grid_size{};
for(auto ii : range(roi_size.size()))
{
roi_size[ii] = roi_ends[ii] - roi_starts[ii];
roi_size[ii] = std::max(roi_size[ii], 1.0f);
bin_size[ii] = roi_size[ii] / out_dims[ii];
bin_grid_size[ii] = (sampling_ratio > 0)
? sampling_ratio
: std::ceil(roi_size[ii] / out_dims[ii]);
}
// precalculate the indices and weights shared by all channels;
// this is the key optimization
std::vector<std::size_t> comp_lens = {
out_dims[0], out_dims[1], bin_grid_size[0], bin_grid_size[1]};
shape comp_s{shape::float_type, comp_lens};
auto pre_calc =
this->calc_pos_weight(in_dims, comp_s, roi_starts, bin_size, bin_grid_size);
std::vector<std::size_t> comp_lens1 = {channels, out_dims[0], out_dims[1]};
shape comp_s1{migraphx::shape::float_type, comp_lens1};
std::vector<int64_t> vec_index(channels, 0);
shape_for_each(comp_s1, [&](auto idx) {
auto c = idx[0];
auto ph = idx[1];
auto pw = idx[2];
const auto offset_bottom_data =
bottom_data + static_cast<int64_t>((roi_batch_ind * channels + c) *
in_dims[0] * in_dims[1]);
double output_val;
std::tie(output_val, vec_index[c]) =
(mode == "avg") ? this->calc_pooling(offset_bottom_data,
bin_grid_size,
pre_calc,
vec_index[c],
avg_pool{})
: this->calc_pooling(offset_bottom_data,
bin_grid_size,
pre_calc,
vec_index[c],
max_pool{});
output(n, c, ph, pw) = output_val;
});
});
});
return result;
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
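For reference, a minimal standalone sketch of the bilinear-weight computation that `calc_pos_weight` performs for one sample point; the `dims` and `xy` values are illustrative assumptions, not part of the MIGraphX API:

```cpp
// Standalone sketch: bilinear weights for one sample point, as computed
// in calc_pos_weight. dims and xy are illustrative values only.
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <iostream>

int main()
{
    std::array<std::size_t, 2> dims = {8, 8};       // feature map {height, width}
    std::array<float, 2> xy         = {2.3f, 5.7f}; // sample point {y, x}

    std::array<std::int64_t, 2> low{};
    std::array<std::int64_t, 2> high{};
    for(std::size_t ii = 0; ii < 2; ++ii)
    {
        xy[ii]   = std::max(xy[ii], 0.0f);
        low[ii]  = xy[ii]; // truncation is floor for non-negative values
        high[ii] = low[ii] + 1;
        if(low[ii] >= static_cast<std::int64_t>(dims[ii]) - 1) // clamp at border
            xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
    }
    const float ly = xy[0] - low[0];
    const float lx = xy[1] - low[1];
    const float hy = 1.0f - ly;
    const float hx = 1.0f - lx;
    // neighbor weights {top-left, top-right, bottom-left, bottom-right};
    // they always sum to 1
    std::cout << hy * hx << ' ' << hy * lx << ' ' << ly * hx << ' ' << ly * lx << '\n';
}
```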
......@@ -18,6 +18,7 @@ namespace op {
struct sigmoid : unary<sigmoid>
{
std::string point_op() const { return "1.f / (1.f + ${function:exp}(-${0}))"; }
auto apply() const
{
return [](auto x) { return 1.f / (1.f + std::exp(-x)); };
......
......@@ -18,6 +18,7 @@ namespace op {
struct sign : unary<sign>
{
std::string point_op() const { return "(${0} > 0 ? 1 : ((${0} < 0) ? -1 : 0))"; }
auto apply() const
{
return [](auto x) { return (x > 0 ? 1 : ((x < 0) ? -1 : 0)); };
......
......@@ -45,7 +45,7 @@ struct topk
shape s_val{type, lens};
shape s_ind{shape::int64_type, lens};
return shape({s_val, s_ind});
return {{s_val, s_ind}};
}
template <class T, class Compare>
......@@ -131,7 +131,7 @@ struct topk
});
});
return argument({res_val, res_ind});
return {{res_val, res_ind}};
}
};
......
......@@ -103,79 +103,69 @@ auto operator==(const T& x, const U& y) -> decltype(x.name() == y.name())
} // namespace operation_operators
template <class T>
auto normalize_compute_shape_op(rank<1>, const T& x, const std::vector<shape>& inputs)
-> decltype(x.normalize_compute_shape(inputs))
{
dependent_type<operation, T> y = x;
normalize_attributes(y, inputs[0].lens());
return any_cast<T>(y).normalize_compute_shape(inputs);
}
template <class T>
shape normalize_compute_shape_op(rank<0>, const T& x, const std::vector<shape>&)
auto compute_shape_op(rank<3>, const T& x, const std::vector<shape>& inputs)
-> decltype(x.compute_shape(inputs))
{
std::string name = x.name();
MIGRAPHX_THROW("Shape not computable: " + name);
return x.compute_shape(inputs);
}
template <class T>
shape normalize_compute_shape_op(const T& x, const std::vector<shape>& inputs)
auto compute_shape_op(rank<2>, const T& x, const std::vector<shape>& inputs)
-> decltype(x.normalize_compute_shape(inputs))
{
return normalize_compute_shape_op(rank<1>{}, x, inputs);
dependent_type<operation, T> y = x;
normalize_attributes(y, inputs[0].lens());
return any_cast<T>(y).normalize_compute_shape(inputs);
}
template <class T>
auto compute_shape_op(rank<1>,
const T& x,
const std::vector<shape>& inputs,
const std::vector<module_ref>& mod_args)
-> decltype(x.compute_shape(inputs, mod_args))
auto compute_shape_op(rank<1>, const T& x, const std::vector<shape>& inputs)
-> decltype(x.compute_shape(inputs, {}))
{
return x.compute_shape(inputs, mod_args);
return x.compute_shape(inputs, {});
}
template <class T>
shape
compute_shape_op(rank<0>, const T& x, const std::vector<shape>&, const std::vector<module_ref>&)
shape compute_shape_op(rank<0>, const T& x, const std::vector<shape>&)
{
std::string name = x.name();
MIGRAPHX_THROW("Shape not computable: " + name);
}
template <class T>
shape compute_shape_op(const T& x,
const std::vector<shape>& inputs,
const std::vector<module_ref>& mod_args)
shape compute_shape_op(const T& x, const std::vector<shape>& inputs)
{
return compute_shape_op(rank<1>{}, x, inputs, mod_args);
return compute_shape_op(rank<3>{}, x, inputs);
}
template <class T>
auto normalize_compute_shape_op(rank<1>,
const T& x,
const std::vector<shape>& inputs,
std::vector<module_ref>& mod_args)
-> decltype(x.normalize_compute_shape(inputs, mod_args))
auto mod_compute_shape_op(rank<1>,
const T& x,
const std::vector<shape>& inputs,
const std::vector<module_ref>& mod_args)
-> decltype(x.compute_shape(inputs, mod_args))
{
return x.normalize_compute_shape(inputs, mod_args);
return x.compute_shape(inputs, mod_args);
}
template <class T>
shape normalize_compute_shape_op(rank<0>,
const T& x,
const std::vector<shape>&,
const std::vector<module_ref>&)
shape mod_compute_shape_op(rank<0>,
const T& x,
const std::vector<shape>& inputs,
const std::vector<module_ref>& mod_args)
{
if(mod_args.empty())
return compute_shape_op(x, inputs);
std::string name = x.name();
MIGRAPHX_THROW("Shape not computable: " + name);
}
template <class T>
shape normalize_compute_shape_op(const T& x,
const std::vector<shape>& inputs,
std::vector<module_ref>& mod_args)
shape mod_compute_shape_op(const T& x,
const std::vector<shape>& inputs,
const std::vector<module_ref>& mod_args)
{
return normalize_compute_shape_op(rank<1>{}, x, inputs, mod_args);
return mod_compute_shape_op(rank<1>{}, x, inputs, mod_args);
}
template <class T>
......@@ -848,7 +838,7 @@ struct operation
T&& private_detail_te_self,
const std::vector<shape>& input)
{
return detail::normalize_compute_shape_op(private_detail_te_self, input);
return detail::compute_shape_op(private_detail_te_self, input);
}
template <class T>
......@@ -867,7 +857,7 @@ struct operation
const std::vector<shape>& inputs,
const std::vector<module_ref>& mod_args)
{
return detail::compute_shape_op(private_detail_te_self, inputs, mod_args);
return detail::mod_compute_shape_op(private_detail_te_self, inputs, mod_args);
}
template <class T>
......@@ -1269,7 +1259,7 @@ template <class T>
inline auto compute_shape(const T& op, const std::vector<shape>& inputs)
-> decltype(op.normalize_compute_shape(inputs))
{
return detail::normalize_compute_shape_op(op, inputs);
return detail::compute_shape_op(op, inputs);
}
inline shape compute_shape(const operation& op,
......@@ -1294,7 +1284,7 @@ inline auto compute_shape(const T& op,
const std::vector<module_ref>& mod_args)
-> decltype(op.normalize_compute_shape(inputs, mod_args))
{
return detail::normalize_compute_shape_op(op, inputs, mod_args);
return detail::compute_shape_op(op, inputs, mod_args);
}
inline bool is_context_free(const operation& op) { return op.is_context_free(); }
......
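The hunk above reorders the shape-computation candidates with `rank<N>` tag dispatch: the overload whose trailing `decltype` compiles at the highest rank wins, and each failed substitution falls through to the next rank, ending in the throwing `rank<0>` overload. A self-contained sketch of the idiom, with hypothetical toy types (`has_plain` and `has_normalized` are not MIGraphX classes):

```cpp
// Self-contained sketch of rank-based tag dispatch; `rank` mirrors
// migraphx::rank, and has_plain/has_normalized are hypothetical types.
#include <iostream>
#include <stdexcept>

template <int N>
struct rank : rank<N - 1>
{
};
template <>
struct rank<0>
{
};

struct has_plain
{
    int compute_shape(int x) const { return x; }
};
struct has_normalized
{
    int normalize_compute_shape(int x) const { return x + 1; }
};

// Highest priority: a plain compute_shape member wins if it compiles.
template <class T>
auto dispatch(rank<2>, const T& op, int x) -> decltype(op.compute_shape(x))
{
    return op.compute_shape(x);
}

// Fallback: use normalize_compute_shape when the above fails.
template <class T>
auto dispatch(rank<1>, const T& op, int x) -> decltype(op.normalize_compute_shape(x))
{
    return op.normalize_compute_shape(x);
}

// Last resort, mirroring the MIGRAPHX_THROW overload.
template <class T>
int dispatch(rank<0>, const T&, int)
{
    throw std::runtime_error("Shape not computable");
}

template <class T>
int dispatch(const T& op, int x)
{
    return dispatch(rank<2>{}, op, x);
}

int main()
{
    std::cout << dispatch(has_plain{}, 3) << '\n';      // 3, via compute_shape
    std::cout << dispatch(has_normalized{}, 3) << '\n'; // 4, via the fallback
}
```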
......@@ -57,6 +57,7 @@
#include <migraphx/op/mul.hpp>
#include <migraphx/op/multibroadcast.hpp>
#include <migraphx/op/neg.hpp>
#include <migraphx/op/nonmaxsuppression.hpp>
#include <migraphx/op/nonzero.hpp>
#include <migraphx/op/outline.hpp>
#include <migraphx/op/pad.hpp>
......@@ -80,6 +81,7 @@
#include <migraphx/op/rnn_last_hs_output.hpp>
#include <migraphx/op/rnn_variable_seq_lens.hpp>
#include <migraphx/op/rnn_var_sl_last_output.hpp>
#include <migraphx/op/roialign.hpp>
#include <migraphx/op/round.hpp>
#include <migraphx/op/rsqrt.hpp>
#include <migraphx/op/scalar.hpp>
......
......@@ -23,6 +23,8 @@ MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_EVAL)
struct program_impl;
struct marker;
/**
* @brief Stores the instruction stream
*/
......@@ -65,7 +67,10 @@ struct program
void finalize();
void perf_report(std::ostream& os, std::size_t n, parameter_map params) const;
void
perf_report(std::ostream& os, std::size_t n, parameter_map params, std::size_t batch = 1) const;
void mark(const parameter_map& params, marker&& m);
value to_value() const;
void from_value(const value& v);
......
......@@ -106,7 +106,7 @@ argument run_loop(const LoopModel& model,
std::copy(in_args.begin() + 2, in_args.end(), out_args.begin());
model.set_zero(ctx, scan_outputs, iter);
return argument(out_args);
return {out_args};
}
} // namespace MIGRAPHX_INLINE_NS
......
......@@ -18,7 +18,7 @@ inline namespace MIGRAPHX_INLINE_NS {
template <class F>
auto with_char(F f)
{
return [=](unsigned char c) { return f(c); };
return [=](unsigned char c) -> bool { return f(c); };
}
inline std::string
......@@ -71,7 +71,7 @@ std::string trim(const std::string& s, F f)
{
auto start = std::find_if_not(s.begin(), s.end(), f);
auto last = std::find_if_not(s.rbegin(), std::string::const_reverse_iterator(start), f).base();
return std::string(start, last);
return {start, last};
}
inline std::string trim(const std::string& s)
......@@ -120,22 +120,27 @@ interpolate_string(const std::string& input, F f, std::string start = "${", std:
result.append(it, next_start);
if(next_start == input.end())
break;
auto r = f(next_start + start.size(), next_end - end.size() + 1);
auto r = f(next_start + start.size(), next_end);
result.append(r.begin(), r.end());
it = next_end + 1;
it = next_end + end.size();
}
return result;
}
inline std::string interpolate_string(const std::string& input,
const std::unordered_map<std::string, std::string>& vars)
{
return interpolate_string(input, [&](auto start, auto last) {
auto key = trim({start, last});
auto it = vars.find(key);
if(it == vars.end())
throw std::runtime_error("Unknown key: " + key);
return it->second;
});
const std::unordered_map<std::string, std::string>& vars,
std::string start = "${",
std::string end = "}")
{
return interpolate_string(input,
[&](auto start_it, auto last_it) {
auto key = trim({start_it, last_it});
auto it = vars.find(key);
if(it == vars.end())
throw std::runtime_error("Unknown key: " + key);
return it->second;
},
std::move(start),
std::move(end));
}
template <class Iterator>
......@@ -163,7 +168,8 @@ inline std::string to_string_range(const std::initializer_list<T>& r)
}
template <class T>
inline std::string to_string(const T& x)
inline auto to_string(const T& x)
-> decltype((std::declval<std::stringstream>() << x), std::string{})
{
std::stringstream ss;
ss << x;
......
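A hedged usage sketch of the new `interpolate_string` delimiter parameters; the header path is taken from this diff, and the multi-character case relies on the `it = next_end + end.size()` fix shown above:

```cpp
// Hedged usage sketch of the configurable delimiters; output values are
// assumed from the code above.
#include <iostream>
#include <string>
#include <unordered_map>
#include <migraphx/stringutils.hpp>

int main()
{
    std::unordered_map<std::string, std::string> vars = {{"name", "migraphx"}};
    // default "${...}" delimiters
    std::cout << migraphx::interpolate_string("hello ${name}", vars) << '\n';
    // custom multi-character delimiters; keys are trimmed, and the loop
    // now advances past the whole closing token (next_end + end.size())
    std::cout << migraphx::interpolate_string("hello {{ name }}", vars, "{{", "}}") << '\n';
}
```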
......@@ -11,49 +11,8 @@ inline namespace MIGRAPHX_INLINE_NS {
static void inline_submodule(module& m, instruction_ref ins, bool cond)
{
const auto& mod_inputs = ins->module_inputs();
const auto* smod = cond ? mod_inputs.at(0) : mod_inputs.at(1);
std::unordered_map<instruction_ref, instruction_ref> map_ins;
std::vector<instruction_ref> mod_outputs;
for(auto sins : iterator_for(*smod))
{
instruction_ref copy_ins{};
if(sins->name() == "@literal")
{
auto l = sins->get_literal();
copy_ins = m.add_literal(l);
}
else if(sins->name() == "@param")
{
auto&& name = any_cast<builtin::param>(sins->get_operator()).parameter;
auto s = sins->get_shape();
copy_ins = m.add_parameter(name, s);
}
else if(sins->name() == "@outline")
{
auto s = sins->get_shape();
copy_ins = m.add_outline(s);
}
else
{
auto mod_args = sins->module_inputs();
auto inputs = sins->inputs();
std::vector<instruction_ref> copy_inputs(inputs.size());
std::transform(inputs.begin(), inputs.end(), copy_inputs.begin(), [&](auto i) {
return contains(map_ins, i) ? map_ins[i] : i;
});
if(sins->name() == "@return")
{
mod_outputs = copy_inputs;
break;
}
copy_ins = m.insert_instruction(ins, sins->get_operator(), copy_inputs, mod_args);
}
map_ins[sins] = copy_ins;
mod_outputs = {copy_ins};
}
module_ref smod = cond ? mod_inputs.at(0) : mod_inputs.at(1);
auto mod_outputs = m.insert_module_instructions(ins, smod);
auto ins_outputs = ins->outputs();
assert(mod_outputs.size() >= ins_outputs.size());
......
......@@ -468,5 +468,11 @@ std::vector<shape> try_compute_shape(const operation& op, const std::vector<shap
}
return {new_shape};
}
migraphx::instruction* as_address(const instruction_ref& ins) noexcept
{
return std::addressof(*ins);
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <iterator>
#include <migraphx/module.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/instruction.hpp>
......@@ -178,6 +179,7 @@ instruction_ref module::insert_instruction(instruction_ref ins,
const operation& op,
std::vector<instruction_ref> args)
{
assert(has_instruction(ins) or is_end(ins, this->end()));
assert(not starts_with(op.name(), "@"));
shape r = compute_shape(op, args);
auto result = impl->insert(ins, {op, r, std::move(args)});
......@@ -199,6 +201,7 @@ instruction_ref module::insert_instruction(instruction_ref ins,
std::vector<instruction_ref> args,
std::vector<module_ref> module_args)
{
assert(has_instruction(ins) or is_end(ins, this->end()));
assert(not starts_with(op.name(), "@"));
auto out_shape = compute_shape(op, args, module_args);
auto result = impl->insert(ins, {op, out_shape, std::move(args), std::move(module_args)});
......@@ -211,6 +214,7 @@ instruction_ref module::replace_instruction(instruction_ref ins,
const operation& op,
std::vector<instruction_ref> args) MIGRAPHX_TIDY_CONST
{
assert(has_instruction(ins));
assert(not starts_with(op.name(), "@"));
shape r = compute_shape(op, args);
......@@ -224,6 +228,7 @@ instruction_ref module::replace_instruction(instruction_ref ins,
std::vector<instruction_ref> args,
std::vector<module_ref> module_args) MIGRAPHX_TIDY_CONST
{
assert(has_instruction(ins));
assert(not starts_with(op.name(), "@"));
auto out_shape = compute_shape(op, args, module_args);
instruction::replace(ins, op, out_shape, std::move(args), std::move(module_args));
......@@ -290,6 +295,8 @@ instruction_ref module::remove_instructions(instruction_ref first, instruction_r
instruction_ref module::move_instruction(instruction_ref src, instruction_ref dst)
{
assert(has_instruction(src));
assert(has_instruction(dst) or is_end(dst, this->end()));
impl->instructions.splice(dst, impl->instructions, src);
return src;
}
......@@ -302,6 +309,55 @@ instruction_ref module::move_instructions(instruction_ref src, instruction_ref d
return src;
}
std::vector<instruction_ref> module::insert_module_instructions(
instruction_ref ins, module_ref m, std::unordered_map<instruction_ref, instruction_ref> map_ins)
{
std::vector<instruction_ref> mod_outputs;
for(auto sins : iterator_for(*m))
{
if(contains(map_ins, sins))
continue;
instruction_ref copy_ins;
if(sins->name() == "@literal")
{
auto l = sins->get_literal();
copy_ins = this->add_literal(l);
}
else if(sins->name() == "@param")
{
auto&& name = any_cast<builtin::param>(sins->get_operator()).parameter;
auto s = sins->get_shape();
copy_ins = this->add_parameter(name, s);
}
else if(sins->name() == "@outline")
{
auto s = sins->get_shape();
copy_ins = this->add_outline(s);
}
else
{
auto mod_args = sins->module_inputs();
auto inputs = sins->inputs();
std::vector<instruction_ref> copy_inputs(inputs.size());
std::transform(inputs.begin(), inputs.end(), copy_inputs.begin(), [&](auto i) {
return contains(map_ins, i) ? map_ins[i] : i;
});
if(sins->name() == "@return")
{
mod_outputs = copy_inputs;
break;
}
copy_ins = this->insert_instruction(ins, sins->get_operator(), copy_inputs, mod_args);
}
map_ins[sins] = copy_ins;
}
if(mod_outputs.empty())
mod_outputs = {map_ins.at(std::prev(m->end()))};
return mod_outputs;
}
instruction_ref module::add_literal(literal l)
{
impl->emplace_front(std::move(l));
......@@ -332,6 +388,20 @@ instruction_ref module::add_return(std::vector<instruction_ref> args)
return result;
}
instruction_ref module::replace_return(std::vector<instruction_ref> args)
{
auto last = std::prev(this->end());
// If there is no return instruction, add one
if(last->name() != "@return")
return this->add_return(args);
shape r = compute_shape(last->get_operator(), args);
instruction::replace(last, last->get_operator(), r, std::move(args));
assert(last->valid(begin()));
return last;
}
shape module::get_parameter_shape(std::string name) const
{
auto ins = std::find_if(
......
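A hedged usage sketch of the new `replace_return`; the module name, shape, and `relu` op are assumptions for illustration only:

```cpp
// Hedged usage sketch: swap a module's outputs without rebuilding it.
#include <migraphx/module.hpp>
#include <migraphx/make_op.hpp>

int main()
{
    migraphx::module m("example");
    migraphx::shape s{migraphx::shape::float_type, {2, 2}};
    auto x = m.add_parameter("x", s);
    auto r = m.add_instruction(migraphx::make_op("relu"), x);
    m.add_return({r});
    // redirect the module's output without rebuilding it; if no @return
    // existed yet, replace_return simply adds one
    m.replace_return({x});
}
```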
......@@ -20,7 +20,7 @@ auto tune_attribute(const std::vector<int64_t>& vec,
const std::vector<std::size_t>& lens)
{
std::vector<int64_t> result(vec);
int64_t n_rank = static_cast<int64_t>(lens.size());
int64_t n_rank = lens.size();
std::vector<op::normalize_attribute> vec_attrs = val.to_vector<op::normalize_attribute>();
if(contains(vec_attrs, op::normalize_attribute::use_output))
{
......
......@@ -39,7 +39,7 @@ struct parse_gather_elements : op_parser<parse_gather_elements>
int tuned_axis = tune_axis(n_rank, axis, opd.op_name);
auto axis_stride = data_s.strides()[tuned_axis];
int64_t data_elem_num = static_cast<int64_t>(data_s.elements());
int64_t data_elem_num = data_s.elements();
// reshape the input data to one dimension and use it as the input
// to the gather operator
arg_data = info.add_instruction(make_op("reshape", {{"dims", {data_elem_num}}}), arg_data);
......
......@@ -32,6 +32,7 @@ struct parse_generic_op : op_parser<parse_generic_op>
{"Log", "log"},
{"LRN", "lrn"},
{"Neg", "neg"},
{"NonMaxSuppression", "nonmaxsuppression"},
{"Reciprocal", "recip"},
{"Relu", "relu"},
{"Round", "round"},
......@@ -49,7 +50,7 @@ struct parse_generic_op : op_parser<parse_generic_op>
bool needs_contiguous(const std::string& op_name) const
{
return contains({"flatten", "gather", "scatter"}, op_name);
return contains({"flatten", "gather", "nonmaxsuppression", "scatter"}, op_name);
}
instruction_ref parse(const op_desc& opd,
......
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/onnx/checks.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {
struct parse_greaterorequal : op_parser<parse_greaterorequal>
{
std::vector<op_desc> operators() const { return {{"GreaterOrEqual"}}; }
instruction_ref parse(const op_desc& /*opd*/,
const onnx_parser& /*parser*/,
const onnx_parser::node_info& info,
std::vector<instruction_ref> args) const
{
auto in_res = info.add_broadcastable_binary_op("less", args[0], args[1]);
if(in_res->get_shape().type() != shape::bool_type)
{
in_res = info.add_instruction(make_op("convert", {{"target_type", shape::bool_type}}),
in_res);
}
return info.add_instruction(make_op("not"), in_res);
}
};
} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
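The parser lowers `GreaterOrEqual` to `not(less(a, b))`, i.e. the identity `a >= b ⇔ !(a < b)`; a scalar sanity check:

```cpp
// Scalar sanity check of the identity behind this lowering.
#include <cassert>

int main()
{
    for(int a = -2; a <= 2; ++a)
        for(int b = -2; b <= 2; ++b)
            assert((a >= b) == !(a < b));
}
```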
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/onnx/checks.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {
struct parse_hardsigmoid : op_parser<parse_hardsigmoid>
{
std::vector<op_desc> operators() const { return {{"HardSigmoid"}, {"HardSwish"}}; }
instruction_ref parse(const op_desc& opd,
const onnx_parser& /*parser*/,
const onnx_parser::node_info& info,
std::vector<instruction_ref> args) const
{
float alpha = 0.2;
float beta = 0.5;
if(opd.onnx_name == "HardSwish")
{
alpha = 1.0 / 6.0;
}
else
{
if(contains(info.attributes, "alpha"))
alpha = info.attributes.at("alpha").f();
if(contains(info.attributes, "beta"))
beta = info.attributes.at("beta").f();
}
auto input_lens = args[0]->get_shape().lens();
auto input_type = args[0]->get_shape().type();
auto mb_alpha = info.add_instruction(
migraphx::make_op("multibroadcast", {{"out_lens", input_lens}}),
info.add_literal(migraphx::literal{migraphx::shape{input_type}, {alpha}}));
auto mb_beta = info.add_instruction(
migraphx::make_op("multibroadcast", {{"out_lens", input_lens}}),
info.add_literal(migraphx::literal{migraphx::shape{input_type}, {beta}}));
auto mb_zero = info.add_instruction(
migraphx::make_op("multibroadcast", {{"out_lens", input_lens}}),
info.add_literal(migraphx::literal{migraphx::shape{input_type}, {0}}));
auto mb_one = info.add_instruction(
migraphx::make_op("multibroadcast", {{"out_lens", input_lens}}),
info.add_literal(migraphx::literal{migraphx::shape{input_type}, {1}}));
auto mul = info.add_instruction(migraphx::make_op("mul"), mb_alpha, args[0]);
auto add = info.add_instruction(migraphx::make_op("add"), mb_beta, mul);
auto hardsigmoid = info.add_instruction(migraphx::make_op("clip"), add, mb_zero, mb_one);
if(opd.onnx_name == "HardSwish")
return info.add_instruction(migraphx::make_op("mul"), args[0], hardsigmoid);
return hardsigmoid;
}
};
} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
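As a scalar reference for the graph built above: HardSigmoid is `clip(alpha * x + beta, 0, 1)`, and HardSwish is `x * HardSigmoid(x)` with `alpha = 1/6` and the default `beta = 0.5`. A sketch for checking values by hand (the free functions are illustrative, not MIGraphX API):

```cpp
// Scalar reference for the HardSigmoid/HardSwish graphs built above.
#include <algorithm>
#include <iostream>

float hardsigmoid(float x, float alpha = 0.2f, float beta = 0.5f)
{
    return std::clamp(alpha * x + beta, 0.0f, 1.0f);
}

float hardswish(float x) { return x * hardsigmoid(x, 1.0f / 6.0f, 0.5f); }

int main()
{
    std::cout << hardsigmoid(1.0f) << '\n'; // 0.7
    std::cout << hardswish(3.0f) << '\n';   // 3 * clamp(1.0) = 3
}
```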
#include <migraphx/onnx/op_parser.hpp>
#include <migraphx/onnx/checks.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace onnx {
struct parse_mean : op_parser<parse_mean>
{
std::vector<op_desc> operators() const { return {{"Mean"}}; }
/// Calculates the element-wise mean of n>=1 input tensors
instruction_ref parse(const op_desc& /*opd*/,
const onnx_parser& /*parser*/,
const onnx_parser::node_info& info,
std::vector<instruction_ref> args) const
{
auto num_data = args.size();
if(num_data == 1)
return args[0];
auto divisor = info.add_literal(
migraphx::literal{migraphx::shape{args[0]->get_shape().type()}, {num_data}});
return std::accumulate(args.begin(), args.end(), args[0], [&](auto& mean, auto& data_i) {
// Pre-divide each tensor element-wise by n to reduce risk of overflow during summation
data_i = info.add_broadcastable_binary_op("div", data_i, divisor);
if(data_i != args[0])
return info.add_broadcastable_binary_op("add", mean, data_i);
return data_i;
});
}
};
} // namespace onnx
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
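The pre-division in `parse_mean` matters for low-precision types: `sum(x_i) / n` can overflow mid-sum even when the mean is representable, while `sum(x_i / n)` stays in range. A sketch that simulates fp16-style saturation (the `saturate` helper is a hypothetical stand-in):

```cpp
// Sketch of the overflow the pre-division avoids; saturate() is a
// hypothetical stand-in for fp16's overflow-to-infinity at ~65504.
#include <iostream>
#include <limits>
#include <vector>

double saturate(double x)
{
    return x > 65504.0 ? std::numeric_limits<double>::infinity() : x;
}

int main()
{
    std::vector<double> xs = {6.0e4, 6.0e4, 6.0e4};
    double n               = xs.size();

    double naive = 0; // sum first: the partial sum overflows at step two
    for(double x : xs)
        naive = saturate(naive + x);
    naive /= n;

    double stable = 0; // divide first: every partial sum stays in range
    for(double x : xs)
        stable = saturate(stable + x / n);

    std::cout << naive << ' ' << stable << '\n'; // inf 60000
}
```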
......@@ -27,11 +27,6 @@ struct parse_multinomial : op_parser<parse_multinomial>
if(contains(info.attributes, "sample_size"))
sample_size = info.attributes.at("sample_size").i();
float seed = static_cast<float>(
std::chrono::high_resolution_clock::now().time_since_epoch().count());
if(contains(info.attributes, "seed"))
seed = info.attributes.at("seed").f();
// Subtract the per-batch maximum log-probability, making the per-batch max 0
auto maxes =
info.add_instruction(migraphx::make_op("reduce_max", {{"axes", {1}}}), args[0]);
......@@ -46,7 +41,10 @@ struct parse_multinomial : op_parser<parse_multinomial>
migraphx::make_op("prefix_scan_sum", {{"axis", 1}, {"exclusive", false}}), cdf);
// Pre-compute random distribution
std::mt19937 gen(seed);
std::mt19937 gen(std::chrono::high_resolution_clock::now().time_since_epoch().count());
if(contains(info.attributes, "seed"))
gen.seed(info.attributes.at("seed").f());
std::uniform_real_distribution<> dis(0.0, 1.0);
size_t batch_size = args[0]->get_shape().lens().front();
migraphx::shape dist_shape{migraphx::shape::float_type, {batch_size, sample_size}};
......
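For context, the multinomial lowering builds a CDF with `prefix_scan_sum` and locates each uniform draw in it. A generic scalar sketch of that inverse-CDF sampling (the probability vector and the `std::upper_bound` lookup are illustrative; this hunk itself only changes the seeding):

```cpp
// Generic inverse-CDF categorical sampling sketch.
#include <algorithm>
#include <iostream>
#include <numeric>
#include <random>
#include <vector>

int main()
{
    std::vector<double> probs = {0.1, 0.6, 0.3};
    std::vector<double> cdf(probs.size());
    std::partial_sum(probs.begin(), probs.end(), cdf.begin()); // 0.1 0.7 1.0

    std::mt19937 gen(42); // fixed seed, as when the ONNX "seed" attribute is set
    std::uniform_real_distribution<> dis(0.0, 1.0);
    for(int i = 0; i < 5; ++i)
    {
        double u = dis(gen);
        auto it  = std::upper_bound(cdf.begin(), cdf.end(), u);
        std::cout << (it - cdf.begin()) << ' '; // sampled class index
    }
    std::cout << '\n';
}
```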
......@@ -9,21 +9,20 @@ namespace onnx {
auto compute_type(shape::type_t t1, shape::type_t t2)
{
const static std::unordered_map<int, int> op_order = {
{static_cast<int>(shape::int8_type), 1},
{static_cast<int>(shape::uint8_type), 2},
{static_cast<int>(shape::int16_type), 3},
{static_cast<int>(shape::uint16_type), 4},
{static_cast<int>(shape::int32_type), 5},
{static_cast<int>(shape::uint32_type), 6},
{static_cast<int>(shape::int64_type), 7},
{static_cast<int>(shape::uint64_type), 8},
{static_cast<int>(shape::half_type), 9},
{static_cast<int>(shape::float_type), 10},
{static_cast<int>(shape::double_type), 11}};
const static std::unordered_map<int, int> op_order = {{shape::int8_type, 1},
{shape::uint8_type, 2},
{shape::int16_type, 3},
{shape::uint16_type, 4},
{shape::int32_type, 5},
{shape::uint32_type, 6},
{shape::int64_type, 7},
{shape::uint64_type, 8},
{shape::half_type, 9},
{shape::float_type, 10},
{shape::double_type, 11}};
int it1 = static_cast<int>(t1);
int it2 = static_cast<int>(t2);
int it1 = t1;
int it2 = t2;
if(!contains(op_order, it1) or !contains(op_order, it2))
{
MIGRAPHX_THROW("PARSE_POW: Input data type not supported!");
......