Commit 0369e974 authored by Khalique Ahmed

Merge branch 'batch_report' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX into mi100_opts

parents 3a474fca d70fd0df
@@ -18,19 +18,10 @@ namespace op {
 struct dot
 {
-    float alpha = 1.0;
-    float beta  = 1.0;
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return pack(f(self.alpha, "alpha"), f(self.beta, "beta"));
-    }
     std::string name() const { return "dot"; }
     shape compute_shape(std::vector<shape> inputs) const
     {
-        check_shapes{inputs, *this}.same_type();
+        check_shapes{inputs, *this}.same_type().has(2);
         const shape& a = inputs.at(0);
         const shape& b = inputs.at(1);
         auto t         = a.type();
@@ -58,25 +49,14 @@ struct dot
         auto out_lens   = a.lens();
         out_lens[dim_1] = b.lens()[dim_1];
-        if(inputs.size() == 3 && out_lens != inputs.at(2).lens())
-        {
-            MIGRAPHX_THROW("DOT: dimension mismatch, operand C: {" +
-                           to_string_range(inputs.at(2).lens()) +
-                           "}, cannot add to operand A * B: {" + to_string_range(out_lens) + "}");
-        }
         return {t, out_lens};
     }
     argument compute(shape output_shape, std::vector<argument> args) const
     {
-        argument result;
-        if(args.size() == 3)
-            result = args[2];
-        else
-            result = argument{output_shape};
+        argument result = argument{output_shape};
         visit_all(result, args[0], args[1])(
-            [&](auto cmat, auto amat, auto bmat) { gemm(cmat, amat, bmat, alpha, beta); });
+            [&](auto cmat, auto amat, auto bmat) { gemm(cmat, amat, bmat, 1.0f, 0.0f); });
         return result;
     }
 };
...
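The hunk above drops the fused alpha/beta scaling from the reference dot: compute always allocates a fresh output and calls gemm with alpha = 1 and beta = 0. For reference, a minimal sketch of the semantics that now reach gemm, written against plain row-major buffers rather than migraphx tensor views (gemm_ref and its layout are illustrative, not the library's API):

#include <cstddef>
#include <vector>

// C = alpha * (A * B) + beta * C for row-major m x k, k x n, m x n buffers.
// dot::compute now always passes alpha = 1.0f and beta = 0.0f.
void gemm_ref(std::vector<float>& c,
              const std::vector<float>& a,
              const std::vector<float>& b,
              std::size_t m, std::size_t k, std::size_t n,
              float alpha, float beta)
{
    for(std::size_t i = 0; i < m; ++i)
    {
        for(std::size_t j = 0; j < n; ++j)
        {
            double acc = 0.0;
            for(std::size_t l = 0; l < k; ++l)
                acc += a[i * k + l] * b[l * n + j];
            c[i * n + j] = alpha * static_cast<float>(acc) + beta * c[i * n + j];
        }
    }
}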
@@ -35,7 +35,7 @@ struct if_op
             MIGRAPHX_THROW("IF: output shapes of submodules must be the same.");
         }
-        return shape(out_shapes0);
+        return {out_shapes0};
     }
     argument compute(const shape&,
...
@@ -54,7 +54,7 @@ struct loop
             ins_out_shapes.push_back({out_s.type(), lens});
         }
-        return shape(ins_out_shapes);
+        return {ins_out_shapes};
     }
     struct ref_loop
...
#ifndef MIGRAPHX_GUARD_OPERATORS_NONMAXSUPPRESSION_HPP
#define MIGRAPHX_GUARD_OPERATORS_NONMAXSUPPRESSION_HPP
#include <cmath>
#include <queue>
#include <cstdint>
#include <iterator>
#include <migraphx/config.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/float_equal.hpp>
#include <migraphx/algorithm.hpp>
#include <migraphx/tensor_view.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/output_iterator.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
struct nonmaxsuppression
{
bool center_point_box = false;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.center_point_box, "center_point_box"));
}
std::string name() const { return "nonmaxsuppression"; }
shape compute_shape(std::vector<shape> inputs) const
{
// requires at least 2 inputs
check_shapes{inputs, *this}.standard();
check_shapes{{inputs.at(0), inputs.at(1)}, *this}.only_dims(3);
auto lens = inputs.front().lens();
// check input shape
if(lens[1] != inputs.at(1).lens()[2])
{
MIGRAPHX_THROW("NonMaxSuppression: dimension mismatch between first and second input!");
}
std::vector<int64_t> out_lens(2);
out_lens.at(0) = lens.at(1);
out_lens.at(1) = 3;
return {shape::int64_type, out_lens};
}
struct box
{
std::array<float, 2> x;
std::array<float, 2> y;
void sort()
{
std::sort(x.begin(), x.end());
std::sort(y.begin(), y.end());
}
std::array<float, 2>& operator[](std::size_t i) { return i == 0 ? x : y; }
float area() const
{
assert(std::is_sorted(x.begin(), x.end()));
assert(std::is_sorted(y.begin(), y.end()));
return (x[1] - x[0]) * (y[1] - y[0]);
}
};
template <class T>
box batch_box(const T* boxes, std::size_t bidx) const
{
box result{};
const T* start = boxes + 4 * bidx;
if(center_point_box)
{
float half_width = start[2] / 2.0f;
float half_height = start[3] / 2.0f;
float x_center = start[0];
float y_center = start[1];
result.x = {x_center - half_width, x_center + half_width};
result.y = {y_center - half_height, y_center + half_height};
}
else
{
result.x = {start[1], start[3]};
result.y = {start[0], start[2]};
}
return result;
}
inline bool suppress_by_iou(box b1, box b2, float iou_threshold) const
{
b1.sort();
b2.sort();
box intersection{};
for(auto i : range(2))
{
intersection[i][0] = std::max(b1[i][0], b2[i][0]);
intersection[i][1] = std::min(b1[i][1], b2[i][1]);
}
std::vector<std::array<float, 2>> bbox = {intersection.x, intersection.y};
if(std::any_of(bbox.begin(), bbox.end(), [](auto bx) {
return not std::is_sorted(bx.begin(), bx.end());
}))
{
return false;
}
const float area1 = b1.area();
const float area2 = b2.area();
const float intersection_area = intersection.area();
const float union_area = area1 + area2 - intersection_area;
if(area1 <= .0f or area2 <= .0f or union_area <= .0f)
{
return false;
}
const float intersection_over_union = intersection_area / union_area;
return intersection_over_union > iou_threshold;
}
argument compute(const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
result.visit([&](auto out) { std::fill(out.begin(), out.end(), 0); });
std::size_t max_output_boxes_per_class = 0;
float iou_threshold = 0.0f;
float score_threshold = 0.0f;
if(args.size() > 2)
{
max_output_boxes_per_class = args.at(2).at<std::size_t>();
}
// when max_output_boxes_per_class is 0, there is no output
if(max_output_boxes_per_class == 0)
{
return result;
}
if(args.size() > 3)
{
iou_threshold = args.at(3).at<float>();
}
if(args.size() > 4)
{
score_threshold = args.at(4).at<float>();
}
const auto& lens = args.at(1).get_shape().lens();
auto batch_num = lens[0];
auto class_num = lens[1];
auto box_num = args.at(0).get_shape().lens()[1];
std::vector<std::pair<float, int64_t>> selected_boxes_inside_class;
std::vector<int64_t> selected_indices;
selected_boxes_inside_class.reserve(output_shape.elements());
auto scores = make_view<float>(args.at(1).get_shape(), args.at(1).cast<float>());
const float* boxes = args.at(0).cast<float>();
shape comp_s{shape::float_type, {batch_num, class_num}};
shape_for_each(comp_s, [&](auto idx) {
auto bidx = idx[0];
auto cidx = idx[1];
std::size_t score_offset = (bidx * class_num + cidx) * box_num;
const float* batch_boxes = boxes + bidx * box_num * 4;
std::priority_queue<std::pair<float, int64_t>> sorted_boxes;
auto insert_to_sorted_boxes =
make_function_output_iterator([&](const auto& x) { sorted_boxes.push(x); });
int64_t box_idx = 0;
transform_if(scores.begin() + score_offset,
scores.begin() + score_offset + box_num,
insert_to_sorted_boxes,
[&](auto sc) {
box_idx++;
return sc >= score_threshold;
},
[&](auto sc) { return std::make_pair(sc, box_idx - 1); });
selected_boxes_inside_class.clear();
// Get the next box with top score, filter by iou_threshold
while(!sorted_boxes.empty() &&
selected_boxes_inside_class.size() < max_output_boxes_per_class)
{
const std::pair<float, int64_t>& next_top_score = sorted_boxes.top();
// Check against the boxes already selected for this class; suppress the
// candidate if it exceeds the IOU (Intersection Over Union) threshold
bool suppressed = std::any_of(
selected_boxes_inside_class.begin(),
selected_boxes_inside_class.end(),
[&](auto selected_index) {
return this->suppress_by_iou(batch_box(batch_boxes, next_top_score.second),
batch_box(batch_boxes, selected_index.second),
iou_threshold);
});
if(not suppressed)
{
selected_boxes_inside_class.push_back(next_top_score);
selected_indices.push_back(bidx);
selected_indices.push_back(cidx);
selected_indices.push_back(next_top_score.second);
}
sorted_boxes.pop();
}
});
result.visit([&](auto out) {
std::copy(selected_indices.begin(), selected_indices.end(), out.begin());
});
return result;
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
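For reference, the suppression test above reduces to a plain intersection-over-union computation on axis-aligned boxes. A self-contained sketch with scalar floats (the box layout and helper name are illustrative):

#include <algorithm>
#include <array>

// Each box is {x1, x2, y1, y2} with coordinates already sorted per axis,
// matching what box::sort() guarantees above.
float iou(const std::array<float, 4>& b1, const std::array<float, 4>& b2)
{
    float ix1 = std::max(b1[0], b2[0]);
    float ix2 = std::min(b1[1], b2[1]);
    float iy1 = std::max(b1[2], b2[2]);
    float iy2 = std::min(b1[3], b2[3]);
    if(ix2 <= ix1 or iy2 <= iy1)
        return 0.0f; // no overlap
    float inter = (ix2 - ix1) * (iy2 - iy1);
    float area1 = (b1[1] - b1[0]) * (b1[3] - b1[2]);
    float area2 = (b2[1] - b2[0]) * (b2[3] - b2[2]);
    float uni   = area1 + area2 - inter;
    return uni <= 0.0f ? 0.0f : inter / uni;
}

A candidate box is kept only when iou(candidate, selected) stays at or below iou_threshold for every box already selected in the same batch/class slice.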
#ifndef MIGRAPHX_GUARD_OPERATORS_NONZERO_HPP
#define MIGRAPHX_GUARD_OPERATORS_NONZERO_HPP
#include <migraphx/shape_for_each.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/config.hpp>
#include <migraphx/float_equal.hpp>
#include <migraphx/par_for.hpp>
#include <cmath>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
struct nonzero
{
std::string name() const { return "nonzero"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(1).standard();
auto elem_num = inputs[0].elements();
auto dim_num = inputs[0].lens().size();
std::vector<std::size_t> out_lens = {dim_num, elem_num};
return {shape::int64_type, out_lens};
}
argument compute(const shape& output_shape, std::vector<argument> args) const
{
std::vector<std::vector<std::size_t>> vec_idx;
auto s = args.front().get_shape();
args.front().visit([&](auto v) {
shape_for_each(s, [&](auto idx) {
if(not float_equal(v[s.index(idx)], 0))
{
vec_idx.push_back(idx);
}
});
});
argument result{output_shape};
result.visit([&](auto output) {
std::fill(output.begin(), output.end(), 0);
par_for(vec_idx.size(), [&](auto i) {
for(std::size_t j = 0; j < vec_idx.front().size(); ++j)
{
output[output_shape.index({j, i})] = vec_idx[i][j];
}
});
});
return result;
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
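Note the output contract above: the result is a fixed int64 tensor of shape {rank, elements} that is zero padded, not a tightly packed index list. A worked example (values illustrative):

// For a 2x2 standard-layout input
//     0 5
//     7 0
// the nonzero coordinates visited in row-major order are (0,1) and (1,0),
// so the int64 output of shape {2, 4} is
//     0 1 0 0   <- dim-0 indices (zero padding after the two hits)
//     1 0 0 0   <- dim-1 indices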
#ifndef MIGRAPHX_GUARD_OP_POINTWISE_HPP
#define MIGRAPHX_GUARD_OP_POINTWISE_HPP
#include <migraphx/config.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/module.hpp>
#include <migraphx/permutation.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/par_for.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
struct pointwise
{
std::string name() const { return "pointwise"; }
shape compute_shape(const std::vector<shape>& inputs, std::vector<module_ref> mods) const
{
if(mods.size() != 1)
{
MIGRAPHX_THROW("should have one submodule.");
}
auto* pm = mods.front();
auto pnames = pm->get_parameter_names();
std::sort(pnames.begin(), pnames.end());
check_shapes{inputs, *this}.has(pnames.size()).same_dims();
for(auto i : range(pnames.size()))
{
auto s1 = pm->get_parameter(pnames[i])->get_shape();
auto s2 = inputs[i];
if(s1.type() != s2.type())
MIGRAPHX_THROW("Mismatch type");
}
if(pm->get_output_shapes().size() != 1)
MIGRAPHX_THROW("submodule should have only one output.");
auto type = pm->get_output_shapes().front().type();
return shape::from_permutation(type, inputs.front().lens(), find_permutation(inputs));
}
argument compute(const shape& output_shape,
const std::vector<argument>& args,
const std::vector<module_ref>& mods,
const std::function<std::vector<argument>(
module_ref&, const std::unordered_map<std::string, argument>&)>& run) const
{
argument output{output_shape};
auto* pm = mods.front();
auto pnames = pm->get_parameter_names();
std::sort(pnames.begin(), pnames.end());
par_for(output_shape.elements(), [&](auto i) {
std::unordered_map<std::string, argument> params;
std::transform(
pnames.begin(),
pnames.end(),
args.begin(),
std::inserter(params, params.end()),
[&](auto&& name, auto&& arg) { return std::make_pair(name, arg.element(i)); });
auto results = run(pm, params);
assert(results.size() == 1);
visit_all(output, results.front())([&](auto out, auto x) { out[i] = x.front(); });
});
return output;
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_OP_POINTWISE_HPP
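The compute above runs the submodule once per output element: for element i it maps each sorted parameter name to args[k].element(i) and forwards the map to the supplied run callback. A rough scalar analogue of that dispatch loop (pointwise_ref and run_elem are illustrative stand-ins for the module evaluator):

#include <cstddef>
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>

using scalar_fn = std::function<float(const std::unordered_map<std::string, float>&)>;

void pointwise_ref(std::vector<float>& out,
                   const std::vector<std::string>& pnames,
                   const std::vector<std::vector<float>>& args,
                   const scalar_fn& run_elem)
{
    for(std::size_t i = 0; i < out.size(); ++i)
    {
        std::unordered_map<std::string, float> params;
        for(std::size_t k = 0; k < pnames.size(); ++k)
            params[pnames[k]] = args[k][i]; // one scalar slice per input
        out[i] = run_elem(params);          // evaluate the submodule body once
    }
}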
@@ -18,21 +18,12 @@ namespace op {
 struct quant_dot
 {
-    int32_t alpha = 1;
-    int32_t beta  = 1;
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return pack(f(self.alpha, "alpha"), f(self.beta, "beta"));
-    }
     value attributes() const { return {{"general_data_type", "dot"}}; }
     std::string name() const { return "quant_dot"; }
     shape compute_shape(std::vector<shape> inputs) const
     {
-        check_shapes{{inputs.at(0), inputs.at(1)}, *this}.same_type();
+        check_shapes{{inputs.at(0), inputs.at(1)}, *this}.same_type().has(2);
         const shape& a = inputs.at(0);
         const shape& b = inputs.at(1);
         auto t         = a.type();
@@ -64,18 +55,6 @@ struct quant_dot
         auto out_lens   = a.lens();
         out_lens[dim_1] = b.lens()[dim_1];
-        if(inputs.size() == 3 && out_lens != inputs.at(2).lens())
-        {
-            MIGRAPHX_THROW("QUANT_DOT: dimension mismatch, operand C: {" +
-                           to_string_range(inputs.at(2).lens()) +
-                           "}, cannot add to operand A * B: {" + to_string_range(out_lens) + "}");
-        }
-        if(inputs.size() == 3 && inputs.at(2).type() != shape::int32_type)
-        {
-            MIGRAPHX_THROW("QUANT_DOT: operand C type must be int32");
-        }
         return {shape::int32_type, out_lens};
    }
 };
...
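As with dot, the alpha/beta attributes and the optional int32 C operand are gone; quant_dot now always yields the raw int32 product of exactly two inputs. The fixed int32 output type matters because int8 x int8 products must accumulate in a wider type. A short sketch of why:

#include <cstddef>
#include <cstdint>
#include <vector>

// Each int8 * int8 term can reach 127 * 127 = 16129, so even a short
// reduction overflows int8/int16 but stays safe in an int32 accumulator.
// Assumes a and b have equal length.
int32_t dot_i8(const std::vector<int8_t>& a, const std::vector<int8_t>& b)
{
    int32_t acc = 0;
    for(std::size_t i = 0; i < a.size(); ++i)
        acc += static_cast<int32_t>(a[i]) * static_cast<int32_t>(b[i]);
    return acc;
}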
@@ -37,7 +37,7 @@ struct rnn_var_sl_shift_output
     argument compute(const shape& output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
-        int64_t max_len = static_cast<int64_t>(output_shape.lens()[0]);
+        int64_t max_len = output_shape.lens()[0];
         visit_all(result, args[0])([&](auto output, auto input) {
             using value_type = typename decltype(output)::value_type;
             args[1].visit([&](auto seq_lens) {
@@ -76,7 +76,7 @@ struct rnn_var_sl_shift_sequence
     argument compute(const shape& output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
-        int64_t max_len = static_cast<int64_t>(output_shape.lens()[0]);
+        int64_t max_len = output_shape.lens()[0];
         visit_all(result, args[0])([&](auto output, auto input) {
             using value_type = typename decltype(output)::value_type;
             args[1].visit([&](auto seq_lens) {
...
#ifndef MIGRAPHX_GUARD_OPERATORS_ROIALIGN_HPP
#define MIGRAPHX_GUARD_OPERATORS_ROIALIGN_HPP
#include <limits>
#include <migraphx/check_shapes.hpp>
#include <migraphx/config.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/shape_for_each.hpp>
#include <cmath>
#include <numeric>
#include <utility>
#include <vector>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
struct roialign
{
std::string coord_trans_mode = "half_pixel";
std::string mode = "avg";
int64_t output_height = 1;
int64_t output_width = 1;
int64_t sampling_ratio = 0;
float spatial_scale = 1.0f;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.coord_trans_mode, "coordinate_transformation_mode"),
f(self.mode, "mode"),
f(self.output_height, "output_height"),
f(self.output_width, "output_width"),
f(self.sampling_ratio, "sampling_ratio"),
f(self.spatial_scale, "spatial_scale"));
}
std::string name() const { return "roialign"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(3).standard();
auto x_lens = inputs.at(0).lens();
auto roi_lens = inputs.at(1).lens();
auto bi_lens = inputs.at(2).lens();
auto type = inputs.at(0).type();
// validate the input shapes
if(bi_lens.size() != 1)
{
MIGRAPHX_THROW("ROIALIGN: batch indices should be 1 dimension!");
}
if(roi_lens.size() != 2 or roi_lens.at(1) != 4)
{
MIGRAPHX_THROW(
"ROIALIGN: rois should be 2 dimensions, and the second dim should be 4!");
}
if(roi_lens.front() != bi_lens.front())
{
MIGRAPHX_THROW("ROIALIGN: rois and batch indices inputs should have the same number!");
}
std::vector<std::size_t> out_lens = x_lens;
out_lens[0] = roi_lens[0];
out_lens[2] = output_height;
out_lens[3] = output_width;
return {type, out_lens};
}
struct pos_weight
{
// neighbor indices for the bilinear interpolation
std::array<std::size_t, 4> pos = {0, 0, 0, 0};
// neighbor weights for the bilinear interpolation
std::array<float, 4> w = {0.0f, 0.0f, 0.0f, 0.0f};
};
auto calc_pos_weight(const std::array<std::size_t, 2>& dims,
const shape& comp_s,
const std::array<float, 2>& roi_start,
const std::array<float, 2>& bin_size,
const std::array<std::size_t, 2>& bin_grid_size) const
{
std::vector<pos_weight> results(bin_grid_size[0] * bin_grid_size[1] * output_height *
output_width);
shape_for_each(comp_s, [&](auto idx) {
std::array<std::size_t, 2> p = {idx[0], idx[1]};
std::array<std::size_t, 2> i = {idx[2], idx[3]};
auto index = comp_s.index(idx);
std::array<float, 2> xy{};
std::array<int64_t, 2> low{};
std::array<int64_t, 2> high{};
for(auto ii : range(p.size()))
{
xy[ii] = roi_start[ii] + p[ii] * bin_size[ii] +
(i[ii] + .5f) * bin_size[ii] / bin_grid_size[ii];
xy[ii] = (coord_trans_mode == "output_half_pixel") ? (xy[ii] - 0.5f) : xy[ii];
if(xy[ii] < -1.0 or xy[ii] > dims[ii])
{
results[index] = pos_weight{};
return;
}
xy[ii] = std::max(xy[ii], 0.0f);
low[ii] = xy[ii];
high[ii] = low[ii] + 1;
if(low[ii] >= dims[ii] - 1)
{
xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
}
}
results[index].pos = {low[0] * dims[1] + low[1],
low[0] * dims[1] + high[1],
high[0] * dims[1] + low[1],
high[0] * dims[1] + high[1]};
float ly = xy[0] - low[0];
float lx = xy[1] - low[1];
float hy = 1.0f - ly;
float hx = 1.0f - lx;
// save weights and indices
results[index].w = {hy * hx, hy * lx, ly * hx, ly * lx};
});
return results;
}
struct max_pool
{
double init() { return std::numeric_limits<double>::lowest(); }
double operator()(double x, double y) { return std::max(x, y); }
double final(double x, std::size_t) { return (x); }
};
struct avg_pool
{
double init() { return 0.0; }
double operator()(double x, double y) { return x + y; }
double final(double x, std::size_t y) { return (y == 0) ? 0.0 : (x / y); }
};
template <class T, class Op>
std::tuple<double, int64_t> calc_pooling(const T& data,
const std::array<std::size_t, 2>& bin_grid_size,
const std::vector<pos_weight>& pos_weights,
int64_t index,
Op op) const
{
double output_val = op.init();
const int64_t count = bin_grid_size[0] * bin_grid_size[1];
dfor(bin_grid_size[0], bin_grid_size[1])([&](auto, auto) {
const auto& pc = pos_weights[index];
std::array<double, 4> wv;
std::transform(
pc.w.begin(), pc.w.end(), pc.pos.begin(), wv.begin(), [&](auto w, auto pos) {
return *(data + pos) * w;
});
output_val = std::accumulate(wv.begin(), wv.end(), output_val, op);
index += 1;
});
output_val = op.final(output_val, count);
return {output_val, index};
}
argument compute(const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
const auto& out_lens = output_shape.lens();
int64_t n_rois = out_lens[0];
std::size_t channels = out_lens[1];
// output dims of height and width; in all 2-dim arrays, the first dim
// is height and the second is width
std::array<std::size_t, 2> out_dims = {out_lens[2], out_lens[3]};
const auto& x_lens = args.at(0).get_shape().lens();
// input dims of height and width
std::array<std::size_t, 2> in_dims = {x_lens[2], x_lens[3]};
auto roi_s = args.at(1).get_shape();
visit_all(result, args.at(0), args.at(1))([&](auto output, auto x, auto roi) {
const auto* batch_indices = args.at(2).cast<int64_t>();
par_for(n_rois, [&](auto n) {
const auto bottom_data = x.begin();
const auto roi_batch_ind = batch_indices[n];
// Do not use rounding; this implementation detail is critical
std::array<float, 2> roi_starts = {
static_cast<float>(roi[roi_s.index({n, 1})] * spatial_scale),
static_cast<float>(roi[roi_s.index({n, 0})] * spatial_scale)};
std::array<float, 2> roi_ends = {
static_cast<float>(roi[roi_s.index({n, 3})] * spatial_scale),
static_cast<float>(roi[roi_s.index({n, 2})] * spatial_scale)};
// Force malformed ROIs to be 1x1
std::array<float, 2> roi_size{};
std::array<float, 2> bin_size{};
std::array<std::size_t, 2> bin_grid_size{};
for(auto ii : range(roi_size.size()))
{
roi_size[ii] = roi_ends[ii] - roi_starts[ii];
roi_size[ii] = std::max(roi_size[ii], 1.0f);
bin_size[ii] = roi_size[ii] / out_dims[ii];
bin_grid_size[ii] = (sampling_ratio > 0)
? sampling_ratio
: std::ceil(roi_size[ii] / out_dims[ii]);
}
// precalculate the indices and weights shared by all channels;
// this is the key optimization
std::vector<std::size_t> comp_lens = {
out_dims[0], out_dims[1], bin_grid_size[0], bin_grid_size[1]};
shape comp_s{shape::float_type, comp_lens};
auto pre_calc =
this->calc_pos_weight(in_dims, comp_s, roi_starts, bin_size, bin_grid_size);
std::vector<std::size_t> comp_lens1 = {channels, out_dims[0], out_dims[1]};
shape comp_s1{migraphx::shape::float_type, comp_lens1};
std::vector<int64_t> vec_index(channels, 0);
shape_for_each(comp_s1, [&](auto idx) {
auto c = idx[0];
auto ph = idx[1];
auto pw = idx[2];
const auto offset_bottom_data =
bottom_data + static_cast<int64_t>((roi_batch_ind * channels + c) *
in_dims[0] * in_dims[1]);
double output_val;
std::tie(output_val, vec_index[c]) =
(mode == "avg") ? this->calc_pooling(offset_bottom_data,
bin_grid_size,
pre_calc,
vec_index[c],
avg_pool{})
: this->calc_pooling(offset_bottom_data,
bin_grid_size,
pre_calc,
vec_index[c],
max_pool{});
output(n, c, ph, pw) = output_val;
});
});
});
return result;
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
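The pos_weight precomputation above is standard bilinear interpolation: a continuous sample point (y, x) inside the feature map becomes a weighted sum of its four integer neighbors, with exactly the {hy*hx, hy*lx, ly*hx, ly*lx} weights stored by calc_pos_weight. A standalone sketch of the math for one sample (bilinear_at is illustrative):

// data points into one H x W channel plane; (y, x) are continuous coords
// already clamped to [0, H-1] x [0, W-1], as calc_pos_weight guarantees.
float bilinear_at(const float* data, int h, int w, float y, float x)
{
    int y0 = static_cast<int>(y);
    int x0 = static_cast<int>(x);
    int y1 = (y0 >= h - 1) ? y0 : y0 + 1;
    int x1 = (x0 >= w - 1) ? x0 : x0 + 1;
    float ly = y - y0, lx = x - x0; // fractional offsets
    float hy = 1.0f - ly, hx = 1.0f - lx;
    return hy * hx * data[y0 * w + x0] + hy * lx * data[y0 * w + x1] +
           ly * hx * data[y1 * w + x0] + ly * lx * data[y1 * w + x1];
}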
@@ -45,7 +45,7 @@ struct topk
         shape s_val{type, lens};
         shape s_ind{shape::int64_type, lens};
-        return shape({s_val, s_ind});
+        return {{s_val, s_ind}};
     }
     template <class T, class Compare>
@@ -131,7 +131,7 @@ struct topk
             });
         });
-        return argument({res_val, res_ind});
+        return {{res_val, res_ind}};
     }
 };
...
@@ -57,6 +57,8 @@
 #include <migraphx/op/mul.hpp>
 #include <migraphx/op/multibroadcast.hpp>
 #include <migraphx/op/neg.hpp>
+#include <migraphx/op/nonmaxsuppression.hpp>
+#include <migraphx/op/nonzero.hpp>
 #include <migraphx/op/outline.hpp>
 #include <migraphx/op/pad.hpp>
 #include <migraphx/op/pooling.hpp>
@@ -79,6 +81,7 @@
 #include <migraphx/op/rnn_last_hs_output.hpp>
 #include <migraphx/op/rnn_variable_seq_lens.hpp>
 #include <migraphx/op/rnn_var_sl_last_output.hpp>
+#include <migraphx/op/roialign.hpp>
 #include <migraphx/op/round.hpp>
 #include <migraphx/op/rsqrt.hpp>
 #include <migraphx/op/scalar.hpp>
...
@@ -23,6 +23,8 @@ MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_EVAL)
 struct program_impl;
+struct marker;
+
 /**
  * @brief Stores the instruction stream
  */
@@ -65,7 +67,10 @@ struct program
     void finalize();
-    void perf_report(std::ostream& os, std::size_t n, parameter_map params) const;
+    void
+    perf_report(std::ostream& os, std::size_t n, parameter_map params, std::size_t batch = 1) const;
+
+    void mark(const parameter_map& params, marker&& m);
     value to_value() const;
     void from_value(const value& v);
...
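The perf_report change is backwards compatible: the new batch parameter defaults to 1, so existing callers keep per-run numbers, while a driver can pass the batch size to get rates scaled per sample. A hypothetical call site:

#include <iostream>
#include <migraphx/program.hpp>

// Hypothetical driver helper; program and parameter_map are migraphx types.
void report(migraphx::program& p, const migraphx::parameter_map& params)
{
    // 100 timed runs, with throughput scaled for a batch size of 64
    p.perf_report(std::cout, 100, params, 64);
}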
@@ -106,7 +106,7 @@ argument run_loop(const LoopModel& model,
     std::copy(in_args.begin() + 2, in_args.end(), out_args.begin());
     model.set_zero(ctx, scan_outputs, iter);
-    return argument(out_args);
+    return {out_args};
 }
 } // namespace MIGRAPHX_INLINE_NS
...
@@ -71,7 +71,7 @@ std::string trim(const std::string& s, F f)
 {
     auto start = std::find_if_not(s.begin(), s.end(), f);
     auto last  = std::find_if_not(s.rbegin(), std::string::const_reverse_iterator(start), f).base();
-    return std::string(start, last);
+    return {start, last};
 }
 inline std::string trim(const std::string& s)
...
@@ -11,49 +11,8 @@ inline namespace MIGRAPHX_INLINE_NS {
 static void inline_submodule(module& m, instruction_ref ins, bool cond)
 {
     const auto& mod_inputs = ins->module_inputs();
-    const auto* smod       = cond ? mod_inputs.at(0) : mod_inputs.at(1);
-    std::unordered_map<instruction_ref, instruction_ref> map_ins;
-    std::vector<instruction_ref> mod_outputs;
-    for(auto sins : iterator_for(*smod))
-    {
-        instruction_ref copy_ins{};
-        if(sins->name() == "@literal")
-        {
-            auto l   = sins->get_literal();
-            copy_ins = m.add_literal(l);
-        }
-        else if(sins->name() == "@param")
-        {
-            auto&& name = any_cast<builtin::param>(sins->get_operator()).parameter;
-            auto s      = sins->get_shape();
-            copy_ins    = m.add_parameter(name, s);
-        }
-        else if(sins->name() == "@outline")
-        {
-            auto s   = sins->get_shape();
-            copy_ins = m.add_outline(s);
-        }
-        else
-        {
-            auto mod_args = sins->module_inputs();
-            auto inputs   = sins->inputs();
-            std::vector<instruction_ref> copy_inputs(inputs.size());
-            std::transform(inputs.begin(), inputs.end(), copy_inputs.begin(), [&](auto i) {
-                return contains(map_ins, i) ? map_ins[i] : i;
-            });
-            if(sins->name() == "@return")
-            {
-                mod_outputs = copy_inputs;
-                break;
-            }
-            copy_ins = m.insert_instruction(ins, sins->get_operator(), copy_inputs, mod_args);
-        }
-        map_ins[sins] = copy_ins;
-        mod_outputs = {copy_ins};
-    }
+    module_ref smod        = cond ? mod_inputs.at(0) : mod_inputs.at(1);
+    auto mod_outputs       = m.insert_module_instructions(ins, smod);
     auto ins_outputs = ins->outputs();
     assert(mod_outputs.size() >= ins_outputs.size());
...
@@ -468,5 +468,11 @@ std::vector<shape> try_compute_shape(const operation& op, const std::vector<shap
     }
     return {new_shape};
 }
+
+migraphx::instruction* as_address(const instruction_ref& ins) noexcept
+{
+    return std::addressof(*ins);
+}
+
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
+#include <iterator>
 #include <migraphx/module.hpp>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/instruction.hpp>
@@ -302,6 +303,55 @@ instruction_ref module::move_instructions(instruction_ref src, instruction_ref d
     return src;
 }
+std::vector<instruction_ref> module::insert_module_instructions(
+    instruction_ref ins, module_ref m, std::unordered_map<instruction_ref, instruction_ref> map_ins)
+{
+    std::vector<instruction_ref> mod_outputs;
+    for(auto sins : iterator_for(*m))
+    {
+        if(contains(map_ins, sins))
+            continue;
+        instruction_ref copy_ins;
+        if(sins->name() == "@literal")
+        {
+            auto l   = sins->get_literal();
+            copy_ins = this->add_literal(l);
+        }
+        else if(sins->name() == "@param")
+        {
+            auto&& name = any_cast<builtin::param>(sins->get_operator()).parameter;
+            auto s      = sins->get_shape();
+            copy_ins    = this->add_parameter(name, s);
+        }
+        else if(sins->name() == "@outline")
+        {
+            auto s   = sins->get_shape();
+            copy_ins = this->add_outline(s);
+        }
+        else
+        {
+            auto mod_args = sins->module_inputs();
+            auto inputs   = sins->inputs();
+            std::vector<instruction_ref> copy_inputs(inputs.size());
+            std::transform(inputs.begin(), inputs.end(), copy_inputs.begin(), [&](auto i) {
+                return contains(map_ins, i) ? map_ins[i] : i;
+            });
+            if(sins->name() == "@return")
+            {
+                mod_outputs = copy_inputs;
+                break;
+            }
+            copy_ins = this->insert_instruction(ins, sins->get_operator(), copy_inputs, mod_args);
+        }
+        map_ins[sins] = copy_ins;
+    }
+    if(mod_outputs.empty())
+        mod_outputs = {map_ins.at(std::prev(m->end()))};
+    return mod_outputs;
+}
 instruction_ref module::add_literal(literal l)
 {
     impl->emplace_front(std::move(l));
@@ -332,6 +382,20 @@ instruction_ref module::add_return(std::vector<instruction_ref> args)
     return result;
 }
+instruction_ref module::replace_return(std::vector<instruction_ref> args)
+{
+    auto last = std::prev(this->end());
+    // If there is no return then add a return
+    if(last->name() != "@return")
+        return this->add_return(args);
+    shape r = compute_shape(last->get_operator(), args);
+    instruction::replace(last, last->get_operator(), r, std::move(args));
+    assert(last->valid(begin()));
+    return last;
+}
 shape module::get_parameter_shape(std::string name) const
 {
     auto ins = std::find_if(
...
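The new replace_return rounds out the module-editing API: it rewrites the operands of an existing @return in place (recomputing its shape) and falls back to add_return when the module has none. A hedged usage sketch (parameter name and shape are illustrative):

#include <migraphx/make_op.hpp>
#include <migraphx/module.hpp>
#include <migraphx/shape.hpp>

// Sketch: retarget an existing module's @return without appending a duplicate.
void retarget_return(migraphx::module& m)
{
    auto x = m.add_parameter("x", migraphx::shape{migraphx::shape::float_type, {4}});
    auto y = m.add_instruction(migraphx::make_op("relu"), x);
    m.add_return({y});
    // later: make the module return x instead; the existing @return is rewritten
    m.replace_return({x});
}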
@@ -20,7 +20,7 @@ auto tune_attribute(const std::vector<int64_t>& vec,
                     const std::vector<std::size_t>& lens)
 {
     std::vector<int64_t> result(vec);
-    int64_t n_rank = static_cast<int64_t>(lens.size());
+    int64_t n_rank = lens.size();
     std::vector<op::normalize_attribute> vec_attrs = val.to_vector<op::normalize_attribute>();
     if(contains(vec_attrs, op::normalize_attribute::use_output))
     {
...
@@ -39,7 +39,7 @@ struct parse_gather_elements : op_parser<parse_gather_elements>
     int tuned_axis   = tune_axis(n_rank, axis, opd.op_name);
     auto axis_stride = data_s.strides()[tuned_axis];
-    int64_t data_elem_num = static_cast<int64_t>(data_s.elements());
+    int64_t data_elem_num = data_s.elements();
     // reshape the input data as one dimension and used as input data
     // to the gather operator
     arg_data = info.add_instruction(make_op("reshape", {{"dims", {data_elem_num}}}), arg_data);
...
@@ -61,7 +61,7 @@ struct parse_gemm : op_parser<parse_gemm>
                       ? info.add_instruction(make_op("transpose", {{"permutation", perm}}), args[1])
                       : args[1];
-        auto ret = info.add_instruction(make_op("dot", {{"alpha", 1.0f}, {"beta", 0.0f}}), l1, l2);
+        auto ret = info.add_instruction(make_op("dot"), l1, l2);
         if(args.size() == 3)
         {
...
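With alpha and beta stripped from the operator itself, the gemm parser must materialize any scaling explicitly around the plain dot. A hedged sketch of that composition (written as if inside the parser's scope; the multibroadcast attribute is assumed to be named "output_lens" in this era of the codebase, and alpha_lit/alpha_bcast are illustrative names):

// alpha applied to (A * B) as an elementwise mul with a broadcast literal
auto dot_ab = info.add_instruction(make_op("dot"), l1, l2);
auto alpha_lit =
    info.add_literal(literal{shape{dot_ab->get_shape().type()}, {alpha}});
auto alpha_bcast = info.add_instruction(
    make_op("multibroadcast", {{"output_lens", dot_ab->get_shape().lens()}}), alpha_lit);
auto scaled = info.add_instruction(make_op("mul"), alpha_bcast, dot_ab);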