Commit df032e06 authored by Paul

Merge branch 'develop' into mlir-c

parents cf4642cd 19f65e7e
......@@ -6,7 +6,7 @@ ARG PREFIX=/usr/local
RUN dpkg --add-architecture i386
# Add rocm repository
RUN sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/4.2/ xenial main > /etc/apt/sources.list.d/rocm.list'
RUN sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/4.5/ ubuntu main > /etc/apt/sources.list.d/rocm.list'
# Install dependencies
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
......@@ -32,6 +32,8 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
software-properties-common \
wget \
rocm-device-libs \
hip-base \
libnuma-dev \
miopen-hip \
rocblas \
zlib1g-dev && \
......
......@@ -20,7 +20,7 @@ def rocmtestnode(Map conf) {
rm -rf build
mkdir build
cd build
CXX=${compiler} CXXFLAGS='-Werror -Wno-fallback' cmake -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache ${flags} ..
CXX=${compiler} CXXFLAGS='-Werror' cmake -DCMAKE_C_COMPILER_LAUNCHER=ccache -DCMAKE_CXX_COMPILER_LAUNCHER=ccache ${flags} ..
make -j\$(nproc) generate all doc package check VERBOSE=1
"""
echo cmd
......@@ -75,6 +75,8 @@ def rocmnodename(name) {
node_name = "${rocmtest_name} && fiji";
} else if(name == "vega") {
node_name = "${rocmtest_name} && vega";
} else if(name == "navi21") {
node_name = "${rocmtest_name} && navi21";
} else if(name == "nogpu") {
return rocmtest_name;
}
......@@ -110,6 +112,10 @@ rocmtest clang_debug: rocmnode('vega') { cmake_build ->
def debug_flags = "-g -O2 -fno-omit-frame-pointer -fsanitize=${sanitizers} -fno-sanitize-recover=${sanitizers}"
cmake_build("/opt/rocm/llvm/bin/clang++", "-DCMAKE_BUILD_TYPE=debug -DMIGRAPHX_ENABLE_PYTHON=Off -DMIGRAPHX_ENABLE_GPU=Off -DMIGRAPHX_ENABLE_CPU=On -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags}'")
}
}, clang_release_navi: rocmnode('navi21') { cmake_build ->
stage('HIP Clang Release Navi') {
cmake_build("/opt/rocm/llvm/bin/clang++", "-DCMAKE_BUILD_TYPE=release")
}
}
def onnxnode(name, body) {
......
pfultz2/rocm-recipes
facebook/zstd@v1.4.5 -X subdir -DCMAKE_DIR=build/cmake
ccache@v4.1
pcre,pfultz2/pcre@8.45 -H sha256:d6f7182602a775a7d500a0cedca6449af0400c6493951513046d17615ed0bf11
danmar/cppcheck@2.6 -DHAVE_RULES=1
RadeonOpenCompute/rocm-cmake@ececd2eccae4d01e7ec154efe90ac43ebf4df317 --build
-f requirements.txt
sphinx==2.2.2
breathe==4.13.1
docutils==0.17.1
sphinx==4.2.0
breathe==4.31.0
sphinx_rtd_theme==1.0.0
# git+https://github.com/arximboldi/breathe@fix-node-parent
......@@ -18,6 +18,8 @@
#
# import os
# import sys
from datetime import date
import re
# sys.path.insert(0, os.path.abspath('.'))
# -- General configuration ------------------------------------------------
......@@ -29,7 +31,9 @@
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ['breathe', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode']
extensions = [
'breathe', 'sphinx.ext.mathjax', 'sphinx.ext.viewcode', 'sphinx_rtd_theme'
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
......@@ -45,7 +49,7 @@ master_doc = 'index'
# General information about the project.
project = u'MIGraphX'
copyright = u'2018, AMD'
copyright = u'2018-{}, AMD'.format(date.today().year)
author = u'AMD'
# The version info for the project you're documenting, acts as replacement for
......@@ -53,9 +57,12 @@ author = u'AMD'
# built documents.
#
# The short X.Y version.
version = u'0.1'
with open('../../CMakeLists.txt') as file:
    version = next((re.findall('[0-9.]+', line)[0]
                    for line in file.readlines()
                    if 'rocm_setup_version' in line))
# The full version, including alpha/beta/rc tags.
release = u'0.1'
release = version
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
......@@ -82,7 +89,7 @@ todo_include_todos = False
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'alabaster'
html_theme = 'sphinx_rtd_theme'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
......
......@@ -6,7 +6,7 @@ ARG PREFIX=/usr/local
RUN dpkg --add-architecture i386
# Add rocm repository
RUN sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/4.2/ xenial main > /etc/apt/sources.list.d/rocm.list'
RUN sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/4.5/ ubuntu main > /etc/apt/sources.list.d/rocm.list'
# Install dependencies
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
......@@ -29,6 +29,8 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
software-properties-common \
wget \
rocm-device-libs \
hip-base \
libnuma-dev \
miopen-hip \
rocblas \
zlib1g-dev && \
......
......@@ -131,6 +131,7 @@ register_migraphx_ops(
multibroadcast
multinomial
neg
nonmaxsuppression
nonzero
outline
pad
......@@ -155,6 +156,7 @@ register_migraphx_ops(
rnn_last_cell_output
rnn_last_hs_output
rnn_var_sl_last_output
roialign
round
rsqrt
scalar
......
......@@ -252,7 +252,7 @@ struct shape : MIGRAPHX_CONST_HANDLE_BASE(shape)
const size_t* pout;
size_t pout_size;
call(&migraphx_shape_lengths, &pout, &pout_size, this->get_handle_ptr());
return std::vector<size_t>(pout, pout + pout_size);
return {pout, pout + pout_size};
}
std::vector<size_t> strides() const
......@@ -260,7 +260,7 @@ struct shape : MIGRAPHX_CONST_HANDLE_BASE(shape)
const size_t* pout;
size_t pout_size;
call(&migraphx_shape_strides, &pout, &pout_size, this->get_handle_ptr());
return std::vector<size_t>(pout, pout + pout_size);
return {pout, pout + pout_size};
}
migraphx_shape_datatype_t type() const
......@@ -312,7 +312,7 @@ struct argument : MIGRAPHX_CONST_HANDLE_BASE(argument)
{
const_migraphx_shape_t pout;
call(&migraphx_argument_shape, &pout, this->get_handle_ptr());
return shape(pout);
return {pout};
}
char* data() const
......@@ -325,9 +325,8 @@ struct argument : MIGRAPHX_CONST_HANDLE_BASE(argument)
/// Generate an argument using random data
static argument generate(shape ps, size_t pseed = 0)
{
return argument(
make<migraphx_argument>(&migraphx_argument_generate, ps.get_handle_ptr(), pseed),
own{});
return {make<migraphx_argument>(&migraphx_argument_generate, ps.get_handle_ptr(), pseed),
own{}};
}
friend bool operator==(const argument& px, const argument& py)
......@@ -378,7 +377,7 @@ struct program_parameter_shapes : MIGRAPHX_HANDLE_BASE(program_parameter_shapes)
{
const_migraphx_shape_t pout;
call(&migraphx_program_parameter_shapes_get, &pout, this->get_handle_ptr(), pname);
return shape(pout);
return {pout};
}
std::vector<const char*> names() const
......@@ -438,7 +437,7 @@ struct arguments : MIGRAPHX_HANDLE_BASE(arguments), array_base<arguments>
{
const_migraphx_argument_t pout;
call(&migraphx_arguments_get, &pout, this->get_handle_ptr(), pidx);
return argument(pout);
return {pout};
}
struct iterator_read
......@@ -449,7 +448,7 @@ struct arguments : MIGRAPHX_HANDLE_BASE(arguments), array_base<arguments>
const_migraphx_argument_t pout;
call(&migraphx_arguments_get, &pout, self, pidx);
return argument(pout);
return {pout};
}
};
};
......@@ -471,7 +470,7 @@ struct shapes : MIGRAPHX_HANDLE_BASE(shapes), array_base<shapes>
{
const_migraphx_shape_t pout;
call(&migraphx_shapes_get, &pout, this->get_handle_ptr(), pidx);
return shape(pout);
return {pout};
}
struct iterator_read
......@@ -481,7 +480,7 @@ struct shapes : MIGRAPHX_HANDLE_BASE(shapes), array_base<shapes>
{
const_migraphx_shape_t pout;
call(&migraphx_shapes_get, &pout, self, pidx);
return shape(pout);
return {pout};
}
};
};
......@@ -609,7 +608,7 @@ struct operation : MIGRAPHX_HANDLE_BASE(operation)
{
std::array<char, 1024> out_name;
call(&migraphx_operation_name, out_name.data(), 1024, this->get_handle_ptr());
return std::string(out_name.data());
return {out_name.data()};
}
};
......
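The wrapper changes in this file all apply the same C++ idiom: because the function signature already names the return type, return {args...}; list-initializes that type directly, so repeating the type name at the return site is redundant. A minimal standalone sketch of the idiom, with illustrative names that are not part of this commit:

#include <cstddef>
#include <string>
#include <vector>

// The braced return list picks the matching constructor of the declared return type.
std::vector<std::size_t> lengths_from(const std::size_t* pout, std::size_t pout_size)
{
    return {pout, pout + pout_size}; // iterator-range constructor, as in shape::lengths()
}

std::string name_from(const char* buffer)
{
    return {buffer}; // const char* constructor, as in operation::name()
}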
......@@ -26,16 +26,18 @@ cpp_generator::function::set_body(const module& m, const cpp_generator::generate
{
names[ins] =
migraphx::any_cast<migraphx::builtin::param>(ins->get_operator()).parameter;
continue;
}
if(ins->name() == "@return")
else if(ins->name() == "@return")
{
assert(ins->inputs().size() == 1);
return_ins = ins->inputs().front();
}
std::string n = "z" + std::to_string(names.size());
names[ins] = n;
ss << "auto " << n << " = " << g(ins, names) << ";\n";
else
{
std::string n = "z" + std::to_string(names.size());
names[ins] = n;
ss << "auto " << n << " = " << g(ins, names) << ";\n";
}
}
ss << "return " << names.at(return_ins) << ";\n";
body = ss.str();
......@@ -84,8 +86,11 @@ void cpp_generator::fmap(const std::function<std::string(std::string)>& f) { imp
std::string cpp_generator::generate_point_op(const operation& op,
const std::vector<std::string>& args)
{
auto v = op.to_value();
return interpolate_string(op.attributes()["point_op"].to<std::string>(),
auto v = op.to_value();
auto attributes = op.attributes();
if(not attributes.contains("point_op"))
MIGRAPHX_THROW("op is missing point_op attribute: " + op.name());
return interpolate_string(attributes["point_op"].to<std::string>(),
[&](auto start, auto last) -> std::string {
auto key = trim({start, last});
if(key.empty())
......@@ -120,7 +125,12 @@ std::string cpp_generator::str() const { return impl->fs.str(); }
cpp_generator::function cpp_generator::generate_module(const module& m)
{
function f;
f.set_name(m.name()).set_types(m).set_body(
auto name = transform_string(m.name(), [](char c) {
if(with_char(::isalnum)(c) or c == '_')
return c;
return '_';
});
f.set_name(name).set_types(m).set_body(
m, [&](instruction_ref ins, const auto& names) -> std::string {
if(ins->name() == "@literal")
return shape::cpp_type(ins->get_shape().type()) + "(" +
......@@ -130,7 +140,6 @@ cpp_generator::function cpp_generator::generate_module(const module& m)
ins->inputs().end(),
std::back_inserter(args),
[&](auto i) { return names.at(i); });
auto s = this->generate_point_op(ins->get_operator(), args);
return this->generate_point_op(ins->get_operator(), args);
});
return f;
......
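The generate_module change above sanitizes the module name before emitting it as a C++ function name; together with the fuse_pointwise change below, which names submodules like main:pointwise0, the colon would otherwise produce an invalid identifier. A minimal standalone sketch of that transformation using only the standard library (the helper name is illustrative, not part of this commit):

#include <algorithm>
#include <cctype>
#include <string>

// Replace every character that is not alphanumeric or '_' with '_'.
std::string sanitize_identifier(std::string name)
{
    std::transform(name.begin(), name.end(), name.begin(), [](unsigned char c) {
        return (std::isalnum(c) != 0 or c == '_') ? static_cast<char>(c) : '_';
    });
    return name; // "main:pointwise0" -> "main_pointwise0"
}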
......@@ -480,7 +480,7 @@ struct perf : command<perf>
std::cout << "Allocating params ... " << std::endl;
auto m = c.params(p);
std::cout << "Running performance report ... " << std::endl;
p.perf_report(std::cout, n, m);
p.perf_report(std::cout, n, m, c.l.batch);
}
};
......
......@@ -11,7 +11,7 @@ inline namespace MIGRAPHX_INLINE_NS {
void eliminate_data_type::apply(module& m) const
{
static const std::vector<std::string> skip_op_names = {
"convert", "get_tuple_elem", "if", "loop"};
"convert", "get_tuple_elem", "if", "loop", "roialign"};
for(auto ins : iterator_for(m))
{
if(ins->name()[0] == '@')
......
......@@ -13,6 +13,8 @@ inline namespace MIGRAPHX_INLINE_NS {
static literal get_scalar(instruction_ref ins)
{
if(ins->name() == "contiguous")
return get_scalar(ins->inputs().front());
const auto& s = ins->get_shape();
if(not(s.elements() == 1 or s.scalar()))
return {};
......@@ -31,11 +33,16 @@ static void create_pointwise_modules(module_pass_manager& mpm)
{
if(not ins->get_operator().attributes().get("pointwise", false))
continue;
auto* pm = mpm.create_module("pointwise" + std::to_string(n++));
// Skip convert op for now
if(ins->name() == "convert")
continue;
assert(ins->get_operator().attributes().contains("point_op"));
auto* pm = mpm.create_module(mpm.get_module().name() + ":pointwise" + std::to_string(n++));
pm->set_bypass();
std::unordered_map<instruction_ref, instruction_ref> param_map;
std::vector<instruction_ref> pointwise_inputs;
std::size_t i = 0;
for(auto input : ins->inputs())
{
if(contains(param_map, input))
......@@ -44,8 +51,9 @@ static void create_pointwise_modules(module_pass_manager& mpm)
if(scalar.empty())
{
pointwise_inputs.push_back(input);
param_map[input] = pm->add_parameter("x" + std::to_string(param_map.size()),
shape{input->get_shape().type()});
param_map[input] =
pm->add_parameter("x" + std::to_string(i), shape{input->get_shape().type()});
i++;
}
else
{
......@@ -68,6 +76,7 @@ static void create_pointwise_modules(module_pass_manager& mpm)
static std::vector<instruction_ref> append_pointwise_module(instruction_ref ins,
instruction_ref output)
{
assert(contains(output->inputs(), ins));
module_ref pm = ins->module_inputs().at(0);
module_ref xm = output->module_inputs().at(0);
......@@ -75,14 +84,18 @@ static std::vector<instruction_ref> append_pointwise_module(instruction_ref ins,
assert(last->name() == "@return");
assert(last->inputs().size() == 1);
assert(pm->get_parameter_names().size() == ins->inputs().size());
assert(xm->get_parameter_names().size() == output->inputs().size());
std::vector<instruction_ref> inputs = ins->inputs();
std::unordered_map<instruction_ref, instruction_ref> map_ins;
std::unordered_map<instruction_ref, instruction_ref> input_map;
// Copy inputs to input_map
for(auto i : range(inputs.size()))
{
auto input = inputs[i];
auto param = pm->get_parameter("x" + std::to_string(i));
auto input = inputs[i];
auto param = pm->get_parameter("x" + std::to_string(i));
assert(param != pm->end());
input_map[input] = param;
}
// Add the new parameter and additional inputs
......@@ -90,6 +103,7 @@ static std::vector<instruction_ref> append_pointwise_module(instruction_ref ins,
{
auto input = output->inputs()[i];
auto param = xm->get_parameter("x" + std::to_string(i));
assert(param != xm->end());
if(input == ins)
{
map_ins[param] = last->inputs().front();
......
......@@ -35,7 +35,7 @@ struct if_op
MIGRAPHX_THROW("IF: output shapes of submodules must be the same.");
}
return shape(out_shapes0);
return {out_shapes0};
}
argument compute(const shape&,
......
......@@ -54,7 +54,7 @@ struct loop
ins_out_shapes.push_back({out_s.type(), lens});
}
return shape(ins_out_shapes);
return {ins_out_shapes};
}
struct ref_loop
......
#ifndef MIGRAPHX_GUARD_OPERATORS_NONMAXSUPPRESSION_HPP
#define MIGRAPHX_GUARD_OPERATORS_NONMAXSUPPRESSION_HPP
#include <cmath>
#include <queue>
#include <cstdint>
#include <iterator>
#include <migraphx/config.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/float_equal.hpp>
#include <migraphx/algorithm.hpp>
#include <migraphx/tensor_view.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/output_iterator.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
struct nonmaxsuppression
{
bool center_point_box = false;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.center_point_box, "center_point_box"));
}
std::string name() const { return "nonmaxsuppression"; }
shape compute_shape(std::vector<shape> inputs) const
{
// requires at least 2 inputs
check_shapes{inputs, *this}.standard();
check_shapes{{inputs.at(0), inputs.at(1)}, *this}.only_dims(3);
auto lens = inputs.front().lens();
// check input shape
if(lens[1] != inputs.at(1).lens()[2])
{
MIGRAPHX_THROW("NonMaxSuppression: dimension mismatch between first and second input!");
}
std::vector<int64_t> out_lens(2);
out_lens.at(0) = lens.at(1);
out_lens.at(1) = 3;
return {shape::int64_type, out_lens};
}
struct box
{
std::array<float, 2> x;
std::array<float, 2> y;
void sort()
{
std::sort(x.begin(), x.end());
std::sort(y.begin(), y.end());
}
std::array<float, 2>& operator[](std::size_t i) { return i == 0 ? x : y; }
float area() const
{
assert(std::is_sorted(x.begin(), x.end()));
assert(std::is_sorted(y.begin(), y.end()));
return (x[1] - x[0]) * (y[1] - y[0]);
}
};
template <class T>
box batch_box(const T* boxes, std::size_t bidx) const
{
box result{};
const T* start = boxes + 4 * bidx;
if(center_point_box)
{
float half_width = start[2] / 2.0f;
float half_height = start[3] / 2.0f;
float x_center = start[0];
float y_center = start[1];
result.x = {x_center - half_width, x_center + half_width};
result.y = {y_center - half_height, y_center + half_height};
}
else
{
result.x = {start[1], start[3]};
result.y = {start[0], start[2]};
}
return result;
}
inline bool suppress_by_iou(box b1, box b2, float iou_threshold) const
{
b1.sort();
b2.sort();
box intersection{};
for(auto i : range(2))
{
intersection[i][0] = std::max(b1[i][0], b2[i][0]);
intersection[i][1] = std::min(b1[i][1], b2[i][1]);
}
std::vector<std::array<float, 2>> bbox = {intersection.x, intersection.y};
if(std::any_of(bbox.begin(), bbox.end(), [](auto bx) {
return not std::is_sorted(bx.begin(), bx.end());
}))
{
return false;
}
const float area1 = b1.area();
const float area2 = b2.area();
const float intersection_area = intersection.area();
const float union_area = area1 + area2 - intersection_area;
if(area1 <= .0f or area2 <= .0f or union_area <= .0f)
{
return false;
}
const float intersection_over_union = intersection_area / union_area;
return intersection_over_union > iou_threshold;
}
argument compute(const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
result.visit([&](auto out) { std::fill(out.begin(), out.end(), 0); });
std::size_t max_output_boxes_per_class = 0;
float iou_threshold = 0.0f;
float score_threshold = 0.0f;
if(args.size() > 2)
{
max_output_boxes_per_class = args.at(2).at<std::size_t>();
}
// if max_output_boxes_per_class is 0, there is no output
if(max_output_boxes_per_class == 0)
{
return result;
}
if(args.size() > 3)
{
iou_threshold = args.at(3).at<float>();
}
if(args.size() > 4)
{
score_threshold = args.at(4).at<float>();
}
const auto& lens = args.at(1).get_shape().lens();
auto batch_num = lens[0];
auto class_num = lens[1];
auto box_num = args.at(0).get_shape().lens()[1];
std::vector<std::pair<float, int64_t>> selected_boxes_inside_class;
std::vector<int64_t> selected_indices;
selected_boxes_inside_class.reserve(output_shape.elements());
auto scores = make_view<float>(args.at(1).get_shape(), args.at(1).cast<float>());
const float* boxes = args.at(0).cast<float>();
shape comp_s{shape::float_type, {batch_num, class_num}};
shape_for_each(comp_s, [&](auto idx) {
auto bidx = idx[0];
auto cidx = idx[1];
std::size_t score_offset = (bidx * class_num + cidx) * box_num;
const float* batch_boxes = boxes + bidx * box_num * 4;
std::priority_queue<std::pair<float, int64_t>> sorted_boxes;
auto insert_to_sorted_boxes =
make_function_output_iterator([&](const auto& x) { sorted_boxes.push(x); });
int64_t box_idx = 0;
transform_if(scores.begin() + score_offset,
scores.begin() + score_offset + box_num,
insert_to_sorted_boxes,
[&](auto sc) {
box_idx++;
return sc >= score_threshold;
},
[&](auto sc) { return std::make_pair(sc, box_idx - 1); });
selected_boxes_inside_class.clear();
// Get the next box with top score, filter by iou_threshold
while(!sorted_boxes.empty() &&
selected_boxes_inside_class.size() < max_output_boxes_per_class)
{
const std::pair<float, int64_t>& next_top_score = sorted_boxes.top();
// Check against the boxes already selected for this class; suppress the candidate if its
// IOU (Intersection Over Union) with any of them exceeds the threshold
bool not_selected = std::any_of(
selected_boxes_inside_class.begin(),
selected_boxes_inside_class.end(),
[&](auto selected_index) {
return this->suppress_by_iou(batch_box(batch_boxes, next_top_score.second),
batch_box(batch_boxes, selected_index.second),
iou_threshold);
});
if(not not_selected)
{
selected_boxes_inside_class.push_back(next_top_score);
selected_indices.push_back(bidx);
selected_indices.push_back(cidx);
selected_indices.push_back(next_top_score.second);
}
sorted_boxes.pop();
}
});
result.visit([&](auto out) {
std::copy(selected_indices.begin(), selected_indices.end(), out.begin());
});
return result;
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
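The core of the new operator is the suppress_by_iou test: two boxes are compared by their intersection over union, and a candidate is dropped when that ratio exceeds iou_threshold. A minimal standalone sketch of the same test for corner-format boxes whose coordinates are already sorted (the struct and function names are illustrative, not part of this commit):

#include <algorithm>
#include <array>

struct simple_box
{
    std::array<float, 2> x; // {x1, x2}, sorted
    std::array<float, 2> y; // {y1, y2}, sorted
};

bool exceeds_iou(const simple_box& a, const simple_box& b, float iou_threshold)
{
    float ix0 = std::max(a.x[0], b.x[0]);
    float ix1 = std::min(a.x[1], b.x[1]);
    float iy0 = std::max(a.y[0], b.y[0]);
    float iy1 = std::min(a.y[1], b.y[1]);
    if(ix1 <= ix0 or iy1 <= iy0) // no overlap, nothing to suppress
        return false;
    float inter  = (ix1 - ix0) * (iy1 - iy0);
    float area_a = (a.x[1] - a.x[0]) * (a.y[1] - a.y[0]);
    float area_b = (b.x[1] - b.x[0]) * (b.y[1] - b.y[0]);
    float uni    = area_a + area_b - inter;
    return uni > 0.0f and inter / uni > iou_threshold;
}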
......@@ -26,19 +26,17 @@ struct pointwise
auto pnames = pm->get_parameter_names();
std::sort(pnames.begin(), pnames.end());
check_shapes{inputs, *this}.has(pnames.size()).same_dims();
for(auto i : range(pnames.size()))
{
auto s1 = pm->get_parameter(pnames[i])->get_shape();
auto s2 = inputs[i];
if(s1.type() != s2.type())
MIGRAPHX_THROW("Mismatch type");
}
if(pm->get_output_shapes().size() != 1)
MIGRAPHX_THROW("submodule should have only one output.");
auto type = pm->get_output_shapes().front().type();
// Scalar output if all inputs are scalar
if(inputs.front().elements() == 1 and
all_of(inputs, [](const auto& s) { return s.scalar(); }))
return shape{type};
return shape::from_permutation(type, inputs.front().lens(), find_permutation(inputs));
}
......
......@@ -9,6 +9,7 @@ namespace op {
struct prelu : binary<prelu>
{
std::string point_op() const { return "(${0} < 0) ? (${0} * ${1}) : ${0}"; }
auto apply() const
{
return [](auto x, auto slope) { return ((x < 0) ? (x * slope) : x); };
......
......@@ -9,6 +9,7 @@ namespace op {
struct recip : unary<recip>
{
std::string point_op() const { return "1 / ${0}"; }
auto apply() const
{
return [](auto x) { return 1 / x; };
......
#ifndef MIGRAPHX_GUARD_OPERATORS_ROIALIGN_HPP
#define MIGRAPHX_GUARD_OPERATORS_ROIALIGN_HPP
#include <limits>
#include <migraphx/check_shapes.hpp>
#include <migraphx/config.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/shape_for_each.hpp>
#include <cmath>
#include <numeric>
#include <utility>
#include <vector>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
struct roialign
{
std::string coord_trans_mode = "half_pixel";
std::string mode = "avg";
int64_t output_height = 1;
int64_t output_width = 1;
int64_t sampling_ratio = 0;
float spatial_scale = 1.0f;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.coord_trans_mode, "coordinate_transformation_mode"),
f(self.mode, "mode"),
f(self.output_height, "output_height"),
f(self.output_width, "output_width"),
f(self.sampling_ratio, "sampling_ratio"),
f(self.spatial_scale, "spatial_scale"));
}
std::string name() const { return "roialign"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(3).standard();
auto x_lens = inputs.at(0).lens();
auto roi_lens = inputs.at(1).lens();
auto bi_lens = inputs.at(2).lens();
auto type = inputs.at(0).type();
// validate the input shapes
if(bi_lens.size() != 1)
{
MIGRAPHX_THROW("ROIALIGN: batch indices should be 1 dimension!");
}
if(roi_lens.size() != 2 or roi_lens.at(1) != 4)
{
MIGRAPHX_THROW(
"ROIALIGN: rois should be 2 dimensions, and the second dim should be 4!");
}
if(roi_lens.front() != bi_lens.front())
{
MIGRAPHX_THROW("ROIALIGN: rois and batch indices inputs should have the same number!");
}
std::vector<std::size_t> out_lens = x_lens;
out_lens[0] = roi_lens[0];
out_lens[2] = output_height;
out_lens[3] = output_width;
return {type, out_lens};
}
struct pos_weight
{
// neighbor indices for the bilinear interpolation
std::array<std::size_t, 4> pos = {0, 0, 0, 0};
// neighbor weights for the bilinear interpolation
std::array<float, 4> w = {0.0f, 0.0f, 0.0f, 0.0f};
};
auto calc_pos_weight(const std::array<std::size_t, 2>& dims,
const shape& comp_s,
const std::array<float, 2>& roi_start,
const std::array<float, 2>& bin_size,
const std::array<std::size_t, 2>& bin_grid_size) const
{
std::vector<pos_weight> results(bin_grid_size[0] * bin_grid_size[1] * output_height *
output_width);
shape_for_each(comp_s, [&](auto idx) {
std::array<std::size_t, 2> p = {idx[0], idx[1]};
std::array<std::size_t, 2> i = {idx[2], idx[3]};
auto index = comp_s.index(idx);
std::array<float, 2> xy{};
std::array<int64_t, 2> low{};
std::array<int64_t, 2> high{};
for(auto ii : range(p.size()))
{
xy[ii] = roi_start[ii] + p[ii] * bin_size[ii] +
(i[ii] + .5f) * bin_size[ii] / bin_grid_size[ii];
xy[ii] = (coord_trans_mode == "output_half_pixel") ? (xy[ii] - 0.5f) : xy[ii];
if(xy[ii] < -1.0 or xy[ii] > dims[ii])
{
results[index] = pos_weight{};
return;
}
xy[ii] = std::max(xy[ii], 0.0f);
low[ii] = xy[ii];
high[ii] = low[ii] + 1;
if(low[ii] >= dims[ii] - 1)
{
xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
}
}
results[index].pos = {low[0] * dims[1] + low[1],
low[0] * dims[1] + high[1],
high[0] * dims[1] + low[1],
high[0] * dims[1] + high[1]};
float ly = xy[0] - low[0];
float lx = xy[1] - low[1];
float hy = 1.0f - ly;
float hx = 1.0f - lx;
// save weights and indices
results[index].w = {hy * hx, hy * lx, ly * hx, ly * lx};
});
return results;
}
struct max_pool
{
double init() { return std::numeric_limits<double>::lowest(); }
double operator()(double x, double y) { return std::max(x, y); }
double final(double x, std::size_t) { return (x); }
};
struct avg_pool
{
double init() { return 0.0; }
double operator()(double x, double y) { return x + y; }
double final(double x, std::size_t y) { return (y == 0) ? 0.0 : (x / y); }
};
template <class T, class Op>
std::tuple<double, int64_t> calc_pooling(const T& data,
const std::array<std::size_t, 2>& bin_grid_size,
const std::vector<pos_weight>& pos_weights,
int64_t index,
Op op) const
{
double output_val = op.init();
const int64_t count = bin_grid_size[0] * bin_grid_size[1];
dfor(bin_grid_size[0], bin_grid_size[1])([&](auto, auto) {
const auto& pc = pos_weights[index];
std::array<double, 4> wv;
std::transform(
pc.w.begin(), pc.w.end(), pc.pos.begin(), wv.begin(), [&](auto w, auto pos) {
return *(data + pos) * w;
});
output_val = std::accumulate(wv.begin(), wv.end(), output_val, op);
index += 1;
});
output_val = op.final(output_val, count);
return {output_val, index};
}
argument compute(const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
const auto& out_lens = output_shape.lens();
int64_t n_rois = out_lens[0];
std::size_t channels = out_lens[1];
// output dims of height and width; in all two-element arrays below, the first entry
// is height and the second is width
std::array<std::size_t, 2> out_dims = {out_lens[2], out_lens[3]};
const auto& x_lens = args.at(0).get_shape().lens();
// input dims of height and width
std::array<std::size_t, 2> in_dims = {x_lens[2], x_lens[3]};
auto roi_s = args.at(1).get_shape();
visit_all(result, args.at(0), args.at(1))([&](auto output, auto x, auto roi) {
const auto* batch_indices = args.at(2).cast<int64_t>();
par_for(n_rois, [&](auto n) {
const auto bottom_data = x.begin();
const auto roi_batch_ind = batch_indices[n];
// Do not use rounding; this implementation detail is critical
std::array<float, 2> roi_starts = {
static_cast<float>(roi[roi_s.index({n, 1})] * spatial_scale),
static_cast<float>(roi[roi_s.index({n, 0})] * spatial_scale)};
std::array<float, 2> roi_ends = {
static_cast<float>(roi[roi_s.index({n, 3})] * spatial_scale),
static_cast<float>(roi[roi_s.index({n, 2})] * spatial_scale)};
// Force malformed ROIs to be 1x1
std::array<float, 2> roi_size{};
std::array<float, 2> bin_size{};
std::array<std::size_t, 2> bin_grid_size{};
for(auto ii : range(roi_size.size()))
{
roi_size[ii] = roi_ends[ii] - roi_starts[ii];
roi_size[ii] = std::max(roi_size[ii], 1.0f);
bin_size[ii] = roi_size[ii] / out_dims[ii];
bin_grid_size[ii] = (sampling_ratio > 0)
? sampling_ratio
: std::ceil(roi_size[ii] / out_dims[ii]);
}
// precalculate the indices and weights shared by all channels;
// this is the key optimization
std::vector<std::size_t> comp_lens = {
out_dims[0], out_dims[1], bin_grid_size[0], bin_grid_size[1]};
shape comp_s{shape::float_type, comp_lens};
auto pre_calc =
this->calc_pos_weight(in_dims, comp_s, roi_starts, bin_size, bin_grid_size);
std::vector<std::size_t> comp_lens1 = {channels, out_dims[0], out_dims[1]};
shape comp_s1{migraphx::shape::float_type, comp_lens1};
std::vector<int64_t> vec_index(channels, 0);
shape_for_each(comp_s1, [&](auto idx) {
auto c = idx[0];
auto ph = idx[1];
auto pw = idx[2];
const auto offset_bottom_data =
bottom_data + static_cast<int64_t>((roi_batch_ind * channels + c) *
in_dims[0] * in_dims[1]);
double output_val;
std::tie(output_val, vec_index[c]) =
(mode == "avg") ? this->calc_pooling(offset_bottom_data,
bin_grid_size,
pre_calc,
vec_index[c],
avg_pool{})
: this->calc_pooling(offset_bottom_data,
bin_grid_size,
pre_calc,
vec_index[c],
max_pool{});
output(n, c, ph, pw) = output_val;
});
});
});
return result;
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
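Most of the work in roialign::compute is the bilinear interpolation that calc_pos_weight and calc_pooling split between them: each sample point (y, x) inside a bin is blended from its four neighbouring feature-map cells with weights {hy*hx, hy*lx, ly*hx, ly*lx}. A minimal standalone sketch of that interpolation for one point in a row-major height x width grid, assuming 0 <= y < height and 0 <= x < width (the function name is illustrative, not part of this commit):

#include <algorithm>
#include <cstddef>
#include <vector>

float bilinear_sample(
    const std::vector<float>& data, std::size_t height, std::size_t width, float y, float x)
{
    std::size_t y_low  = static_cast<std::size_t>(y);
    std::size_t x_low  = static_cast<std::size_t>(x);
    std::size_t y_high = std::min(y_low + 1, height - 1);
    std::size_t x_high = std::min(x_low + 1, width - 1);
    float ly = y - y_low; // fractional offsets within the cell
    float lx = x - x_low;
    float hy = 1.0f - ly;
    float hx = 1.0f - lx;
    return hy * hx * data[y_low * width + x_low] + hy * lx * data[y_low * width + x_high] +
           ly * hx * data[y_high * width + x_low] + ly * lx * data[y_high * width + x_high];
}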
......@@ -18,6 +18,7 @@ namespace op {
struct sigmoid : unary<sigmoid>
{
std::string point_op() const { return "1.f / (1.f + ${function:exp}(-${0}))"; }
auto apply() const
{
return [](auto x) { return 1.f / (1.f + std::exp(-x)); };
......