Unverified Commit 1b098fd7 authored by Paul Fultz II's avatar Paul Fultz II Committed by GitHub
Browse files

Merge branch 'develop' into type-string-driver

parents 05f2ee1c c0398ded
#ifndef MIGRAPHX_GUARD_OPERATORS_LOGICAL_XOR_HPP
#define MIGRAPHX_GUARD_OPERATORS_LOGICAL_XOR_HPP
#include <migraphx/op/binary.hpp>
#include <migraphx/operation.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/config.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
/// Elementwise logical XOR. Each operand is first converted to bool, and the
/// result is true exactly when the two boolean values differ.
struct logical_xor : binary<logical_xor>
{
    std::string point_function() const { return "^"; }
    auto apply() const
    {
        return [](auto x, auto y) {
            const bool lhs = static_cast<bool>(x);
            const bool rhs = static_cast<bool>(y);
            // bool inequality is exactly logical XOR
            return lhs != rhs;
        };
    }
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_OPERATORS_LOGSOFTMAX_HPP
#define MIGRAPHX_GUARD_OPERATORS_LOGSOFTMAX_HPP
#include <migraphx/operation.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/value.hpp>
#include <migraphx/op/normalize_attribute.hpp>
#include <migraphx/config.hpp>
namespace migraphx {
......@@ -11,7 +12,7 @@ namespace op {
struct logsoftmax
{
int axis = 1;
int64_t axis = 1;
template <class Self, class F>
static auto reflect(Self& self, F f)
......@@ -19,16 +20,25 @@ struct logsoftmax
return pack(f(self.axis, "axis"));
}
value attributes() const
{
value normalize;
normalize["axis"] = value::array{normalize_attribute::include_min};
return {{"normalize_axes", normalize}};
}
std::string name() const { return "logsoftmax"; }
shape compute_shape(std::vector<shape> inputs) const
shape normalize_compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs}.has(1).standard();
if(axis < 0 || axis >= inputs[0].lens().size())
if(inputs.at(0).packed())
{
return inputs.at(0);
}
else
{
MIGRAPHX_THROW("LogSoftMax: input axis value " + std::to_string(axis) +
" is out of range");
auto lens = inputs.at(0).lens();
return {inputs.at(0).type(), lens};
}
return inputs.at(0);
}
auto output() const
......
#ifndef MIGRAPHX_GUARD_OPERATORS_LOOP_HPP
#define MIGRAPHX_GUARD_OPERATORS_LOOP_HPP
#include <migraphx/check_shapes.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/functional.hpp>
#include <migraphx/config.hpp>
#include <migraphx/module.hpp>
#include <migraphx/run_loop.hpp>
#include <migraphx/ranges.hpp>
#include <cmath>
#include <string>
#include <utility>
#include <set>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
// Control-flow operator that repeatedly runs its single submodule, up to
// max_iterations times or until the loop condition becomes false
// (ONNX Loop-style semantics -- NOTE(review): confirm against the importer).
struct loop
{
    // Upper bound on iteration count; also becomes the leading dimension of
    // every scan output in compute_shape.
    int64_t max_iterations = 10;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack(f(self.max_iterations, "max_iterations"));
    }

    std::string name() const { return "loop"; }

    // inputs: [0] trip count, [1] initial condition, [2..] loop-carried
    // dependencies. Submodule outputs: [0] condition, then one output per
    // carried dependency, then scan outputs. Carried-dependency shapes pass
    // through unchanged; each scan output gains a leading max_iterations dim.
    shape compute_shape(const std::vector<shape>& inputs, std::vector<module_ref> mods) const
    {
        check_shapes{inputs, *this}.standard();
        if(mods.size() != 1)
        {
            MIGRAPHX_THROW("LOOP: operator should have one submodule.");
        }

        const auto& mod     = mods.front();
        auto mod_out_shapes = mod->get_output_shapes();
        auto dep_param_num  = inputs.size() - 2;
        // first item of the mod output shapes is condition used in loop,
        // which is not needed to compute output shape
        mod_out_shapes.erase(mod_out_shapes.begin());
        // carried dependencies keep their per-iteration shapes
        std::vector<shape> ins_out_shapes(mod_out_shapes.begin(),
                                          mod_out_shapes.begin() + dep_param_num);
        mod_out_shapes.erase(mod_out_shapes.begin(), mod_out_shapes.begin() + dep_param_num);
        // remaining outputs are scan outputs: prepend the iteration dimension
        for(const auto& out_s : mod_out_shapes)
        {
            auto lens = out_s.lens();
            lens.insert(lens.begin(), max_iterations);
            ins_out_shapes.push_back({out_s.type(), lens});
        }

        return {ins_out_shapes};
    }

    // Host-side callbacks handed to run_loop for the reference implementation.
    struct ref_loop
    {
        int64_t max_iterations = 0;

        // Read a scalar out of an argument into dst.
        template <class T>
        void copy(context&, const argument& src, T& dst) const
        {
            dst = *src.cast<T>();
        }

        // Write a scalar value into an argument.
        template <class T>
        void copy(context&, T src, const argument& dst) const
        {
            *dst.cast<T>() = src;
        }

        // Byte-copy each per-iteration state tensor into slot `iter` of its
        // concatenated scan-output buffer.
        void append(const std::vector<argument>& iter_state,
                    const std::vector<argument>& concatenated_outputs,
                    int iter) const
        {
            assert(iter_state.size() == concatenated_outputs.size());
            for(auto i : range(iter_state.size()))
            {
                const auto& iter_stat = iter_state.at(i);
                const auto& scan_out  = concatenated_outputs.at(i);

                auto* in_data         = iter_stat.data();
                auto* out_data        = scan_out.data();
                std::size_t out_size  = iter_stat.get_shape().bytes();
                assert((iter + 1) * out_size <= scan_out.get_shape().bytes());
                std::copy(in_data, in_data + out_size, out_data + iter * out_size);
            }
        }

        // Zero-fill the slots that were never written because the loop exited
        // after `iter` iterations (before reaching max_iterations).
        void set_zero(context&, const std::vector<argument>& concatenated_outputs, int iter) const
        {
            if(iter >= max_iterations)
                return;

            for(const auto& out : concatenated_outputs)
            {
                auto s    = out.get_shape();
                auto size = s.bytes() / max_iterations;
                std::fill(out.data() + iter * size, out.data() + max_iterations * size, 0);
            }
        }

        std::unordered_map<std::string, int> get_output_params(const module&) const { return {}; }
    };

    argument compute(context& ctx,
                     const shape& out_shape,
                     const std::vector<argument>& args,
                     const std::vector<module_ref>& mods,
                     const std::function<std::vector<argument>(
                         module_ref&, const std::unordered_map<std::string, argument>&)>& run) const
    {
        // wrap up the arguments vector, so ref and gpu impl are the same
        auto cpy_args = args;
        bool in_cond  = args.at(1).at<bool>();
        bool cond     = in_cond;
        int64_t iter  = 0;
        // insert iter and cond used in the loop
        // (arguments wrap host variables so the submodule can update them)
        auto s_cond = args.at(1).get_shape();
        auto s_iter = args.at(0).get_shape();
        cpy_args.push_back({s_iter, &iter});
        cpy_args.push_back({s_cond, &cond});
        // carried dependencies are appended again after iter/cond
        cpy_args.insert(cpy_args.end(), args.begin() + 2, args.end());
        // add cond and mod outputs to the argument list
        cpy_args.push_back(argument(s_cond));
        cpy_args.push_back(argument(out_shape));

        // run loop
        return run_loop(ref_loop{max_iterations}, ctx, cpy_args, mods, run);
    }
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
......@@ -2,7 +2,6 @@
#define MIGRAPHX_GUARD_OPERATORS_LRN_HPP
#include <array>
#include <migraphx/operation.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/streamutils.hpp>
......
......@@ -4,6 +4,8 @@
#include <array>
#include <migraphx/op/common.hpp>
#include <migraphx/operation.hpp>
#include <migraphx/op/sigmoid.hpp>
#include <migraphx/op/tanh.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/streamutils.hpp>
......@@ -31,6 +33,7 @@ struct lstm
return pack(f(self.hidden_size, "hidden_size"),
f(self.actv_funcs, "actv_func"),
f(self.direction, "direction"),
f(self.clip, "clip"),
f(self.input_forget, "input_forget"));
}
......
......@@ -3,7 +3,6 @@
#include <array>
#include <migraphx/op/binary.hpp>
#include <migraphx/operation.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/streamutils.hpp>
......@@ -19,6 +18,12 @@ namespace op {
struct max : binary<max>
{
value attributes() const
{
auto a = base_attributes();
a["commutative"] = true;
return a;
}
auto apply() const
{
return [](auto x, auto y) { return std::max(x, y); };
......
......@@ -3,7 +3,6 @@
#include <array>
#include <migraphx/op/binary.hpp>
#include <migraphx/operation.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/streamutils.hpp>
......@@ -19,6 +18,12 @@ namespace op {
struct min : binary<min>
{
value attributes() const
{
auto a = base_attributes();
a["commutative"] = true;
return a;
}
auto apply() const
{
return [](auto x, auto y) { return std::min(x, y); };
......
......@@ -3,7 +3,6 @@
#include <array>
#include <migraphx/op/binary.hpp>
#include <migraphx/operation.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/streamutils.hpp>
......@@ -19,6 +18,13 @@ namespace op {
struct mul : binary<mul>
{
value attributes() const
{
auto a = base_attributes();
a["commutative"] = true;
return a;
}
std::string point_function() const { return "*"; }
auto apply() const
{
return [](auto x, auto y) { return x * y; };
......
......@@ -2,13 +2,13 @@
#define MIGRAPHX_GUARD_OPERATORS_MULTIBROADCAST_HPP
#include <array>
#include <migraphx/operation.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/streamutils.hpp>
#include <migraphx/literal.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/config.hpp>
#include <migraphx/lifetime.hpp>
#include <cmath>
#include <utility>
......@@ -23,7 +23,7 @@ struct multibroadcast
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.output_lens, "output_lens"));
return pack(f(self.output_lens, "out_lens"));
}
std::string name() const { return "multibroadcast"; }
......@@ -67,7 +67,7 @@ struct multibroadcast
}
argument compute(shape output_shape, std::vector<argument> args) const
{
return {std::move(output_shape), std::move(args.at(0).data)};
return args[0].reshape(output_shape);
}
std::ptrdiff_t output_alias(const std::vector<shape>&) const { return 0; }
};
......
#ifndef MIGRAPHX_GUARD_OPERATORS_MULTINOMIAL_HPP
#define MIGRAPHX_GUARD_OPERATORS_MULTINOMIAL_HPP
#include <migraphx/operation.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/par_for.hpp>
#include <random>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
// Samples class indices from per-batch categorical distributions
// (ONNX Multinomial-style -- NOTE(review): confirm against importer usage).
struct multinomial
{
    // Output index type; restricted to int32/int64 in compute_shape.
    shape::type_t dtype = shape::type_t::int32_type;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack(f(self.dtype, "dtype"));
    }

    std::string name() const { return "multinomial"; }

    // inputs: [0] per-batch cumulative distribution (batch, classes),
    //         [1] random draws (batch, samples); both must be rank 2.
    // Output: (batch, samples) tensor of sampled class indices.
    shape compute_shape(std::vector<shape> inputs) const
    {
        check_shapes{inputs, *this}.has(2).only_dims(2);
        size_t sample_size = inputs.back().lens().back();

        if(not contains({shape::int32_type, shape::int64_type}, dtype))
            MIGRAPHX_THROW(
                "Multinomial: Invalid output type. Valid types are int32_type and int64_type.");

        return {dtype, {inputs.front().lens().front(), sample_size}};
    }

    argument compute(const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        size_t batch_size  = output_shape.lens().front();
        size_t class_size  = args[0].get_shape().lens().back();
        size_t sample_size = output_shape.lens().back();

        visit_all(args[0], args[1])([&](auto cdf, auto dist) {
            result.visit([&](auto output) {
                // One independent sample per (batch, sample) pair.
                par_for(batch_size * sample_size, [&](auto i) {
                    // idx[0] is the batch row for this flat output position
                    auto idx       = args[1].get_shape().multi(i);
                    auto cdf_begin = cdf.begin() + (idx[0] * class_size);
                    auto cdf_end   = cdf_begin + class_size;

                    // Scale the draw by the last CDF entry so an unnormalized
                    // cumulative distribution still samples correctly, then
                    // binary-search for the first class whose CDF exceeds it.
                    // NOTE(review): assumes dist values lie in [0, 1) -- confirm.
                    auto sample_iter =
                        std::upper_bound(cdf_begin, cdf_end, dist[i] * *(std::prev(cdf_end)));
                    output[i] = std::distance(cdf_begin, sample_iter);
                });
            });
        });

        return result;
    }
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_RTGLIB_NAME_HPP
#define MIGRAPHX_GUARD_RTGLIB_NAME_HPP
#include <array>
#include <migraphx/operation.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/streamutils.hpp>
#include <migraphx/literal.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/type_name.hpp>
#include <migraphx/config.hpp>
#include <cmath>
#include <utility>
#include <migraphx/type_name.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......
......@@ -3,7 +3,6 @@
#include <array>
#include <migraphx/op/unary.hpp>
#include <migraphx/operation.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/streamutils.hpp>
......@@ -19,6 +18,7 @@ namespace op {
struct neg : unary<neg>
{
std::string point_function() const { return "-"; }
auto apply() const
{
return [](auto x) { return -x; };
......
#ifndef MIGRAPHX_GUARD_OPERATORS_NONMAXSUPPRESSION_HPP
#define MIGRAPHX_GUARD_OPERATORS_NONMAXSUPPRESSION_HPP
#include <cmath>
#include <queue>
#include <cstdint>
#include <iterator>
#include <migraphx/config.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/float_equal.hpp>
#include <migraphx/algorithm.hpp>
#include <migraphx/tensor_view.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/output_iterator.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
// Greedy non-maximum suppression over candidate bounding boxes: per
// (batch, class), keeps the highest-scoring boxes while discarding any box
// whose IOU with an already-selected box exceeds the given threshold
// (ONNX NonMaxSuppression-style semantics -- NOTE(review): confirm against
// the importer).
struct nonmaxsuppression
{
    // When true, boxes are [x_center, y_center, width, height]; otherwise
    // they are two corner coordinates (see batch_box for the layout).
    bool center_point_box = false;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack(f(self.center_point_box, "center_point_box"));
    }

    std::string name() const { return "nonmaxsuppression"; }

    // inputs: [0] boxes (batch, num_boxes, 4), [1] scores (batch, classes,
    // num_boxes); optional trailing scalars are read in compute().
    // Output: (num_boxes, 3) int64 rows of {batch, class, box} indices,
    // zero-padded past the number of selections actually made.
    shape compute_shape(std::vector<shape> inputs) const
    {
        // requires at least 2 inputs
        check_shapes{inputs, *this}.standard();
        check_shapes{{inputs.at(0), inputs.at(1)}, *this}.only_dims(3);

        auto lens = inputs.front().lens();
        // check input shape: num_boxes must agree between boxes and scores
        if(lens[1] != inputs.at(1).lens()[2])
        {
            MIGRAPHX_THROW("NonMaxSuppression: dimension mismatch between first and second input!");
        }

        std::vector<int64_t> out_lens(2);
        out_lens.at(0) = lens.at(1);
        out_lens.at(1) = 3;
        return {shape::int64_type, out_lens};
    }

    // Axis-aligned box stored as its x-interval and y-interval.
    struct box
    {
        std::array<float, 2> x;
        std::array<float, 2> y;

        // Order each interval's endpoints ascending.
        void sort()
        {
            std::sort(x.begin(), x.end());
            std::sort(y.begin(), y.end());
        }

        std::array<float, 2>& operator[](std::size_t i) { return i == 0 ? x : y; }

        // Requires sorted intervals (asserted).
        float area() const
        {
            assert(std::is_sorted(x.begin(), x.end()));
            assert(std::is_sorted(y.begin(), y.end()));
            return (x[1] - x[0]) * (y[1] - y[0]);
        }
    };

    // Decode the 4 floats of box `bidx` within this batch's box array.
    template <class T>
    box batch_box(const T* boxes, std::size_t bidx) const
    {
        box result{};
        const T* start = boxes + 4 * bidx;
        if(center_point_box)
        {
            // [x_center, y_center, width, height]
            float half_width  = start[2] / 2.0f;
            float half_height = start[3] / 2.0f;
            float x_center    = start[0];
            float y_center    = start[1];
            result.x          = {x_center - half_width, x_center + half_width};
            result.y          = {y_center - half_height, y_center + half_height};
        }
        else
        {
            // corner format: x from elements 1 and 3, y from elements 0 and 2
            result.x = {start[1], start[3]};
            result.y = {start[0], start[2]};
        }

        return result;
    }

    // True when the IOU (intersection over union) of b1 and b2 exceeds
    // iou_threshold; degenerate/disjoint boxes never suppress.
    inline bool suppress_by_iou(box b1, box b2, float iou_threshold) const
    {
        b1.sort();
        b2.sort();

        box intersection{};
        for(auto i : range(2))
        {
            intersection[i][0] = std::max(b1[i][0], b2[i][0]);
            intersection[i][1] = std::min(b1[i][1], b2[i][1]);
        }

        // an unsorted intersection interval means the boxes do not overlap
        std::vector<std::array<float, 2>> bbox = {intersection.x, intersection.y};
        if(std::any_of(bbox.begin(), bbox.end(), [](auto bx) {
               return not std::is_sorted(bx.begin(), bx.end());
           }))
        {
            return false;
        }

        const float area1             = b1.area();
        const float area2             = b2.area();
        const float intersection_area = intersection.area();
        const float union_area        = area1 + area2 - intersection_area;

        if(area1 <= .0f or area2 <= .0f or union_area <= .0f)
        {
            return false;
        }

        const float intersection_over_union = intersection_area / union_area;

        return intersection_over_union > iou_threshold;
    }

    argument compute(const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        // pre-zero so unselected trailing rows are well defined
        result.visit([&](auto out) { std::fill(out.begin(), out.end(), 0); });

        // optional scalar inputs, defaulted when absent
        std::size_t max_output_boxes_per_class = 0;
        float iou_threshold                    = 0.0f;
        float score_threshold                  = 0.0f;

        if(args.size() > 2)
        {
            max_output_boxes_per_class = args.at(2).at<std::size_t>();
        }
        // max_output_boxes_per_class is 0, no output
        if(max_output_boxes_per_class == 0)
        {
            return result;
        }

        if(args.size() > 3)
        {
            iou_threshold = args.at(3).at<float>();
        }

        if(args.size() > 4)
        {
            score_threshold = args.at(4).at<float>();
        }

        const auto& lens = args.at(1).get_shape().lens();
        auto batch_num   = lens[0];
        auto class_num   = lens[1];
        auto box_num     = args.at(0).get_shape().lens()[1];

        std::vector<std::pair<float, int64_t>> selected_boxes_inside_class;
        std::vector<int64_t> selected_indices;
        selected_boxes_inside_class.reserve(output_shape.elements());

        auto scores        = make_view<float>(args.at(1).get_shape(), args.at(1).cast<float>());
        const float* boxes = args.at(0).cast<float>();

        // iterate every (batch, class) pair
        shape comp_s{shape::float_type, {batch_num, class_num}};
        shape_for_each(comp_s, [&](auto idx) {
            auto bidx = idx[0];
            auto cidx = idx[1];

            std::size_t score_offset = (bidx * class_num + cidx) * box_num;
            const float* batch_boxes = boxes + bidx * box_num * 4;
            std::priority_queue<std::pair<float, int64_t>> sorted_boxes;
            auto insert_to_sorted_boxes =
                make_function_output_iterator([&](const auto& x) { sorted_boxes.push(x); });

            // box_idx is advanced inside the predicate so the transform sees
            // the original index of each score.
            // NOTE(review): relies on transform_if evaluating the predicate
            // exactly once per element, before the transform -- confirm.
            int64_t box_idx = 0;
            transform_if(
                scores.begin() + score_offset,
                scores.begin() + score_offset + box_num,
                insert_to_sorted_boxes,
                [&](auto sc) {
                    box_idx++;
                    return sc >= score_threshold;
                },
                [&](auto sc) { return std::make_pair(sc, box_idx - 1); });

            selected_boxes_inside_class.clear();
            // Get the next box with top score, filter by iou_threshold
            while(!sorted_boxes.empty() &&
                  selected_boxes_inside_class.size() < max_output_boxes_per_class)
            {
                const std::pair<float, int64_t>& next_top_score = sorted_boxes.top();

                // Check with existing selected boxes for this class, suppress if exceed the IOU
                // (Intersection Over Union) threshold
                bool not_selected = std::any_of(
                    selected_boxes_inside_class.begin(),
                    selected_boxes_inside_class.end(),
                    [&](auto selected_index) {
                        return this->suppress_by_iou(batch_box(batch_boxes, next_top_score.second),
                                                     batch_box(batch_boxes, selected_index.second),
                                                     iou_threshold);
                    });

                if(not not_selected)
                {
                    selected_boxes_inside_class.push_back(next_top_score);
                    selected_indices.push_back(bidx);
                    selected_indices.push_back(cidx);
                    selected_indices.push_back(next_top_score.second);
                }
                sorted_boxes.pop();
            }
        });

        // flattened {batch, class, box} triples fill the output row-major
        result.visit([&](auto out) {
            std::copy(selected_indices.begin(), selected_indices.end(), out.begin());
        });

        return result;
    }
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_OPERATORS_NONZERO_HPP
#define MIGRAPHX_GUARD_OPERATORS_NONZERO_HPP
#include <migraphx/shape_for_each.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/config.hpp>
#include <migraphx/float_equal.hpp>
#include <migraphx/par_for.hpp>
#include <cmath>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
// Returns the multi-indices of every nonzero element of the input, laid out
// as an int64 tensor of shape {rank, elements}. Column i holds the index of
// the i-th nonzero element (row-major scan order); columns beyond the number
// of nonzero elements found at runtime are left as zeros.
struct nonzero
{
    std::string name() const { return "nonzero"; }

    shape compute_shape(std::vector<shape> inputs) const
    {
        check_shapes{inputs, *this}.has(1).standard();
        const auto& in = inputs[0];
        // output size is fixed at compile time: one column per input element
        std::vector<std::size_t> out_lens{in.lens().size(), in.elements()};
        return {shape::int64_type, out_lens};
    }

    argument compute(const shape& output_shape, std::vector<argument> args) const
    {
        // Gather multi-indices of nonzero elements in row-major order.
        const auto in_shape = args.front().get_shape();
        std::vector<std::vector<std::size_t>> nonzero_idx;
        args.front().visit([&](auto data) {
            shape_for_each(in_shape, [&](auto idx) {
                if(not float_equal(data[in_shape.index(idx)], 0))
                    nonzero_idx.push_back(idx);
            });
        });

        argument result{output_shape};
        result.visit([&](auto output) {
            // zero-fill first so unused trailing columns are well defined
            std::fill(output.begin(), output.end(), 0);
            // write column i from the i-th recorded multi-index
            par_for(nonzero_idx.size(), [&](auto i) {
                const auto& idx = nonzero_idx[i];
                for(std::size_t axis = 0; axis < idx.size(); ++axis)
                {
                    output[output_shape.index({axis, i})] = idx[axis];
                }
            });
        });

        return result;
    }
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_OPERATORS_OP_NORMALIZE_ATTRIBUTE_HPP
#define MIGRAPHX_GUARD_OPERATORS_OP_NORMALIZE_ATTRIBUTE_HPP
#include <migraphx/config.hpp>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
// different attributes
// 1) use_input(default)/use_output
// 2) use_rank(default)/use_len
// 3) clip_min(default)/not_clip_min
// 3.1) include_min(default)/exclude_min
// 4) clip_max(default)/not_clip_max
// 4.1) exclude_max(default)/include_max
// 5) normalize padding
// Flags placed in an operator's "normalize_axes" attribute array to select
// one side of each option pair documented above.
// NOTE(review): the "(default)" markers above and these enumerator names do
// not line up one-to-one (e.g. clip_min/include_min are listed as defaults
// yet appear as enumerators) -- confirm the intended semantics against the
// normalize_attributes implementation before relying on them.
enum class normalize_attribute
{
    // use dimension lengths rather than the rank as the normalization bound
    use_len,
    // normalize against the operator's output instead of its input
    use_output,
    clip_max,
    clip_min,
    include_max,
    include_min,
    // the attribute is a padding vector to be normalized
    normalize_padding
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
......@@ -2,7 +2,6 @@
#define MIGRAPHX_GUARD_OPERATORS_OUTLINE_HPP
#include <array>
#include <migraphx/operation.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/streamutils.hpp>
......
......@@ -2,7 +2,6 @@
#define MIGRAPHX_GUARD_OPERATORS_PAD_HPP
#include <array>
#include <migraphx/operation.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/streamutils.hpp>
......@@ -51,6 +50,12 @@ struct pad
return s;
}
std::size_t pad_ndims() const
{
assert(pads.size() % 2 == 0);
return pads.size() / 2;
}
bool symmetric() const
{
std::size_t num_dims = pads.size() / 2;
......
#ifndef MIGRAPHX_GUARD_OP_POINTWISE_HPP
#define MIGRAPHX_GUARD_OP_POINTWISE_HPP
#include <migraphx/config.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/module.hpp>
#include <migraphx/permutation.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/par_for.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
// Generic elementwise operator: evaluates its single submodule once per
// output element, feeding each submodule parameter a single-element view of
// the corresponding input argument.
struct pointwise
{
    std::string name() const { return "pointwise"; }

    // One input per submodule parameter (matched by sorted parameter name);
    // all inputs must share dimensions. Output takes the submodule's result
    // type; it is scalar if every input is scalar, otherwise its layout
    // follows the inputs' common permutation.
    shape compute_shape(const std::vector<shape>& inputs, std::vector<module_ref> mods) const
    {
        if(mods.size() != 1)
        {
            MIGRAPHX_THROW("should have one submodule.");
        }
        auto* pm    = mods.front();
        auto pnames = pm->get_parameter_names();
        std::sort(pnames.begin(), pnames.end());
        check_shapes{inputs, *this}.has(pnames.size()).same_dims();

        if(pm->get_output_shapes().size() != 1)
            MIGRAPHX_THROW("submodule should have only one output.");

        auto type = pm->get_output_shapes().front().type();

        // Scalar output if all inputs are scalar
        if(inputs.front().elements() == 1 and
           all_of(inputs, [](const auto& s) { return s.scalar(); }))
            return shape{type};
        return shape::from_permutation(type, inputs.front().lens(), find_permutation(inputs));
    }

    argument compute(const shape& output_shape,
                     const std::vector<argument>& args,
                     const std::vector<module_ref>& mods,
                     const std::function<std::vector<argument>(
                         module_ref&, const std::unordered_map<std::string, argument>&)>& run) const
    {
        argument output{output_shape};
        auto* pm    = mods.front();
        auto pnames = pm->get_parameter_names();
        // sorted to match the parameter-to-argument pairing in compute_shape
        std::sort(pnames.begin(), pnames.end());

        par_for(output_shape.elements(), [&](auto i) {
            // map each parameter name to a one-element view at position i
            std::unordered_map<std::string, argument> params;

            std::transform(
                pnames.begin(),
                pnames.end(),
                args.begin(),
                std::inserter(params, params.end()),
                [&](auto&& name, auto&& arg) { return std::make_pair(name, arg.element(i)); });

            auto results = run(pm, params);
            assert(results.size() == 1);
            visit_all(output, results.front())([&](auto out, auto x) { out[i] = x.front(); });
        });
        return output;
    }
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_OP_POINTWISE_HPP
......@@ -3,11 +3,13 @@
#include <array>
#include <migraphx/op/common.hpp>
#include <migraphx/operation.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/streamutils.hpp>
#include <migraphx/functional.hpp>
#include <migraphx/literal.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/value.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/int_divide.hpp>
#include <migraphx/config.hpp>
......@@ -15,54 +17,188 @@
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
struct pooling
{
std::string mode = "average";
std::array<std::size_t, 2> padding = {{0, 0}};
std::array<std::size_t, 2> stride = {{1, 1}};
std::array<std::size_t, 2> lengths = {{1, 1}};
padding_mode_t padding_mode = default_;
pooling_mode mode = {pooling_mode::average};
std::vector<std::size_t> padding = {0, 0};
std::vector<std::size_t> stride = {1, 1};
std::vector<std::size_t> lengths = {1, 1};
bool ceil_mode = false;
int lp_order = 2;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.mode, "mode"),
f(self.padding, "padding"),
f(self.padding_mode, "padding_mode"),
f(self.stride, "stride"),
f(self.lengths, "lengths"));
f(self.lengths, "lengths"),
f(self.ceil_mode, "ceil_mode"),
f(self.lp_order, "lp_order"));
}
std::string name() const { return "pooling"; }
shape compute_shape(std::vector<shape> inputs) const
void check_attribute_size() const
{
if(not((padding.size() == stride.size() or (padding.size() / 2) == stride.size()) and
stride.size() == lengths.size()))
{
MIGRAPHX_THROW("POOLING: inconsistent attribute sizes");
}
}
value attributes() const { return {{"normalize_padding", "padding"}}; }
shape normalize_compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(1).only_dims(4);
check_shapes{inputs, *this}.has(1);
const shape& input = inputs.at(0);
auto t = input.type();
assert(lengths[0] <= (input.lens()[2] + 2 * padding[0]));
assert(lengths[1] <= (input.lens()[3] + 2 * padding[1]));
auto input_lens = input.lens();
size_t kdims = input_lens.size() - 2;
auto input_size = inputs[0].lens().size();
auto padding_size = padding.size();
if(not(input_size == padding_size / 2 + 2 or input_size == padding_size + 2))
{
MIGRAPHX_THROW("POOLING: input and attribute size mismatch!");
}
std::vector<std::size_t> output_lens(input_lens.begin(), input_lens.begin() + 2);
for(size_t i = 0; i < kdims; i++)
{
std::ptrdiff_t dim_size;
auto padding_factor = 2 * padding[i];
if(padding_size == 2 * kdims)
padding_factor = padding[i] + padding[i + kdims];
dim_size = input_lens[i + 2] + padding_factor - lengths[i];
assert(dim_size >= 0);
std::size_t len = (ceil_mode) ? ceil_divide<std::ptrdiff_t>(dim_size, stride[i])
: floor_divide<std::ptrdiff_t>(dim_size, stride[i]);
output_lens.push_back(std::size_t(std::max<std::ptrdiff_t>(1, len + 1)));
}
return inputs[0].with_lens(output_lens);
}
size_t kdims() const
{
check_attribute_size();
return stride.size();
}
struct lpnorm_pool
{
int p = 0;
lpnorm_pool() = delete;
explicit lpnorm_pool(int x) : p{x} {};
template <class T>
double init() const
{
return 0.0;
}
return {t,
double operator()(double x, double y) const { return x + std::pow(std::abs(y), p); }
double final(double x, std::size_t) const { return std::pow(x, 1. / p); }
};
struct avg_pool
{
template <class T>
double init() const
{
return 0.0;
}
double operator()(double x, double y) const { return x + y; }
double final(double x, std::size_t y) const { return (y == 0) ? 0.0 : (x / y); }
};
struct max_pool
{
template <class T>
T init() const
{
return std::numeric_limits<T>::lowest();
}
double operator()(double x, double y) const { return std::max(x, y); }
double final(double x, std::size_t) const { return (x); }
};
template <class Type, class Out, class In, class Op>
void calc_pooling(const shape& output_shape, Out& output, const In& input, Op op) const
{
auto in_s = input.get_shape();
auto in_lens = in_s.lens();
par_for(output_shape.elements(), [&](auto i) {
auto idx_o = output_shape.multi(i);
auto n_dim = idx_o.size();
std::vector<std::size_t> win_start;
std::vector<std::size_t> win_size;
for(std::size_t dim = 2; dim < n_dim; ++dim)
{
auto d_2 = dim - 2;
int start =
static_cast<int>(idx_o[dim] * stride[d_2]) - static_cast<int>(padding[d_2]);
int end = std::min(start + lengths[d_2], in_lens[dim]);
start = std::max(start, 0);
win_start.push_back(start);
win_size.push_back(end - start);
}
shape win_shape{output_shape.type(), win_size};
auto pool_size = win_shape.elements();
double output_val = op.template init<Type>();
shape_for_each(win_shape, [&](auto idx_w) {
auto idx = idx_o;
std::transform(idx_w.begin(),
idx_w.end(),
win_start.begin(),
idx.begin() + 2,
[](auto ii, auto jj) { return ii + jj; });
if(std::all_of(idx.begin() + 2, idx.end(), [&](auto ii) { return ii >= 0; }) and
idx < in_lens)
{
input.lens()[0],
input.lens()[1],
std::size_t(std::max<std::ptrdiff_t>(
1,
floor_divide<std::ptrdiff_t>(input.lens()[2] + 2 * padding[0] - lengths[0],
stride[0]) +
1)),
std::size_t(std::max<std::ptrdiff_t>(
1,
floor_divide<std::ptrdiff_t>(input.lens()[3] + 2 * padding[1] - lengths[1],
stride[1]) +
1)),
}};
output_val = op(output_val, input[in_s.index(idx)]);
}
});
output[i] = Type(op.final(output_val, pool_size));
});
}
argument compute(const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0])([&](auto output, auto input) {
using type = typename decltype(output)::value_type;
switch(mode)
{
case migraphx::op::pooling_mode::average:
calc_pooling<type>(output_shape, output, input, avg_pool{});
break;
case migraphx::op::pooling_mode::max:
calc_pooling<type>(output_shape, output, input, max_pool{});
break;
case migraphx::op::pooling_mode::lpnorm:
calc_pooling<type>(output_shape, output, input, lpnorm_pool{lp_order});
break;
}
});
return result;
}
};
......
#ifndef MIGRAPHX_GUARD_OPERATORS_SCAN_OP_HPP
#define MIGRAPHX_GUARD_OPERATORS_SCAN_OP_HPP
#include <migraphx/op/name.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/config.hpp>
#include <migraphx/value.hpp>
#include <migraphx/op/normalize_attribute.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
// CRTP base for cumulative-scan operators along one axis. The derived class
// supplies op() (the binary accumulation) and its name via op_name.
template <class Derived>
struct prefix_scan_op : op_name<Derived>
{
    // Axis to scan along; negative values are normalized (include_min below).
    int64_t axis;
    // When true, element i excludes input[i] from its own accumulation.
    bool exclusive = false;
    // When true, the scan runs from the end of the axis toward the start.
    bool reverse = false;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return pack(
            f(self.axis, "axis"), f(self.exclusive, "exclusive"), f(self.reverse, "reverse"));
    }

    value attributes() const
    {
        value normalize;
        normalize["axis"] = value::array{normalize_attribute::include_min};
        return {{"normalize_axes", normalize}};
    }

    // Output has the input's lengths; broadcast inputs lose their broadcast
    // strides, otherwise with_lens preserves the input's layout.
    shape normalize_compute_shape(std::vector<shape> inputs) const
    {
        check_shapes{inputs, *this}.has(1);
        auto s = inputs.front();
        if(s.broadcasted())
        {
            return {s.type(), s.lens()};
        }
        else
        {
            return s.with_lens(s.lens());
        }
    }

    argument compute(const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        auto s = args[0].get_shape();
        // Materialize the input into the output layout first; the scan then
        // runs in place on `result`.
        if(s == output_shape)
        {
            result = args[0].copy();
        }
        else
        {
            visit_all(result, args[0])([&](auto output, auto input) {
                par_for(output_shape.elements(),
                        [&](auto i) { output[output_shape.index(i)] = input[s.index(i)]; });
            });
            s = output_shape;
        }

        // `slice` views one 1-D run along the scan axis; `batch` enumerates
        // the start position of every such run (axis length collapsed to 1).
        auto slice   = shape{s.type(), {s.lens()[axis]}, {s.strides()[axis]}};
        auto lens    = s.lens();
        lens[axis]   = 1;
        auto batch   = shape{s.type(), lens, s.strides()};
        auto& self   = static_cast<const Derived&>(*this);
        result.visit([&](auto output) {
            using type = decltype(output);
            par_for(batch.elements(), [&](auto i) {
                auto* start = output.data() + batch.index(i);
                // strided in-place view over this run
                type x{slice, start};
                if(reverse)
                {
                    if(exclusive)
                    {
                        // shift left and zero-seed the last element so each
                        // position excludes its own input value
                        std::copy(++x.begin(), x.end(), x.begin());
                        x.back() = 0;
                    }
                    // scan back-to-front via reverse iterators
                    std::partial_sum(std::make_reverse_iterator(x.end()),
                                     std::make_reverse_iterator(x.begin()),
                                     std::make_reverse_iterator(x.end()),
                                     self.op());
                }
                else
                {
                    if(exclusive)
                    {
                        // shift right and zero-seed the first element
                        std::copy_backward(x.begin(), --x.end(), x.end());
                        x.front() = 0;
                    }
                    std::partial_sum(x.begin(), x.end(), x.begin(), self.op());
                }
            });
        });

        return result;
    }

    // No-op hook; derived classes may rely on default accumulation seeding.
    auto init() const {}

    prefix_scan_op() : axis(0) {}
    // NOTE(review): single-argument constructor is implicit -- confirm that
    // int64_t -> prefix_scan_op conversion is intentional.
    prefix_scan_op(int64_t ax) : axis(ax) {}
    prefix_scan_op(int64_t ax, bool excl) : axis(ax), exclusive(excl) {}
    prefix_scan_op(int64_t ax, bool excl, bool rev) : axis(ax), exclusive(excl), reverse(rev) {}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment