".github/vscode:/vscode.git/clone" did not exist on "11421a3f44b95f706be0ba10016d0c2863cc0c4a"
Commit 7e297b13 authored by Paul

Merge

parents 86ea5e91 aa7ff911
@@ -38,7 +38,7 @@ struct gather
shape normalize_compute_shape(std::vector<shape> inputs) const
{
-check_shapes{inputs, *this}.has(2).standard();
+check_shapes{inputs, *this}.has(2);
auto lens = inputs[0].lens();
auto type = inputs[0].type();
lens.erase(lens.begin() + axis);
......
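The hunk above only shows the axis dimension being erased; under the usual ONNX gather rule the indices dimensions are then spliced in at the same position. A minimal standalone sketch of that rule, assuming non-scalar indices and using plain std::vector in place of MIGraphX shapes:

#include <cstddef>
#include <iostream>
#include <vector>

std::vector<std::size_t> gather_out_lens(std::vector<std::size_t> data_lens,
                                         const std::vector<std::size_t>& idx_lens,
                                         std::size_t axis)
{
    // Replace the data dimension at `axis` with the full indices shape.
    data_lens.erase(data_lens.begin() + axis);
    data_lens.insert(data_lens.begin() + axis, idx_lens.begin(), idx_lens.end());
    return data_lens;
}

int main()
{
    // data {3, 4, 5}, indices {2, 2}, axis = 1 -> {3, 2, 2, 5}
    for(auto d : gather_out_lens({3, 4, 5}, {2, 2}, 1))
        std::cout << d << ' ';
}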
#ifndef MIGRAPHX_GUARD_OPERATORS_GATHERND_HPP
#define MIGRAPHX_GUARD_OPERATORS_GATHERND_HPP
#include <migraphx/check_shapes.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/par_for.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
struct gathernd
{
int batch_dims = 0;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.batch_dims, "batch_dims"));
}
std::string name() const { return "gathernd"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(2);
auto r = inputs.front().lens().size();
auto q = inputs.back().lens().size();
auto k = inputs.back().lens().back();
if(k > r - batch_dims)
{
MIGRAPHX_THROW("GATHERND: Indices of length " + std::to_string(k) +
" cannot be used to access data of rank " +
std::to_string(r - batch_dims));
}
auto indices_lens_iter = inputs.back().lens().begin();
auto output_lens_size = q + r - k - batch_dims - 1;
std::vector<std::size_t> output_lens(output_lens_size);
std::copy(indices_lens_iter, indices_lens_iter + (q - 1), output_lens.begin());
if(k < r - batch_dims)
{
auto data_lens = inputs.front().lens();
std::copy(
data_lens.begin() + batch_dims + k, data_lens.end(), output_lens.begin() + q - 1);
}
shape output_shape{inputs.front().type(), output_lens};
return output_shape;
}
argument compute(const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0])([&](auto output, auto data) {
args[1].visit([&](auto indices) {
auto indices_shape = indices.get_shape();
auto indices_shape_lens = indices_shape.lens();
auto data_shape = data.get_shape();
auto data_shape_lens = data_shape.lens();
auto k = indices_shape.lens().back();
const auto num_slice_dims = k;
std::size_t num_slices = std::accumulate(indices_shape_lens.begin(),
indices_shape_lens.end() - 1,
1,
std::multiplies<std::size_t>());
std::size_t slice_size = std::accumulate(data_shape_lens.begin() + k + batch_dims,
data_shape_lens.end(),
1,
std::multiplies<std::size_t>());
std::size_t num_batches = std::accumulate(data_shape_lens.begin(),
data_shape_lens.begin() + batch_dims,
1,
std::multiplies<std::size_t>());
std::size_t data_batch_stride =
std::accumulate(data_shape_lens.begin() + batch_dims,
data_shape_lens.end(),
1,
std::multiplies<std::size_t>());
auto num_slices_per_batch = num_slices / num_batches;
std::vector<std::size_t> sizes_from_slice_dims(num_slice_dims);
{
auto running_product = slice_size;
for(std::size_t i = 0; i < num_slice_dims; ++i)
{
sizes_from_slice_dims[num_slice_dims - 1 - i] = running_product;
running_product *= data_shape_lens[batch_dims + num_slice_dims - 1 - i];
}
}
std::vector<std::size_t> input_slice_offsets(num_slices);
par_for(num_slices, [&](const auto i) {
std::size_t batch_idx = i / num_slices_per_batch;
auto slice_indices = indices.begin() + (i * num_slice_dims);
std::size_t relative_slice_offset = 0;
for(size_t dim_idx = 0; dim_idx < num_slice_dims; ++dim_idx)
{
int64_t index = *(slice_indices + dim_idx);
const std::size_t input_dim_idx = batch_dims + dim_idx;
const auto input_dim = data_shape_lens[input_dim_idx];
if(index < -static_cast<int64_t>(input_dim) or
index >= static_cast<int64_t>(input_dim))
MIGRAPHX_THROW("GatherND: index " + std::to_string(index) +
" is out of bounds for dim of len " +
std::to_string(input_dim));
if(index < 0)
index += input_dim;
relative_slice_offset += index * sizes_from_slice_dims[dim_idx];
}
input_slice_offsets[i] =
(batch_idx * data_batch_stride) + relative_slice_offset;
});
par_for(num_slices * slice_size, [&](const auto i) {
auto slice_offset = input_slice_offsets[i / slice_size];
output[i] = data[slice_offset + i % slice_size];
});
});
});
return result;
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
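The slice-offset arithmetic in compute above is easier to see on a concrete case. A minimal standalone sketch with batch_dims = 0 and made-up data, using plain vectors rather than MIGraphX arguments:

#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
    std::vector<float> data = {0, 1, 2, 3, 4, 5};  // shape {3, 2}, so r = 2
    std::vector<std::size_t> data_lens = {3, 2};
    std::vector<long> indices = {2, 1, 0, 0};      // shape {2, 2}, so k = 2
    std::size_t k = 2;
    // slice_size: product of the data dims past the first k (here empty -> 1)
    std::size_t slice_size = std::accumulate(
        data_lens.begin() + k, data_lens.end(), std::size_t{1}, std::multiplies<>{});
    // sizes_from_slice_dims: stride of each indexed dim, built back to front
    std::vector<std::size_t> strides(k);
    std::size_t running = slice_size;
    for(std::size_t i = 0; i < k; ++i)
    {
        strides[k - 1 - i] = running;
        running *= data_lens[k - 1 - i];
    }
    for(std::size_t s = 0; s < indices.size() / k; ++s)
    {
        std::size_t offset = 0;
        for(std::size_t d = 0; d < k; ++d)
            offset += indices[s * k + d] * strides[d];
        std::cout << data[offset] << '\n';         // prints 5 then 0
    }
}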
@@ -45,6 +45,8 @@ struct get_tuple_elem
assert(index < vec_args.size());
return vec_args.at(index);
}
+std::ptrdiff_t output_alias(const std::vector<shape>&) const { return 0; }
};
} // namespace op
......
@@ -35,7 +35,7 @@ struct if_op
MIGRAPHX_THROW("IF: output shapes of submodules must be the same.");
}
-return shape(out_shapes0);
+return {out_shapes0};
}
argument compute(const shape&,
......
#ifndef MIGRAPHX_GUARD_OPERATORS_ISNAN_HPP
#define MIGRAPHX_GUARD_OPERATORS_ISNAN_HPP
#include <migraphx/op/unary.hpp>
#include <migraphx/config.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
struct isnan : unary<isnan>
{
auto apply() const
{
return [](auto x) { return std::isnan(x); };
}
std::string name() const { return "isnan"; }
shape compute_shape(std::vector<shape> inputs) const
{
return unary<isnan>::compute_shape(std::move(inputs)).with_type(shape::bool_type);
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
@@ -35,7 +35,7 @@ struct load
{
if((offset + s.bytes()) > args[0].get_shape().bytes())
MIGRAPHX_THROW("Load access is out of bounds");
-return argument::load(s, args[0].data() + offset);
+return argument{s, args[0].data() + offset};
}
lifetime get_lifetime() const { return lifetime::borrow; }
std::ptrdiff_t output_alias(const std::vector<shape>&) const { return 0; }
......
#ifndef MIGRAPHX_GUARD_OPERATORS_LOOP_HPP
#define MIGRAPHX_GUARD_OPERATORS_LOOP_HPP
#include <migraphx/check_shapes.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/functional.hpp>
#include <migraphx/config.hpp>
#include <migraphx/module.hpp>
#include <migraphx/run_loop.hpp>
#include <migraphx/ranges.hpp>
#include <cmath>
#include <string>
#include <utility>
#include <set>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
struct loop
{
int64_t max_iterations = 10;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.max_iterations, "max_iterations"));
}
std::string name() const { return "loop"; }
shape compute_shape(const std::vector<shape>& inputs, std::vector<module_ref> mods) const
{
check_shapes{inputs, *this}.standard();
if(mods.size() != 1)
{
MIGRAPHX_THROW("LOOP: operator should have one submodule.");
}
const auto& mod = mods.front();
auto mod_out_shapes = mod->get_output_shapes();
auto dep_param_num = inputs.size() - 2;
// first item of the mod output shapes is condition used in loop,
// which is not needed to compute output shape
mod_out_shapes.erase(mod_out_shapes.begin());
std::vector<shape> ins_out_shapes(mod_out_shapes.begin(),
mod_out_shapes.begin() + dep_param_num);
mod_out_shapes.erase(mod_out_shapes.begin(), mod_out_shapes.begin() + dep_param_num);
for(const auto& out_s : mod_out_shapes)
{
auto lens = out_s.lens();
lens.insert(lens.begin(), max_iterations);
ins_out_shapes.push_back({out_s.type(), lens});
}
return {ins_out_shapes};
}
struct ref_loop
{
int64_t max_iterations = 0;
template <class T>
void copy(context&, const argument& src, T& dst) const
{
dst = *src.cast<T>();
}
template <class T>
void copy(context&, T src, const argument& dst) const
{
*dst.cast<T>() = src;
}
void append(const std::vector<argument>& iter_state,
const std::vector<argument>& concatenated_outputs,
int iter) const
{
assert(iter_state.size() == concatenated_outputs.size());
for(auto i : range(iter_state.size()))
{
const auto& iter_stat = iter_state.at(i);
const auto& scan_out = concatenated_outputs.at(i);
auto* in_data = iter_stat.data();
auto* out_data = scan_out.data();
std::size_t out_size = iter_stat.get_shape().bytes();
assert((iter + 1) * out_size <= scan_out.get_shape().bytes());
std::copy(in_data, in_data + out_size, out_data + iter * out_size);
}
}
void set_zero(context&, const std::vector<argument>& concatenated_outputs, int iter) const
{
if(iter >= max_iterations)
return;
for(const auto& out : concatenated_outputs)
{
auto s = out.get_shape();
auto size = s.bytes() / max_iterations;
std::fill(out.data() + iter * size, out.data() + max_iterations * size, 0);
}
}
std::unordered_map<std::string, int> get_output_params(const module&) const { return {}; }
};
argument compute(context& ctx,
const shape& out_shape,
const std::vector<argument>& args,
const std::vector<module_ref>& mods,
const std::function<std::vector<argument>(
module_ref&, const std::unordered_map<std::string, argument>&)>& run) const
{
// wrap up the arguments vector, so ref and gpu impl are the same
auto cpy_args = args;
bool in_cond = args.at(1).at<bool>();
bool cond = in_cond;
int64_t iter = 0;
// insert iter and cond used in the loop
auto s_cond = args.at(1).get_shape();
auto s_iter = args.at(0).get_shape();
cpy_args.push_back({s_iter, &iter});
cpy_args.push_back({s_cond, &cond});
cpy_args.insert(cpy_args.end(), args.begin() + 2, args.end());
// add cond and mod outputs to the argument list
cpy_args.push_back(argument(s_cond));
cpy_args.push_back(argument(out_shape));
// run loop
return run_loop(ref_loop{max_iterations}, ctx, cpy_args, mods, run);
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
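compute_shape above splits the submodule outputs into carried dependencies, whose shapes pass through unchanged, and scan outputs, which gain a leading max_iterations dimension. A standalone sketch of that rule on made-up dimensions:

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    std::size_t max_iterations = 10;
    // Submodule outputs after dropping the leading condition: two carried
    // dependencies followed by one scan output (dims only, for illustration).
    std::vector<std::vector<std::size_t>> mod_out = {{2, 2}, {4}, {3, 5}};
    std::size_t dep_param_num = 2;
    for(std::size_t i = 0; i < mod_out.size(); ++i)
    {
        auto lens = mod_out[i];
        if(i >= dep_param_num)                       // scan output
            lens.insert(lens.begin(), max_iterations);
        for(auto d : lens)
            std::cout << d << ' ';                   // {2 2} {4} {10 3 5}
        std::cout << '\n';
    }
}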
@@ -23,7 +23,7 @@ struct multibroadcast
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.output_lens, "output_lens"));
return pack(f(self.output_lens, "out_lens"));
}
std::string name() const { return "multibroadcast"; }
@@ -69,7 +69,6 @@ struct multibroadcast
{
return args[0].reshape(output_shape);
}
-lifetime get_lifetime() const { return lifetime::borrow; }
std::ptrdiff_t output_alias(const std::vector<shape>&) const { return 0; }
};
......
#ifndef MIGRAPHX_GUARD_OPERATORS_MULTINOMIAL_HPP
#define MIGRAPHX_GUARD_OPERATORS_MULTINOMIAL_HPP
#include <migraphx/operation.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/par_for.hpp>
#include <random>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
struct multinomial
{
shape::type_t dtype = shape::type_t::int32_type;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.dtype, "dtype"));
}
std::string name() const { return "multinomial"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(2).only_dims(2);
size_t sample_size = inputs.back().lens().back();
if(not contains({shape::int32_type, shape::int64_type}, dtype))
MIGRAPHX_THROW(
"Multinomial: Invalid output type. Valid types are int32_type and int64_type.");
return {dtype, {inputs.front().lens().front(), sample_size}};
}
argument compute(const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
size_t batch_size = output_shape.lens().front();
size_t class_size = args[0].get_shape().lens().back();
size_t sample_size = output_shape.lens().back();
visit_all(args[0], args[1])([&](auto cdf, auto dist) {
result.visit([&](auto output) {
par_for(batch_size * sample_size, [&](auto i) {
auto idx = args[1].get_shape().multi(i);
auto cdf_begin = cdf.begin() + (idx[0] * class_size);
auto cdf_end = cdf_begin + class_size;
auto sample_iter =
std::upper_bound(cdf_begin, cdf_end, dist[i] * *(std::prev(cdf_end)));
output[i] = std::distance(cdf_begin, sample_iter);
});
});
});
return result;
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
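The sampling loop above places each uniform draw into the per-batch running CDF with std::upper_bound; because the draw is scaled by the last CDF entry, the CDF need not be normalized. A minimal standalone sketch with made-up weights and draws:

#include <algorithm>
#include <iostream>
#include <iterator>
#include <vector>

int main()
{
    // Unnormalized CDF of class weights {1, 3, 6} -> {1, 4, 10}
    std::vector<float> cdf = {1.0f, 4.0f, 10.0f};
    for(float u : {0.05f, 0.25f, 0.95f})  // stand-ins for uniform draws
    {
        // Scale the draw by the total mass, then find the owning class.
        auto it = std::upper_bound(cdf.begin(), cdf.end(), u * cdf.back());
        std::cout << std::distance(cdf.begin(), it) << '\n';  // 0, 1, 2
    }
}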
#ifndef MIGRAPHX_GUARD_OPERATORS_NONMAXSUPPRESSION_HPP
#define MIGRAPHX_GUARD_OPERATORS_NONMAXSUPPRESSION_HPP
#include <cmath>
#include <queue>
#include <cstdint>
#include <iterator>
#include <migraphx/config.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/float_equal.hpp>
#include <migraphx/algorithm.hpp>
#include <migraphx/tensor_view.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/output_iterator.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
struct nonmaxsuppression
{
bool center_point_box = false;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.center_point_box, "center_point_box"));
}
std::string name() const { return "nonmaxsuppression"; }
shape compute_shape(std::vector<shape> inputs) const
{
// requires at least 2 inputs
check_shapes{inputs, *this}.standard();
check_shapes{{inputs.at(0), inputs.at(1)}, *this}.only_dims(3);
auto lens = inputs.front().lens();
// check input shape
if(lens[1] != inputs.at(1).lens()[2])
{
MIGRAPHX_THROW("NonMaxSuppression: dimension mismatch between first and second input!");
}
std::vector<int64_t> out_lens(2);
out_lens.at(0) = lens.at(1);
out_lens.at(1) = 3;
return {shape::int64_type, out_lens};
}
struct box
{
std::array<float, 2> x;
std::array<float, 2> y;
void sort()
{
std::sort(x.begin(), x.end());
std::sort(y.begin(), y.end());
}
std::array<float, 2>& operator[](std::size_t i) { return i == 0 ? x : y; }
float area() const
{
assert(std::is_sorted(x.begin(), x.end()));
assert(std::is_sorted(y.begin(), y.end()));
return (x[1] - x[0]) * (y[1] - y[0]);
}
};
template <class T>
box batch_box(const T* boxes, std::size_t bidx) const
{
box result{};
const T* start = boxes + 4 * bidx;
if(center_point_box)
{
float half_width = start[2] / 2.0f;
float half_height = start[3] / 2.0f;
float x_center = start[0];
float y_center = start[1];
result.x = {x_center - half_width, x_center + half_width};
result.y = {y_center - half_height, y_center + half_height};
}
else
{
result.x = {start[1], start[3]};
result.y = {start[0], start[2]};
}
return result;
}
inline bool suppress_by_iou(box b1, box b2, float iou_threshold) const
{
b1.sort();
b2.sort();
box intersection{};
for(auto i : range(2))
{
intersection[i][0] = std::max(b1[i][0], b2[i][0]);
intersection[i][1] = std::min(b1[i][1], b2[i][1]);
}
std::vector<std::array<float, 2>> bbox = {intersection.x, intersection.y};
if(std::any_of(bbox.begin(), bbox.end(), [](auto bx) {
return not std::is_sorted(bx.begin(), bx.end());
}))
{
return false;
}
const float area1 = b1.area();
const float area2 = b2.area();
const float intersection_area = intersection.area();
const float union_area = area1 + area2 - intersection_area;
if(area1 <= .0f or area2 <= .0f or union_area <= .0f)
{
return false;
}
const float intersection_over_union = intersection_area / union_area;
return intersection_over_union > iou_threshold;
}
argument compute(const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
result.visit([&](auto out) { std::fill(out.begin(), out.end(), 0); });
std::size_t max_output_boxes_per_class = 0;
float iou_threshold = 0.0f;
float score_threshold = 0.0f;
if(args.size() > 2)
{
max_output_boxes_per_class = args.at(2).at<std::size_t>();
}
// max_output_boxes_per_class is 0, no output
if(max_output_boxes_per_class == 0)
{
return result;
}
if(args.size() > 3)
{
iou_threshold = args.at(3).at<float>();
}
if(args.size() > 4)
{
score_threshold = args.at(4).at<float>();
}
const auto& lens = args.at(1).get_shape().lens();
auto batch_num = lens[0];
auto class_num = lens[1];
auto box_num = args.at(0).get_shape().lens()[1];
std::vector<std::pair<float, int64_t>> selected_boxes_inside_class;
std::vector<int64_t> selected_indices;
selected_boxes_inside_class.reserve(output_shape.elements());
auto scores = make_view<float>(args.at(1).get_shape(), args.at(1).cast<float>());
const float* boxes = args.at(0).cast<float>();
shape comp_s{shape::float_type, {batch_num, class_num}};
shape_for_each(comp_s, [&](auto idx) {
auto bidx = idx[0];
auto cidx = idx[1];
std::size_t score_offset = (bidx * class_num + cidx) * box_num;
const float* batch_boxes = boxes + bidx * box_num * 4;
std::priority_queue<std::pair<float, int64_t>> sorted_boxes;
auto insert_to_sorted_boxes =
make_function_output_iterator([&](const auto& x) { sorted_boxes.push(x); });
int64_t box_idx = 0;
transform_if(
scores.begin() + score_offset,
scores.begin() + score_offset + box_num,
insert_to_sorted_boxes,
[&](auto sc) {
box_idx++;
return sc >= score_threshold;
},
[&](auto sc) { return std::make_pair(sc, box_idx - 1); });
selected_boxes_inside_class.clear();
// Get the next box with top score, filter by iou_threshold
while(!sorted_boxes.empty() &&
selected_boxes_inside_class.size() < max_output_boxes_per_class)
{
const std::pair<float, int64_t>& next_top_score = sorted_boxes.top();
// Check with existing selected boxes for this class, suppress if exceed the IOU
// (Intersection Over Union) threshold
bool not_selected = std::any_of(
selected_boxes_inside_class.begin(),
selected_boxes_inside_class.end(),
[&](auto selected_index) {
return this->suppress_by_iou(batch_box(batch_boxes, next_top_score.second),
batch_box(batch_boxes, selected_index.second),
iou_threshold);
});
if(not not_selected)
{
selected_boxes_inside_class.push_back(next_top_score);
selected_indices.push_back(bidx);
selected_indices.push_back(cidx);
selected_indices.push_back(next_top_score.second);
}
sorted_boxes.pop();
}
});
result.visit([&](auto out) {
std::copy(selected_indices.begin(), selected_indices.end(), out.begin());
});
return result;
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
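suppress_by_iou above reduces to the usual intersection-over-union test once both boxes are corner-sorted. A standalone sketch for two axis-aligned boxes with made-up coordinates (already sorted, so the degenerate-area early-outs are collapsed into one check):

#include <algorithm>
#include <iostream>

struct box { float x0, x1, y0, y1; };

float iou(const box& a, const box& b)
{
    // Intersection corners: max of the lows, min of the highs.
    float ix0 = std::max(a.x0, b.x0), ix1 = std::min(a.x1, b.x1);
    float iy0 = std::max(a.y0, b.y0), iy1 = std::min(a.y1, b.y1);
    if(ix1 <= ix0 or iy1 <= iy0)
        return 0.0f;                       // no overlap
    float inter = (ix1 - ix0) * (iy1 - iy0);
    float uni = (a.x1 - a.x0) * (a.y1 - a.y0) + (b.x1 - b.x0) * (b.y1 - b.y0) - inter;
    return inter / uni;
}

int main()
{
    box a{0, 2, 0, 2};
    box b{1, 3, 1, 3};
    std::cout << iou(a, b) << '\n';        // 1 / 7 ~= 0.143
}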
#ifndef MIGRAPHX_GUARD_OPERATORS_NONZERO_HPP
#define MIGRAPHX_GUARD_OPERATORS_NONZERO_HPP
#include <migraphx/shape_for_each.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/config.hpp>
#include <migraphx/float_equal.hpp>
#include <migraphx/par_for.hpp>
#include <cmath>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
struct nonzero
{
std::string name() const { return "nonzero"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(1).standard();
auto elem_num = inputs[0].elements();
auto dim_num = inputs[0].lens().size();
std::vector<std::size_t> out_lens = {dim_num, elem_num};
return {shape::int64_type, out_lens};
}
argument compute(const shape& output_shape, std::vector<argument> args) const
{
std::vector<std::vector<std::size_t>> vec_idx;
auto s = args.front().get_shape();
args.front().visit([&](auto v) {
shape_for_each(s, [&](auto idx) {
if(not float_equal(v[s.index(idx)], 0))
{
vec_idx.push_back(idx);
}
});
});
argument result{output_shape};
result.visit([&](auto output) {
std::fill(output.begin(), output.end(), 0);
par_for(vec_idx.size(), [&](auto i) {
for(std::size_t j = 0; j < vec_idx.front().size(); ++j)
{
output[output_shape.index({j, i})] = vec_idx[i][j];
}
});
});
return result;
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
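The output above is laid out as {rank, elements}: column i holds the multi-index of the i-th nonzero element in row-major order, and unused columns stay zero. A standalone sketch on a made-up 2x2 input:

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    std::vector<int> in = {1, 0, 0, 2};              // shape {2, 2}
    std::vector<std::size_t> lens = {2, 2};
    std::vector<std::vector<std::size_t>> idx;
    for(std::size_t i = 0; i < in.size(); ++i)
        if(in[i] != 0)
            idx.push_back({i / lens[1], i % lens[1]});
    // Output {2, 4}: row j holds the dimension-j coordinates, zero-padded.
    std::vector<std::vector<std::size_t>> out(2, std::vector<std::size_t>(in.size(), 0));
    for(std::size_t i = 0; i < idx.size(); ++i)
        for(std::size_t j = 0; j < 2; ++j)
            out[j][i] = idx[i][j];
    for(auto& row : out)
    {
        for(auto v : row)
            std::cout << v << ' ';                   // 0 1 0 0 (both rows)
        std::cout << '\n';
    }
}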
#ifndef MIGRAPHX_GUARD_OP_POINTWISE_HPP
#define MIGRAPHX_GUARD_OP_POINTWISE_HPP
#include <migraphx/config.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/module.hpp>
#include <migraphx/permutation.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/par_for.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
struct pointwise
{
std::string name() const { return "pointwise"; }
shape compute_shape(const std::vector<shape>& inputs, std::vector<module_ref> mods) const
{
if(mods.size() != 1)
{
MIGRAPHX_THROW("should have one submodule.");
}
auto* pm = mods.front();
auto pnames = pm->get_parameter_names();
std::sort(pnames.begin(), pnames.end());
check_shapes{inputs, *this}.has(pnames.size()).same_dims();
if(pm->get_output_shapes().size() != 1)
MIGRAPHX_THROW("submodule should have only one output.");
auto type = pm->get_output_shapes().front().type();
// Scalar output if all inputs are scalar
if(inputs.front().elements() == 1 and
all_of(inputs, [](const auto& s) { return s.scalar(); }))
return shape{type};
return shape::from_permutation(type, inputs.front().lens(), find_permutation(inputs));
}
argument compute(const shape& output_shape,
const std::vector<argument>& args,
const std::vector<module_ref>& mods,
const std::function<std::vector<argument>(
module_ref&, const std::unordered_map<std::string, argument>&)>& run) const
{
argument output{output_shape};
auto* pm = mods.front();
auto pnames = pm->get_parameter_names();
std::sort(pnames.begin(), pnames.end());
par_for(output_shape.elements(), [&](auto i) {
std::unordered_map<std::string, argument> params;
std::transform(
pnames.begin(),
pnames.end(),
args.begin(),
std::inserter(params, params.end()),
[&](auto&& name, auto&& arg) { return std::make_pair(name, arg.element(i)); });
auto results = run(pm, params);
assert(results.size() == 1);
visit_all(output, results.front())([&](auto out, auto x) { out[i] = x.front(); });
});
return output;
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_OP_POINTWISE_HPP
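compute above evaluates the submodule once per output element, binding the i-th element of each input to the correspondingly named parameter. A standalone sketch with a plain lambda standing in for the submodule:

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    std::vector<float> x = {1, 2, 3}, y = {10, 20, 30}, out(3);
    auto submodule = [](float a, float b) { return a + b; };  // stand-in
    // One scalar evaluation per output element, as par_for does above.
    for(std::size_t i = 0; i < out.size(); ++i)
        out[i] = submodule(x[i], y[i]);
    for(float v : out)
        std::cout << v << ' ';                                // 11 22 33
}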
@@ -8,6 +8,7 @@
#include <migraphx/streamutils.hpp>
#include <migraphx/functional.hpp>
#include <migraphx/literal.hpp>
+#include <migraphx/par_for.hpp>
#include <migraphx/value.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/int_divide.hpp>
@@ -16,16 +17,18 @@
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
struct pooling
{
std::string mode = "average";
pooling_mode mode = {pooling_mode::average};
std::vector<std::size_t> padding = {0, 0};
std::vector<std::size_t> stride = {1, 1};
std::vector<std::size_t> lengths = {1, 1};
bool ceil_mode = false;
+int lp_order = 2;
template <class Self, class F>
static auto reflect(Self& self, F f)
@@ -34,7 +37,8 @@ struct pooling
f(self.padding, "padding"),
f(self.stride, "stride"),
f(self.lengths, "lengths"),
f(self.ceil_mode, "ceil_mode"));
f(self.ceil_mode, "ceil_mode"),
f(self.lp_order, "lp_order"));
}
std::string name() const { return "pooling"; }
@@ -88,6 +92,114 @@ struct pooling
check_attribute_size();
return stride.size();
}
struct lpnorm_pool
{
int p = 0;
lpnorm_pool() = delete;
explicit lpnorm_pool(int x) : p{x} {};
template <class T>
double init() const
{
return 0.0;
}
double operator()(double x, double y) const { return x + std::pow(std::abs(y), p); }
double final(double x, std::size_t) const { return std::pow(x, 1. / p); }
};
struct avg_pool
{
template <class T>
double init() const
{
return 0.0;
}
double operator()(double x, double y) const { return x + y; }
double final(double x, std::size_t y) const { return (y == 0) ? 0.0 : (x / y); }
};
struct max_pool
{
template <class T>
T init() const
{
return std::numeric_limits<T>::lowest();
}
double operator()(double x, double y) const { return std::max(x, y); }
double final(double x, std::size_t) const { return (x); }
};
template <class Type, class Out, class In, class Op>
void calc_pooling(const shape& output_shape, Out& output, const In& input, Op op) const
{
auto in_s = input.get_shape();
auto in_lens = in_s.lens();
par_for(output_shape.elements(), [&](auto i) {
auto idx_o = output_shape.multi(i);
auto n_dim = idx_o.size();
std::vector<std::size_t> win_start;
std::vector<std::size_t> win_size;
for(std::size_t dim = 2; dim < n_dim; ++dim)
{
auto d_2 = dim - 2;
int start =
static_cast<int>(idx_o[dim] * stride[d_2]) - static_cast<int>(padding[d_2]);
int end = std::min(start + lengths[d_2], in_lens[dim]);
start = std::max(start, 0);
win_start.push_back(start);
win_size.push_back(end - start);
}
shape win_shape{output_shape.type(), win_size};
auto pool_size = win_shape.elements();
double output_val = op.template init<Type>();
shape_for_each(win_shape, [&](auto idx_w) {
auto idx = idx_o;
std::transform(idx_w.begin(),
idx_w.end(),
win_start.begin(),
idx.begin() + 2,
[](auto ii, auto jj) { return ii + jj; });
if(std::all_of(idx.begin() + 2, idx.end(), [&](auto ii) { return ii >= 0; }) and
idx < in_lens)
{
output_val = op(output_val, input[in_s.index(idx)]);
}
});
output[i] = Type(op.final(output_val, pool_size));
});
}
argument compute(const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0])([&](auto output, auto input) {
using type = typename decltype(output)::value_type;
switch(mode)
{
case migraphx::op::pooling_mode::average:
calc_pooling<type>(output_shape, output, input, avg_pool{});
break;
case migraphx::op::pooling_mode::max:
calc_pooling<type>(output_shape, output, input, max_pool{});
break;
case migraphx::op::pooling_mode::lpnorm:
calc_pooling<type>(output_shape, output, input, lpnorm_pool{lp_order});
break;
}
});
return result;
}
};
} // namespace op
......
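calc_pooling above clamps each pooling window against the padded input before reducing it: the start is shifted left by the padding and clamped at 0, the end is clamped at the input length. A standalone sketch of the window bounds for one spatial dimension (in_len 5, kernel 3, stride 2, padding 1 are made-up values):

#include <algorithm>
#include <iostream>

int main()
{
    int in_len = 5, len = 3, stride = 2, pad = 1;
    for(int o = 0; o < 3; ++o)     // output positions
    {
        int start = o * stride - pad;
        int end = std::min(start + len, in_len);
        start = std::max(start, 0);
        std::cout << "window [" << start << ", " << end << ")\n";
        // prints [0, 2)  [1, 4)  [3, 5)
    }
}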
@@ -38,18 +38,38 @@ struct prefix_scan_op : op_name<Derived>
shape normalize_compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(1);
-return inputs.at(0);
+auto s = inputs.front();
+if(s.broadcasted())
+{
+return {s.type(), s.lens()};
+}
+else
+{
+return s.with_lens(s.lens());
+}
}
-argument compute(const shape&, std::vector<argument> args) const
+argument compute(const shape& output_shape, std::vector<argument> args) const
{
-argument result = args[0].copy();
-auto s = result.get_shape();
-auto slice = shape{s.type(), {s.lens()[axis]}, {s.strides()[axis]}};
-auto lens = s.lens();
-lens[axis] = 1;
-auto batch = shape{s.type(), lens, s.strides()};
-auto& self = static_cast<const Derived&>(*this);
+argument result{output_shape};
+auto s = args[0].get_shape();
+if(s == output_shape)
+{
+result = args[0].copy();
+}
+else
+{
+visit_all(result, args[0])([&](auto output, auto input) {
+par_for(output_shape.elements(),
+[&](auto i) { output[output_shape.index(i)] = input[s.index(i)]; });
+});
+s = output_shape;
+}
+auto slice = shape{s.type(), {s.lens()[axis]}, {s.strides()[axis]}};
+auto lens = s.lens();
+lens[axis] = 1;
+auto batch = shape{s.type(), lens, s.strides()};
+auto& self = static_cast<const Derived&>(*this);
result.visit([&](auto output) {
using type = decltype(output);
par_for(batch.elements(), [&](auto i) {
......
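The elided remainder of this hunk runs the scan over each batch slice along axis. A standalone sketch of the inclusive scan itself, with std::partial_sum standing in for the derived op:

#include <iostream>
#include <numeric>
#include <vector>

int main()
{
    // One row per batch; scan along axis 1.
    std::vector<std::vector<int>> data = {{1, 2, 3}, {4, 5, 6}};
    for(auto& row : data)
    {
        std::partial_sum(row.begin(), row.end(), row.begin());
        for(int v : row)
            std::cout << v << ' ';   // 1 3 6  then  4 9 15
        std::cout << '\n';
    }
}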
@@ -9,6 +9,7 @@ namespace op {
struct prelu : binary<prelu>
{
+std::string point_op() const { return "(${0} < 0) ? (${0} * ${1}) : ${0}"; }
auto apply() const
{
return [](auto x, auto slope) { return ((x < 0) ? (x * slope) : x); };
......
@@ -18,21 +18,12 @@ namespace op {
struct quant_dot
{
-int32_t alpha = 1;
-int32_t beta = 1;
-template <class Self, class F>
-static auto reflect(Self& self, F f)
-{
-return pack(f(self.alpha, "alpha"), f(self.beta, "beta"));
-}
value attributes() const { return {{"general_data_type", "dot"}}; }
std::string name() const { return "quant_dot"; }
shape compute_shape(std::vector<shape> inputs) const
{
-check_shapes{{inputs.at(0), inputs.at(1)}, *this}.same_type();
+check_shapes{{inputs.at(0), inputs.at(1)}, *this}.same_type().has(2);
const shape& a = inputs.at(0);
const shape& b = inputs.at(1);
auto t = a.type();
@@ -64,18 +55,6 @@ struct quant_dot
auto out_lens = a.lens();
out_lens[dim_1] = b.lens()[dim_1];
-if(inputs.size() == 3 && out_lens != inputs.at(2).lens())
-{
-MIGRAPHX_THROW("QUANT_DOT: dimension mismatch, operand C: {" +
-to_string_range(inputs.at(2).lens()) +
-"}, cannot add to operand A * B: {" + to_string_range(out_lens) + "}");
-}
-if(inputs.size() == 3 && inputs.at(2).type() != shape::int32_type)
-{
-MIGRAPHX_THROW("QUANT_DOT: operand C type must be int32");
-}
return {shape::int32_type, out_lens};
}
};
......
@@ -25,6 +25,7 @@ struct quantizelinear
std::string name() const { return "quantizelinear"; }
shape compute_shape(std::vector<shape> inputs) const
{
+check_shapes{inputs, *this}.same_dims();
if(inputs.size() == 3)
{
return {inputs[2].type(), inputs[0].lens(), inputs[0].strides()};
@@ -36,7 +37,7 @@
{
auto x = args.at(0);
auto y_scale = args.at(1);
-std::vector<int8_t> zeros(output_shape.elements(), 0);
+std::vector<int8_t> zeros(output_shape.bytes(), 0);
argument y_zero_point{output_shape, zeros.data()};
if(args.size() == 3)
{
......
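For reference, when only two inputs are given the zero point defaults to the zero buffer built above, and the elementwise arithmetic follows the ONNX convention q = saturate(round(x / scale) + zero_point). A standalone sketch for an int8 output with made-up values:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

int main()
{
    float scale = 0.5f;
    int8_t zero_point = 10;
    for(float x : {-70.0f, 0.0f, 58.4f})
    {
        // Round, shift by the zero point, then saturate to the int8 range.
        int q = static_cast<int>(std::nearbyint(x / scale)) + zero_point;
        q = std::clamp(q, -128, 127);
        std::cout << q << '\n';   // -128, 10, 127
    }
}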
@@ -9,6 +9,7 @@ namespace op {
struct recip : unary<recip>
{
+std::string point_op() const { return "1 / ${0}"; }
auto apply() const
{
return [](auto x) { return 1 / x; };
......
@@ -9,6 +9,7 @@
#include <migraphx/shape_for_each.hpp>
#include <migraphx/config.hpp>
#include <migraphx/lifetime.hpp>
+#include <migraphx/value.hpp>
#include <cmath>
#include <utility>
@@ -26,6 +27,8 @@ struct reshape
return pack(f(self.dims, "dims"));
}
+value attributes() const { return {{"require_std_shape", true}}; }
std::string name() const { return "reshape"; }
shape compute_shape(std::vector<shape> inputs) const
{
@@ -72,7 +75,6 @@
return args[0].reshape(output_shape);
}
-lifetime get_lifetime() const { return lifetime::borrow; }
std::ptrdiff_t output_alias(const std::vector<shape>&) const { return 0; }
};
......
@@ -37,7 +37,7 @@ struct rnn_var_sl_shift_output
argument compute(const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
-int64_t max_len = static_cast<int64_t>(output_shape.lens()[0]);
+int64_t max_len = output_shape.lens()[0];
visit_all(result, args[0])([&](auto output, auto input) {
using value_type = typename decltype(output)::value_type;
args[1].visit([&](auto seq_lens) {
@@ -76,7 +76,7 @@ struct rnn_var_sl_shift_sequence
argument compute(const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
-int64_t max_len = static_cast<int64_t>(output_shape.lens()[0]);
+int64_t max_len = output_shape.lens()[0];
visit_all(result, args[0])([&](auto output, auto input) {
using value_type = typename decltype(output)::value_type;
args[1].visit([&](auto seq_lens) {
......