Merge branch 'develop' into type-string-driver

1b098fd7 · Paul Fultz II · GitHub · 05f2ee1c · c0398ded · 1b098fd7
Unverified Commit 1b098fd7 authored Jun 21, 2022 by Paul Fultz II Committed by GitHub Jun 21, 2022
20 changed files
--- a/src/include/migraphx/op/prefix_scan_sum.hpp
+++ b/src/include/migraphx/op/prefix_scan_sum.hpp
+#ifndef MIGRAPHX_GUARD_OPERATORS_SCAN_INCLUSIVE_SUM_HPP
+#define MIGRAPHX_GUARD_OPERATORS_SCAN_INCLUSIVE_SUM_HPP
+#include <migraphx/op/name.hpp>
+#include <migraphx/operation.hpp>
+#include <migraphx/check_shapes.hpp>
+#include <migraphx/config.hpp>
+#include <migraphx/argument.hpp>
+#include <migraphx/op/prefix_scan_op.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace op {
+// Prefix-scan operator whose combining step is addition. Every constructor
+// simply forwards its arguments to the CRTP base prefix_scan_op
+// (presumably axis / exclusive / reverse, going by the parameter names).
+struct prefix_scan_sum : prefix_scan_op<prefix_scan_sum>
+{
+    prefix_scan_sum() {}
+    prefix_scan_sum(int64_t ax)
+        : prefix_scan_op(ax)
+    {
+    }
+    prefix_scan_sum(int64_t ax, bool excl)
+        : prefix_scan_op(ax, excl)
+    {
+    }
+    prefix_scan_sum(int64_t ax, bool excl, bool rev)
+        : prefix_scan_op(ax, excl, rev)
+    {
+    }
+    // Returns the binary functor used to combine two values: their sum.
+    auto op() const
+    {
+        return [](auto lhs, auto rhs) { return lhs + rhs; };
+    }
+};
+} // namespace op
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/include/migraphx/op/prelu.hpp
+++ b/src/include/migraphx/op/prelu.hpp
+#ifndef MIGRAPHX_GUARD_OPERATORS_PRELU_HPP
+#define MIGRAPHX_GUARD_OPERATORS_PRELU_HPP
+#include <migraphx/op/binary.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace op {
+// Element-wise binary operator: a negative first operand is multiplied by the
+// second operand (the slope); a non-negative first operand is returned as-is.
+struct prelu : binary<prelu>
+{
+    // Textual form of the same computation; ${0} and ${1} stand for the first
+    // and second operand.
+    std::string point_op() const
+    {
+        return "(${0} < 0) ? (${0} * ${1}) : ${0}";
+    }
+    // Callable form of the operator. The conditional expression is kept so the
+    // result type is the common type of (x * slope) and x.
+    auto apply() const
+    {
+        return [](auto x, auto slope) {
+            const bool is_negative = x < 0;
+            return is_negative ? (x * slope) : x;
+        };
+    }
+};
+} // namespace op
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/include/migraphx/op/quant_convolution.hpp
+++ b/src/include/migraphx/op/quant_convolution.hpp
@@ -3,12 +3,12 @@
 #include <array>
 #include <migraphx/op/common.hpp>
-#include <migraphx/operation.hpp>
 #include <migraphx/check_shapes.hpp>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/streamutils.hpp>
 #include <migraphx/literal.hpp>
 #include <migraphx/shape_for_each.hpp>
+#include <migraphx/value.hpp>
 #include <migraphx/config.hpp>
 #include <cmath>
 #include <utility>
@@ -19,9 +19,9 @@ namespace op {
 struct quant_convolution
 {
-    std::array<std::size_t, 2> padding  = {{0, 0}};
+    std::vector<std::size_t> padding  = {0, 0};
-    std::array<std::size_t, 2> stride   = {{1, 1}};
+    std::vector<std::size_t> stride   = {1, 1};
-    std::array<std::size_t, 2> dilation = {{1, 1}};
+    std::vector<std::size_t> dilation = {1, 1};
    padding_mode_t padding_mode = default_;
    int group                   = 1;
@@ -36,14 +36,35 @@ struct quant_convolution
                    f(self.group, "group"));
    }
+    value attributes() const
+    {
+        return {{"general_data_type", "convolution"}, {"normalize_padding", "padding"}};
+    }
    std::string name() const { return "quant_convolution"; }
-    shape compute_shape(std::vector<shape> inputs) const
+    void check_attribute_size() const
+    {
+        if(not((padding.size() == stride.size() or (padding.size() / 2) == stride.size()) and
+               stride.size() == dilation.size()))
+        {
+            MIGRAPHX_THROW("QUANT_CONVOLUTION: inconsistent attribute sizes");
+        }
+    }
+    shape normalize_compute_shape(std::vector<shape> inputs) const
    {
-        check_shapes{inputs, *this}.has(2).same_type().same_ndims().only_dims(4);
+        check_shapes{inputs, *this}.has(2).same_type().same_ndims().min_ndims(3);
+        check_attribute_size();
        const shape& input   = inputs.at(0);
        const shape& weights = inputs.at(1);
        auto t               = input.type();
+        size_t kdims         = input.lens().size() - 2;
+        if(kdims != this->kdims())
+        {
+            MIGRAPHX_THROW("quant_convolution: input k-dims does not match attribute size");
+        }
        // all input type must be int8_type and output is float_type
        if(t != shape::int8_type)
@@ -52,23 +73,28 @@ struct quant_convolution
        }
        t = shape::int32_type;
-        return {t,
+        std::vector<size_t> output_lens{input.lens()[0], weights.lens()[0]};
-                {
+        auto padding_size = padding.size();
-                    input.lens()[0],
+        for(size_t i = 0; i < kdims; i++)
-                    weights.lens()[0],
+        {
-                    std::size_t(std::max<std::ptrdiff_t>(
+            auto padding_factor = 2 * padding[i];
-                        1,
+            if(padding_size == 2 * kdims)
-                        (input.lens()[2] - (1 + dilation[0] * (weights.lens()[2] - 1)) +
+                padding_factor = padding[i] + padding[i + kdims];
-                         2 * padding[0]) /
+            output_lens.push_back(std::size_t(std::max<std::ptrdiff_t>(
-                                stride[0] +
+                1,
-                            1)),
+                (input.lens()[i + 2] - (1 + dilation[i] * (weights.lens()[i + 2] - 1)) +
-                    std::size_t(std::max<std::ptrdiff_t>(
+                 padding_factor) /
-                        1,
+                        stride[i] +
-                        (input.lens()[3] - (1 + dilation[1] * (weights.lens()[3] - 1)) +
+                    1)));
-                         2 * padding[1]) /
+        }
-                                stride[1] +
-                            1)),
+        return inputs[0].with_lens(t, output_lens);
-                }};
+    }
+    size_t kdims() const
+    {
+        check_attribute_size();
+        return stride.size();
    }
 };

--- a/src/include/migraphx/op/quant_dot.hpp
+++ b/src/include/migraphx/op/quant_dot.hpp
@@ -2,13 +2,13 @@
 #define MIGRAPHX_GUARD_OPERATORS_QUANT_DOT_HPP
 #include <array>
-#include <migraphx/operation.hpp>
 #include <migraphx/check_shapes.hpp>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/streamutils.hpp>
 #include <migraphx/literal.hpp>
 #include <migraphx/shape_for_each.hpp>
 #include <migraphx/config.hpp>
+#include <migraphx/value.hpp>
 #include <cmath>
 #include <utility>
@@ -18,19 +18,12 @@ namespace op {
 struct quant_dot
 {
-    int32_t alpha = 1;
+    value attributes() const { return {{"general_data_type", "dot"}}; }
-    int32_t beta  = 1;
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return pack(f(as_number(self.alpha), "alpha"), f(as_number(self.beta), "beta"));
-    }
    std::string name() const { return "quant_dot"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
-        check_shapes{{inputs.at(0), inputs.at(1)}, *this}.same_type();
+        check_shapes{{inputs.at(0), inputs.at(1)}, *this}.same_type().has(2);
        const shape& a = inputs.at(0);
        const shape& b = inputs.at(1);
        auto t         = a.type();
@@ -60,27 +53,8 @@ struct quant_dot
                           to_string_range(a.lens()) + "} x {" + to_string_range(b.lens()) + "}");
        }
-        // k be multiple of 4
-        if((a.lens()[dim_1] % 4) != 0)
-        {
-            MIGRAPHX_THROW("QUANT_DOT: size of A {" + to_string_range(a.lens()) + "} and B {" +
-                           to_string_range(b.lens()) + "} must be multiple of 4 for int8 type");
-        }
        auto out_lens   = a.lens();
        out_lens[dim_1] = b.lens()[dim_1];
-        if(inputs.size() == 3 && out_lens != inputs.at(2).lens())
-        {
-            MIGRAPHX_THROW("QUANT_DOT: dimension mismatch, operand C: {" +
-                           to_string_range(inputs.at(2).lens()) +
-                           "}, cannot add to operand A * B: {" + to_string_range(out_lens) + "}");
-        }
-        if(inputs.size() == 3 && inputs.at(2).type() != shape::int32_type)
-        {
-            MIGRAPHX_THROW("QUANT_DOT: operand C type must be int32");
-        }
        return {shape::int32_type, out_lens};
    }
 };

--- a/src/include/migraphx/op/quantizelinear.hpp
+++ b/src/include/migraphx/op/quantizelinear.hpp
+#ifndef MIGRAPHX_GUARD_OPERATORS_QUANTIZE_LINEAR_HPP
+#define MIGRAPHX_GUARD_OPERATORS_QUANTIZE_LINEAR_HPP
+#include <array>
+#include <migraphx/op/common.hpp>
+#include <migraphx/operation.hpp>
+#include <migraphx/check_shapes.hpp>
+#include <migraphx/stringutils.hpp>
+#include <migraphx/streamutils.hpp>
+#include <migraphx/literal.hpp>
+#include <migraphx/config.hpp>
+#include <migraphx/par_for.hpp>
+#include <migraphx/value.hpp>
+#include <migraphx/op/normalize_attribute.hpp>
+#include <migraphx/tune_axis.hpp>
+#include <cmath>
+#include <utility>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace op {
+// Reference implementation of the "quantizelinear" operator: every output
+// element is round(input / scale) + zero_point, clamped to the representable
+// range of the output element type.
+struct quantizelinear
+{
+    std::string name() const { return "quantizelinear"; }
+    // The output keeps the lens and strides of inputs[0]. Its element type is
+    // the type of the third input (the zero point) when three inputs are given,
+    // otherwise uint8.
+    // NOTE(review): the number of inputs is not validated here and inputs[0] is
+    // read unconditionally -- confirm callers always pass 2 or 3 shapes.
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        check_shapes{inputs, *this}.same_dims();
+        if(inputs.size() == 3)
+        {
+            return {inputs[2].type(), inputs[0].lens(), inputs[0].strides()};
+        }
+        return {shape::uint8_type, inputs[0].lens(), inputs[0].strides()};
+    }
+    // args[0] is the data to quantize, args[1] the scale and the optional
+    // args[2] the zero point. All of them are indexed with the same flat index,
+    // so they are expected to hold one value per output element.
+    argument compute(const shape& output_shape, std::vector<argument> args) const
+    {
+        auto x       = args.at(0);
+        auto y_scale = args.at(1);
+        // Default zero point: a zero-filled byte buffer shaped like the output.
+        // y_zero_point is built from zeros.data(), so `zeros` has to stay alive
+        // for as long as y_zero_point is read below.
+        std::vector<int8_t> zeros(output_shape.bytes(), 0);
+        argument y_zero_point{output_shape, zeros.data()};
+        if(args.size() == 3)
+        {
+            y_zero_point = args.at(2);
+        }
+        argument result{output_shape};
+        // result and y_zero_point are visited together (same element type);
+        // x and y_scale are visited separately because their types may differ.
+        visit_all(result, y_zero_point)([&](auto output, auto zero_pts) {
+            x.visit([&](auto input) {
+                y_scale.visit([&](auto scales) {
+                    // Saturation bounds come from the output element type.
+                    // NOTE(review): std::numeric_limits is used but <limits> is
+                    // not among the includes visible for this header -- confirm
+                    // it is pulled in transitively.
+                    using quant_type = typename decltype(output)::value_type;
+                    auto min_value   = std::numeric_limits<quant_type>::min();
+                    auto max_value   = std::numeric_limits<quant_type>::max();
+                    par_for(output_shape.elements(), [&](auto i) {
+                        // The arithmetic is done in int64_t so that adding the
+                        // zero point cannot wrap before the clamp is applied.
+                        // NOTE(review): std::round rounds halfway cases away
+                        // from zero; the ONNX QuantizeLinear spec asks for
+                        // round-half-to-even (std::nearbyint) -- confirm which
+                        // behaviour is intended.
+                        int64_t quantized = static_cast<int64_t>(std::round(input[i] / scales[i])) +
+                                            static_cast<int64_t>(zero_pts[i]);
+                        output[i] = std::max(static_cast<int64_t>(min_value),
+                                             std::min(static_cast<int64_t>(max_value), quantized));
+                    });
+                });
+            });
+        });
+        return result;
+    }
+};
+} // namespace op
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/include/migraphx/op/recip.hpp
+++ b/src/include/migraphx/op/recip.hpp
+#ifndef MIGRAPHX_GUARD_OPERATORS_RECIP_HPP
+#define MIGRAPHX_GUARD_OPERATORS_RECIP_HPP
+#include <migraphx/op/unary.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace op {
+// Element-wise unary operator computing the reciprocal 1 / x.
+struct recip : unary<recip>
+{
+    // Textual form of the computation; ${0} stands for the operand.
+    std::string point_op() const
+    {
+        return "1 / ${0}";
+    }
+    // Callable form of the operator. The division is performed in the type
+    // that results from dividing the int literal 1 by the operand.
+    auto apply() const
+    {
+        return [](auto value) {
+            auto reciprocal = 1 / value;
+            return reciprocal;
+        };
+    }
+};
+} // namespace op
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/include/migraphx/op/reduce_mean.hpp
+++ b/src/include/migraphx/op/reduce_mean.hpp
@@ -14,12 +14,12 @@ struct reduce_mean : reduce_op<reduce_mean>
    auto op() const
    {
-        return [=](auto x, auto y) { return x + y; };
+        return [](auto x, auto y) { return x + y; };
    }
    auto output(const shape& s) const
    {
-        return [&](auto val) { return val / s.elements(); };
+        return [&](auto val) { return val / static_cast<decltype(val)>(s.elements()); };
    }
 };

--- a/src/include/migraphx/op/reduce_op.hpp
+++ b/src/include/migraphx/op/reduce_op.hpp
@@ -7,6 +7,8 @@
 #include <migraphx/shape_for_each.hpp>
 #include <migraphx/par_for.hpp>
 #include <migraphx/config.hpp>
+#include <migraphx/value.hpp>
+#include <migraphx/op/normalize_attribute.hpp>
 #include <vector>
 namespace migraphx {
@@ -40,6 +42,15 @@ struct zero
    }
 };
+struct one
+{
+    template <class T>
+    operator T() const
+    {
+        return T{1};
+    }
+};
 template <class Derived>
 struct reduce_op : op_name<Derived>
 {
@@ -51,6 +62,13 @@ struct reduce_op : op_name<Derived>
        return pack(f(self.axes, "axes"));
    }
+    value attributes() const
+    {
+        value normalize;
+        normalize["axes"] = value::array{normalize_attribute::include_min};
+        return {{"normalize_axes", normalize}};
+    }
    std::vector<int64_t> tune_axes(std::size_t n_dim) const
    {
        auto tuned_axes = axes;
@@ -59,26 +77,11 @@ struct reduce_op : op_name<Derived>
            tuned_axes.resize(n_dim);
            std::iota(tuned_axes.begin(), tuned_axes.end(), 0);
        }
-        else
-        {
-            for(auto& axis : tuned_axes)
-            {
-                int64_t s_dim = static_cast<int64_t>(n_dim);
-                if(axis >= s_dim or axis < -s_dim)
-                {
-                    MIGRAPHX_THROW("REDUCE_OP: axis out of range");
-                }
-                if(axis < 0)
-                {
-                    axis += n_dim;
-                }
-            }
-        }
        return tuned_axes;
    }
-    shape compute_shape(std::vector<shape> inputs) const
+    shape normalize_compute_shape(std::vector<shape> inputs) const
    {
        check_shapes{inputs, *this}.has(1);
        auto s          = inputs.at(0);
@@ -89,7 +92,7 @@ struct reduce_op : op_name<Derived>
            lens[axis] = 1;
        }
-        return {s.type(), lens};
+        return inputs[0].with_lens(lens);
    }
    template <class T>
@@ -110,13 +113,14 @@ struct reduce_op : op_name<Derived>
                std::vector<std::size_t>& out_idx,
                tensor_view<T>& output) const
    {
-        auto data_idx = out_idx;
+        using accumulator = accumulator_type<T>;
-        T val         = static_cast<const Derived&>(*this).init();
+        auto& self        = static_cast<const Derived&>(*this);
+        auto data_idx     = out_idx;
+        accumulator val   = self.init();
        shape_for_each(batch_shape, [&](auto b_idx) {
            this->tune_dims(tuned_axes, b_idx, data_idx);
-            val = static_cast<const Derived&>(*this).op()(
+            accumulator x = input(data_idx.begin(), data_idx.end());
-                static_cast<const Derived&>(*this).input()(input(data_idx.begin(), data_idx.end())),
+            val           = self.op()(accumulator{self.input()(x)}, val);
-                val);
        });
        output(out_idx.begin(), out_idx.end()) =
@@ -145,12 +149,12 @@ struct reduce_op : op_name<Derived>
    auto input() const
    {
-        return [&](auto val) { return val; };
+        return [](auto val) { return val; };
    }
    auto output(const shape&) const
    {
-        return [&](auto val) { return val; };
+        return [](auto val) { return val; };
    }
    reduce_op() {}

--- a/src/include/migraphx/op/reduce_prod.hpp
+++ b/src/include/migraphx/op/reduce_prod.hpp
+#ifndef MIGRAPHX_GUARD_OPERATORS_REDUCE_PROD_HPP
+#define MIGRAPHX_GUARD_OPERATORS_REDUCE_PROD_HPP
+#include <migraphx/op/reduce_op.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace op {
+// Reduction operator that multiplies the reduced elements together. The
+// reduction loop and the shape handling come from the CRTP base reduce_op.
+struct reduce_prod : reduce_op<reduce_prod>
+{
+    reduce_prod() {}
+    // ax: axes to reduce over; forwarded to reduce_op.
+    reduce_prod(std::vector<int64_t> ax) : reduce_op(std::move(ax)) {}
+    // Binary combining step. The lambda uses nothing from the enclosing
+    // scope, so it is written without a capture-default, like the other
+    // reduction functors.
+    auto op() const
+    {
+        return [](auto x, auto y) { return x * y; };
+    }
+    // Starting value of the accumulation: `one` converts to T{1}, the
+    // multiplicative identity.
+    auto init() const { return one(); }
+};
+} // namespace op
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/include/migraphx/op/relu.hpp
+++ b/src/include/migraphx/op/relu.hpp
@@ -3,7 +3,6 @@
 #include <array>
 #include <migraphx/op/unary.hpp>
-#include <migraphx/operation.hpp>
 #include <migraphx/check_shapes.hpp>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/streamutils.hpp>
@@ -19,6 +18,7 @@ namespace op {
 struct relu : unary<relu>
 {
+    std::string point_op() const { return "${function:max}(decltype(${0}){0}, ${0})"; }
    auto apply() const
    {
        return [](auto x) { return std::max(decltype(x){0}, x); };

--- a/src/include/migraphx/op/reshape.hpp
+++ b/src/include/migraphx/op/reshape.hpp
@@ -2,13 +2,14 @@
 #define MIGRAPHX_GUARD_OPERATORS_RESHAPE_HPP
 #include <array>
-#include <migraphx/operation.hpp>
 #include <migraphx/check_shapes.hpp>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/streamutils.hpp>
 #include <migraphx/literal.hpp>
 #include <migraphx/shape_for_each.hpp>
 #include <migraphx/config.hpp>
+#include <migraphx/lifetime.hpp>
+#include <migraphx/value.hpp>
 #include <cmath>
 #include <utility>
@@ -26,6 +27,8 @@ struct reshape
        return pack(f(self.dims, "dims"));
    }
+    value attributes() const { return {{"require_std_shape", true}}; }
    std::string name() const { return "reshape"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
@@ -34,7 +37,8 @@ struct reshape
        std::vector<std::size_t> rdims(dims.begin(), dims.end());
        auto n_neg_dims = std::count(dims.begin(), dims.end(), -1);
        if(n_neg_dims > 1)
-            MIGRAPHX_THROW("Dimensions for reshape can only have one -1 dim");
+            MIGRAPHX_THROW("Reshape: Dimensions for reshape can only have one -1 dim");
        for(std::size_t i = 0; i < dims.size(); i++)
        {
            if(dims[i] == 0)
@@ -45,6 +49,7 @@ struct reshape
            if(dims[i] == -1)
                rdims[i] = 1;
        }
        if(n_neg_dims > 0)
        {
            size_t missing_dim =
@@ -59,15 +64,17 @@ struct reshape
        shape s{inputs.front().type(), rdims};
        if(s.elements() != inputs.front().elements())
-            MIGRAPHX_THROW("Wrong number of elements for reshape: reshape has " +
+            MIGRAPHX_THROW("Reshape: Wrong number of elements for reshape: reshape has " +
                           std::to_string(s.elements()) + " elements whereas the input has " +
                           std::to_string(inputs.front().elements()));
        return s;
    }
    argument compute(shape output_shape, std::vector<argument> args) const
    {
-        return {std::move(output_shape), std::move(args.front().data)};
+        return args[0].reshape(output_shape);
    }
    std::ptrdiff_t output_alias(const std::vector<shape>&) const { return 0; }
 };

--- a/src/include/migraphx/op/reverse.hpp
+++ b/src/include/migraphx/op/reverse.hpp
+#ifndef MIGRAPHX_GUARD_OPERATORS_REVERSE_HPP
+#define MIGRAPHX_GUARD_OPERATORS_REVERSE_HPP
+#include <algorithm>
+#include <vector>
+#include <cmath>
+#include <utility>
+#include <migraphx/config.hpp>
+#include <migraphx/argument.hpp>
+#include <migraphx/par_for.hpp>
+#include <migraphx/op/normalize_attribute.hpp>
+#include <migraphx/shape_for_each.hpp>
+#include <migraphx/argument.hpp>
+#include <migraphx/value.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace op {
+// Operator that mirrors its input along every axis listed in `axes`; the
+// output has the same lens as the input.
+struct reverse
+{
+    std::vector<int64_t> axes;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.axes, "axes"));
+    }
+    std::string name() const
+    {
+        return "reverse";
+    }
+    // Publishes a "normalize_axes" entry that tags the "axes" attribute with
+    // normalize_attribute::include_min.
+    value attributes() const
+    {
+        value axes_rule;
+        axes_rule["axes"] = value::array{normalize_attribute::include_min};
+        return {{"normalize_axes", axes_rule}};
+    }
+    // The output shape is derived from the first input and keeps its lens.
+    shape normalize_compute_shape(std::vector<shape> inputs) const
+    {
+        auto& in_shape = inputs[0];
+        return in_shape.with_lens(in_shape.lens());
+    }
+    // For each output index, the source index is the same index with every
+    // coordinate along a reversed axis replaced by (extent - 1 - coordinate).
+    argument compute(const shape& s, std::vector<argument> args) const
+    {
+        argument reversed{s};
+        auto extents = s.lens();
+        visit_all(reversed, args.front())([&](auto dst, auto src) {
+            shape_for_each(s, [&](const auto& dst_idx) {
+                auto src_idx = dst_idx;
+                for(const auto& ax : axes)
+                {
+                    src_idx[ax] = extents[ax] - 1 - dst_idx[ax];
+                }
+                dst[s.index(dst_idx)] = src[s.index(src_idx)];
+            });
+        });
+        return reversed;
+    }
+};
+} // namespace op
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/include/migraphx/op/rnn.hpp
+++ b/src/include/migraphx/op/rnn.hpp
@@ -3,8 +3,8 @@
 #include <array>
 #include <migraphx/op/common.hpp>
-#include <migraphx/op/tanh.hpp>
 #include <migraphx/operation.hpp>
+#include <migraphx/op/tanh.hpp>
 #include <migraphx/check_shapes.hpp>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/streamutils.hpp>

--- a/src/include/migraphx/op/rnn_last_cell_output.hpp
+++ b/src/include/migraphx/op/rnn_last_cell_output.hpp
 #ifndef MIGRAPHX_GUARD_OPERATORS_RNN_LAST_CELL_OUTPUT_HPP
 #define MIGRAPHX_GUARD_OPERATORS_RNN_LAST_CELL_OUTPUT_HPP
-#include <array>
-#include <migraphx/operation.hpp>
 #include <migraphx/check_shapes.hpp>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/streamutils.hpp>
-#include <migraphx/literal.hpp>
-#include <migraphx/shape_for_each.hpp>
 #include <migraphx/config.hpp>
-#include <cmath>
-#include <utility>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace op {
-struct lstm_last_cell_output
+struct rnn_last_cell_output
 {
-    std::string name() const { return "lstm_last_cell_output"; }
+    std::string name() const { return "rnn_last_cell_output"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
-        check_shapes{inputs, *this}.has(1);
        auto dims = inputs[0].lens();
        // remove the first dimension, remaing are output shape

--- a/src/include/migraphx/op/rnn_last_output.hpp
+++ b/src/include/migraphx/op/rnn_last_output.hpp
-#ifndef MIGRAPHX_GUARD_OPERATORS_RNN_LAST_OUTPUT_HPP
+#ifndef MIGRAPHX_GUARD_OPERATORS_RNN_LAST_HS_OUTPUT_HPP
-#define MIGRAPHX_GUARD_OPERATORS_RNN_LAST_OUTPUT_HPP
+#define MIGRAPHX_GUARD_OPERATORS_RNN_LAST_HS_OUTPUT_HPP
-#include <array>
-#include <migraphx/operation.hpp>
 #include <migraphx/check_shapes.hpp>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/streamutils.hpp>
-#include <migraphx/literal.hpp>
-#include <migraphx/shape_for_each.hpp>
 #include <migraphx/config.hpp>
-#include <cmath>
-#include <utility>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace op {
-struct rnn_last_output
+struct rnn_last_hs_output
 {
-    std::string name() const { return "rnn_last_output"; }
+    std::string name() const { return "rnn_last_hs_output"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
-        check_shapes{inputs, *this}.has(1);
        auto dims = inputs[0].lens();
        // remove the first dimension, remaing are output shape

--- a/src/include/migraphx/op/rnn_var_sl_last_output.hpp
+++ b/src/include/migraphx/op/rnn_var_sl_last_output.hpp
+#ifndef MIGRAPHX_GUARD_OPERATORS_RNN_VAR_SL_LAST_OUTPUT_HPP
+#define MIGRAPHX_GUARD_OPERATORS_RNN_VAR_SL_LAST_OUTPUT_HPP
+#include <array>
+#include <migraphx/op/common.hpp>
+#include <migraphx/check_shapes.hpp>
+#include <migraphx/stringutils.hpp>
+#include <migraphx/streamutils.hpp>
+#include <migraphx/config.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace op {
+// Operator descriptor for "rnn_var_sl_last_output". Only the shape
+// computation is defined in this struct.
+struct rnn_var_sl_last_output
+{
+    rnn_direction direction = rnn_direction::forward;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.direction, "direction"));
+    }
+    std::string name() const
+    {
+        return "rnn_var_sl_last_output";
+    }
+    // The output has the type of the first input and its lens without the
+    // leading dimension.
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        auto out_lens = inputs[0].lens();
+        // drop the first dimension; the remaining ones are the output shape
+        out_lens.erase(out_lens.begin());
+        return {inputs[0].type(), out_lens};
+    }
+};
+} // namespace op
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/include/migraphx/op/rnn_variable_seq_lens.hpp
+++ b/src/include/migraphx/op/rnn_variable_seq_lens.hpp
+#ifndef MIGRAPHX_GUARD_OPERATORS_RNN_VARIABLE_SEQ_LENS_HPP
+#define MIGRAPHX_GUARD_OPERATORS_RNN_VARIABLE_SEQ_LENS_HPP
+#include <array>
+#include <migraphx/op/common.hpp>
+#include <migraphx/check_shapes.hpp>
+#include <migraphx/stringutils.hpp>
+#include <migraphx/streamutils.hpp>
+#include <migraphx/literal.hpp>
+#include <migraphx/par_for.hpp>
+#include <migraphx/config.hpp>
+#include <cmath>
+#include <utility>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace op {
+// Copies args[0] into an output of the same shape while honouring per-batch
+// sequence lengths (args[1]): time steps at or beyond a batch entry's length
+// are zero-filled, and for the reverse direction the valid steps are read from
+// the tail of the input so that they end up at the front of the output.
+struct rnn_var_sl_shift_output
+{
+    // Reflected as "output_name"; not read by compute() in this struct.
+    std::string output_name = "hidden_states";
+    rnn_direction direction = rnn_direction::forward;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.output_name, "output_name"), f(self.direction, "direction"));
+    }
+    std::string name() const { return "rnn_var_sl_shift_output"; }
+    // Requires two inputs; the output shape is the shape of the first one.
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        check_shapes{inputs, *this}.has(2);
+        return inputs[0];
+    }
+    // args[0]: data to shift, args[1]: sequence length per batch entry.
+    // The indexing below reads idx[0..2], so the data needs at least three
+    // dimensions -- assumes a (time, direction, batch, ...) layout, TODO confirm.
+    argument compute(const shape& output_shape, std::vector<argument> args) const
+    {
+        argument result{output_shape};
+        // Extent of the leading (time) dimension.
+        int64_t max_len = output_shape.lens()[0];
+        visit_all(result, args[0])([&](auto output, auto input) {
+            using value_type = typename decltype(output)::value_type;
+            args[1].visit([&](auto seq_lens) {
+                par_for(output_shape.elements(), [&](auto i) {
+                    auto idx       = output_shape.multi(i);
+                    auto batch_id  = idx[2];
+                    auto d         = idx[1];
+                    auto t         = idx[0];
+                    auto sl        = seq_lens[batch_id];
+                    // Positions past the sequence length keep this zero.
+                    value_type val = value_type{0};
+                    // NOTE(review): t comes from the multi-index while sl has
+                    // the element type of args[1]; if that type is signed this
+                    // is a mixed signed/unsigned comparison -- confirm sl is
+                    // never negative.
+                    if(t < sl)
+                    {
+                        auto in_idx = idx;
+                        // Shift the read position by the padding amount
+                        // (max_len - sl) when the op runs in reverse or when
+                        // the element belongs to direction index 1.
+                        int offset  = (direction == rnn_direction::reverse or d == 1) ? 1 : 0;
+                        in_idx[0] += offset * (max_len - sl);
+                        val = input(in_idx.begin(), in_idx.end());
+                    }
+                    output(idx.begin(), idx.end()) = val;
+                });
+            });
+        });
+        return result;
+    }
+};
+// Moves each batch entry's valid time steps to the end of the time axis:
+// output[t] = input[t - (max_len - sl)] for t >= max_len - sl, and zero for
+// the leading (max_len - sl) steps, where sl is that entry's sequence length.
+struct rnn_var_sl_shift_sequence
+{
+    std::string name() const { return "rnn_var_sl_shift_sequence"; }
+    // Requires two inputs; the output shape is the shape of the first one.
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        check_shapes{inputs, *this}.has(2);
+        return inputs[0];
+    }
+    // args[0]: data to shift, args[1]: sequence length per batch entry.
+    // The indexing below reads idx[0] as the time step and idx[1] as the batch
+    // index -- assumes a (time, batch, ...) layout, TODO confirm.
+    argument compute(const shape& output_shape, std::vector<argument> args) const
+    {
+        argument result{output_shape};
+        // Extent of the leading (time) dimension.
+        int64_t max_len = output_shape.lens()[0];
+        visit_all(result, args[0])([&](auto output, auto input) {
+            using value_type = typename decltype(output)::value_type;
+            args[1].visit([&](auto seq_lens) {
+                par_for(output_shape.elements(), [&](auto i) {
+                    auto idx       = output_shape.multi(i);
+                    auto b         = idx[1];
+                    auto t         = idx[0];
+                    auto sl        = seq_lens[b];
+                    // Leading padding positions keep this zero.
+                    value_type val = value_type{0};
+                    // NOTE(review): t comes from the multi-index while
+                    // (max_len - sl) is signed; confirm sl never exceeds
+                    // max_len, otherwise the comparison and the subtraction
+                    // below operate on a negative padding amount.
+                    if(t >= max_len - sl)
+                    {
+                        auto in_idx = idx;
+                        // Read from the position shifted back by the padding.
+                        in_idx[0] -= (max_len - sl);
+                        val = input(in_idx.begin(), in_idx.end());
+                    }
+                    output(idx.begin(), idx.end()) = val;
+                });
+            });
+        });
+        return result;
+    }
+};
+} // namespace op
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/include/migraphx/op/roialign.hpp
+++ b/src/include/migraphx/op/roialign.hpp
+#ifndef MIGRAPHX_GUARD_OPERATORS_ROIALIGN_HPP
+#define MIGRAPHX_GUARD_OPERATORS_ROIALIGN_HPP
+#include <limits>
+#include <migraphx/check_shapes.hpp>
+#include <migraphx/op/common.hpp>
+#include <migraphx/config.hpp>
+#include <migraphx/argument.hpp>
+#include <migraphx/par_for.hpp>
+#include <migraphx/dfor.hpp>
+#include <migraphx/ranges.hpp>
+#include <migraphx/shape_for_each.hpp>
+#include <cmath>
+#include <numeric>
+#include <utility>
+#include <vector>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace op {
+struct roialign
+{
+    // Coordinate transformation mode. calc_pos_weight subtracts 0.5 from each
+    // sample coordinate when this equals "output_half_pixel".
+    std::string coord_trans_mode = "half_pixel";
+    // Pooling mode; defaults to average.
+    pooling_mode mode            = {pooling_mode::average};
+    // Extents written to output dimensions 2 and 3 by compute_shape.
+    int64_t output_height        = 1;
+    int64_t output_width         = 1;
+    // NOTE(review): presumably the number of sampling points per bin, with 0
+    // meaning "choose adaptively" -- confirm against compute().
+    int64_t sampling_ratio       = 0;
+    // NOTE(review): presumably the factor mapping roi coordinates onto the
+    // feature map -- confirm against compute().
+    float spatial_scale          = 1.0f;
+    // Reflection hook: applies f to every attribute together with its
+    // serialized name and packs the results. Note that coord_trans_mode is
+    // exposed under the longer name "coordinate_transformation_mode".
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.coord_trans_mode, "coordinate_transformation_mode"),
+                    f(self.mode, "mode"),
+                    f(self.output_height, "output_height"),
+                    f(self.output_width, "output_width"),
+                    f(self.sampling_ratio, "sampling_ratio"),
+                    f(self.spatial_scale, "spatial_scale"));
+    }
+    // Operator name under which this struct is identified.
+    std::string name() const
+    {
+        return "roialign";
+    }
+    // Validates the three inputs and derives the output shape.
+    //   inputs[0]: feature map, at least 4 dimensions
+    //   inputs[1]: rois, 2 dimensions with a second extent of 4
+    //   inputs[2]: batch indices, 1 dimension, one entry per roi
+    // The output has the type of inputs[0] and its lens, with dimension 0
+    // replaced by the number of rois and dimensions 2 and 3 replaced by
+    // output_height and output_width.
+    // Throws (via MIGRAPHX_THROW) when any of the checks below fails.
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        check_shapes{inputs, *this}.has(3);
+        auto x_lens   = inputs.at(0).lens();
+        auto roi_lens = inputs.at(1).lens();
+        auto bi_lens  = inputs.at(2).lens();
+        auto type     = inputs.at(0).type();
+        // check input correct
+        // Dimensions 2 and 3 of the output are overwritten below, so a feature
+        // map with fewer than 4 dimensions would be indexed out of bounds.
+        if(x_lens.size() < 4)
+        {
+            MIGRAPHX_THROW("ROIALIGN: input should have at least 4 dimensions!");
+        }
+        if(bi_lens.size() != 1)
+        {
+            MIGRAPHX_THROW("ROIALIGN: batch indices should be 1 dimension!");
+        }
+        if(roi_lens.size() != 2 or roi_lens.at(1) != 4)
+        {
+            MIGRAPHX_THROW(
+                "ROIALIGN: rois should be 2 dimensions, and the second dim should be 4!");
+        }
+        if(roi_lens.front() != bi_lens.front())
+        {
+            MIGRAPHX_THROW("ROIALIGN: rois and batch indices inputs should have the same number!");
+        }
+        std::vector<std::size_t> out_lens = x_lens;
+        out_lens[0]                       = roi_lens[0];
+        out_lens[2]                       = output_height;
+        out_lens[3]                       = output_width;
+        return {type, out_lens};
+    }
+    // One bilinear sample: the flat offsets of its four neighbours and the
+    // weight applied to each of them. A default-constructed value is all
+    // zeros.
+    struct pos_weight
+    {
+        // flat offsets of the four neighbours used by the interpolation
+        std::array<std::size_t, 4> pos = {0, 0, 0, 0};
+        // interpolation weight of each neighbour, in the same order as pos
+        std::array<float, 4> w = {0.0f, 0.0f, 0.0f, 0.0f};
+    };
+    // Precomputes, for every index of comp_s, the four neighbour offsets and
+    // bilinear weights of one sampling point.
+    //   dims:          the two extents the sample coordinates are clamped to;
+    //                  dims[1] is also the row stride of the flat offsets
+    //   comp_s:        iteration space; indices 0-1 select the output bin and
+    //                  indices 2-3 the sample inside the bin
+    //   roi_start:     start coordinate of the roi along each of the two axes
+    //   bin_size:      size of one output bin along each axis
+    //   bin_grid_size: number of samples per bin along each axis
+    // Returns a vector indexed by comp_s.index(idx).
+    auto calc_pos_weight(const std::array<std::size_t, 2>& dims,
+                         const shape& comp_s,
+                         const std::array<float, 2>& roi_start,
+                         const std::array<float, 2>& bin_size,
+                         const std::array<std::size_t, 2>& bin_grid_size) const
+    {
+        std::vector<pos_weight> results(bin_grid_size[0] * bin_grid_size[1] * output_height *
+                                        output_width);
+        shape_for_each(comp_s, [&](auto idx) {
+            std::array<std::size_t, 2> p = {idx[0], idx[1]};
+            std::array<std::size_t, 2> i = {idx[2], idx[3]};
+            auto index                   = comp_s.index(idx);
+            std::array<float, 2> xy{};
+            std::array<int64_t, 2> low{};
+            std::array<int64_t, 2> high{};
+            for(auto ii : range(p.size()))
+            {
+                // Sample coordinate: bin origin plus the centre of sample i
+                // inside the bin.
+                xy[ii] = roi_start[ii] + p[ii] * bin_size[ii] +
+                         (i[ii] + .5f) * bin_size[ii] / bin_grid_size[ii];
+                xy[ii] = (coord_trans_mode == "output_half_pixel") ? (xy[ii] - 0.5f) : xy[ii];
+                // A sample outside the valid range contributes nothing: store
+                // an all-zero entry and leave the lambda for this index.
+                if(xy[ii] < -1.0 or xy[ii] > dims[ii])
+                {
+                    results[index] = pos_weight{};
+                    return;
+                }
+                xy[ii]   = std::max(xy[ii], 0.0f);
+                // Float to integer conversion truncates; xy is non-negative
+                // here, so low is the floor of the coordinate.
+                low[ii]  = xy[ii];
+                high[ii] = low[ii] + 1;
+                // At the last row/column both neighbours collapse onto it.
+                if(low[ii] >= dims[ii] - 1)
+                {
+                    xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
+                }
+            }
+            // Flat offsets of the four neighbours: (low,low), (low,high),
+            // (high,low), (high,high).
+            results[index].pos = {low[0] * dims[1] + low[1],
+                                  low[0] * dims[1] + high[1],
+                                  high[0] * dims[1] + low[1],
+                                  high[0] * dims[1] + high[1]};
+            // Fractional distances to the low neighbour and their complements.
+            float ly = xy[0] - low[0];
+            float lx = xy[1] - low[1];
+            float hy = 1.0f - ly;
+            float hx = 1.0f - lx;
+            // save weights and indices
+            results[index].w = {hy * hx, hy * lx, ly * hx, ly * lx};
+        });
+        return results;
+    }
+    // Reduction policy for max pooling, used as the Op argument of calc_pooling.
+    struct max_pool
+    {
+        // starting value: the lowest finite double, so any input replaces it
+        double init() { return std::numeric_limits<double>::lowest(); }
+        // combine step: keep the larger of the two values
+        double operator()(double x, double y) { return std::max(x, y); }
+        // finalization: the element count is ignored, the maximum is returned as is
+        double final(double x, std::size_t) { return (x); }
+    };
+    // Reduction policy for average pooling, used as the Op argument of calc_pooling.
+    struct avg_pool
+    {
+        // starting value of the running sum
+        double init() { return 0.0; }
+        // combine step: accumulate the sum
+        double operator()(double x, double y) { return x + y; }
+        // finalization: divide the sum x by the count y; a zero count yields 0.0
+        // instead of dividing by zero
+        double final(double x, std::size_t y) { return (y == 0) ? 0.0 : (x / y); }
+    };
+    // Reduce all sampling points of one output bin into a single value.
+    //
+    // data          - iterator/pointer to the first element of the plane being
+    //                 sampled; pos_weight::pos values are added to it
+    // bin_grid_size - number of samples per bin in each dimension
+    // pos_weights   - precomputed neighbor offsets and weights
+    // index         - position in pos_weights of the first sample of this bin
+    // op            - reduction policy providing init(), operator() and final()
+    //
+    // Returns the reduced value together with the index advanced past the
+    // bin_grid_size[0] * bin_grid_size[1] entries consumed by this bin.
+    template <class T, class Op>
+    std::tuple<double, int64_t> calc_pooling(const T& data,
+                                             const std::array<std::size_t, 2>& bin_grid_size,
+                                             const std::vector<pos_weight>& pos_weights,
+                                             int64_t index,
+                                             Op op) const
+    {
+        double output_val   = op.init();
+        const int64_t count = bin_grid_size[0] * bin_grid_size[1];
+        dfor(bin_grid_size[0], bin_grid_size[1])([&](auto, auto) {
+            const auto& pc = pos_weights[index];
+            // weighted value of each of the four neighbors of this sample
+            std::array<double, 4> wv;
+            std::transform(
+                pc.w.begin(), pc.w.end(), pc.pos.begin(), wv.begin(), [&](auto w, auto pos) {
+                    return *(data + pos) * w;
+                });
+            // op is folded over the four weighted neighbors individually, not over
+            // their sum
+            output_val = std::accumulate(wv.begin(), wv.end(), output_val, op);
+            index += 1;
+        });
+        output_val = op.final(output_val, count);
+        return {output_val, index};
+    }
+    // Evaluate the operator: for every ROI, precompute the interpolation
+    // positions/weights once and reuse them to pool each channel of the
+    // selected input batch into output(n, c, ph, pw).
+    //
+    // output_shape - shape of the result; lens are read as {rois, channels, height, width}
+    // args         - args[0] is the input x, args[1] the ROIs, args[2] the batch
+    //                index of each ROI
+    argument compute(const shape& output_shape, std::vector<argument> args) const
+    {
+        argument result{output_shape};
+        const auto& out_lens = output_shape.lens();
+        int64_t n_rois       = out_lens[0];
+        std::size_t channels = out_lens[1];
+        // output dims of height and width, in all 2-dim arrays, the first dim
+        // is for height and second dim is for width
+        std::array<std::size_t, 2> out_dims = {out_lens[2], out_lens[3]};
+        const auto& x_lens                  = args.at(0).get_shape().lens();
+        // input dims of height and width
+        std::array<std::size_t, 2> in_dims = {x_lens[2], x_lens[3]};
+        auto roi_s                         = args.at(1).get_shape();
+        visit_all(result, args.at(0), args.at(1))([&](auto output, auto x, auto roi) {
+            // NOTE(review): assumes args[2] stores 64-bit integer batch indices - confirm
+            const auto* batch_indices = args.at(2).cast<int64_t>();
+            par_for(n_rois, [&](auto n) {
+                const auto bottom_data   = x.begin();
+                const auto roi_batch_ind = batch_indices[n];
+                // Do not use rounding; this implementation detail is critical
+                // ROI columns 1 and 0 give the {height, width} start, columns 3 and 2
+                // the end (presumably an x1, y1, x2, y2 layout - confirm)
+                std::array<float, 2> roi_starts = {
+                    static_cast<float>(roi[roi_s.index({n, 1})] * spatial_scale),
+                    static_cast<float>(roi[roi_s.index({n, 0})] * spatial_scale)};
+                std::array<float, 2> roi_ends = {
+                    static_cast<float>(roi[roi_s.index({n, 3})] * spatial_scale),
+                    static_cast<float>(roi[roi_s.index({n, 2})] * spatial_scale)};
+                // Force malformed ROIs to be 1x1
+                std::array<float, 2> roi_size{};
+                std::array<float, 2> bin_size{};
+                std::array<std::size_t, 2> bin_grid_size{};
+                for(auto ii : range(roi_size.size()))
+                {
+                    roi_size[ii] = roi_ends[ii] - roi_starts[ii];
+                    roi_size[ii] = std::max(roi_size[ii], 1.0f);
+                    bin_size[ii]      = roi_size[ii] / out_dims[ii];
+                    // a positive sampling_ratio fixes the samples per bin; otherwise
+                    // the count is derived from the bin size
+                    bin_grid_size[ii] = (sampling_ratio > 0)
+                                            ? sampling_ratio
+                                            : std::ceil(roi_size[ii] / out_dims[ii]);
+                }
+                // we want to precalculate indices and weights shared by all channels,
+                // this is the key point of optimization
+                std::vector<std::size_t> comp_lens = {
+                    out_dims[0], out_dims[1], bin_grid_size[0], bin_grid_size[1]};
+                shape comp_s{shape::float_type, comp_lens};
+                auto pre_calc =
+                    this->calc_pos_weight(in_dims, comp_s, roi_starts, bin_size, bin_grid_size);
+                std::vector<std::size_t> comp_lens1 = {channels, out_dims[0], out_dims[1]};
+                shape comp_s1{migraphx::shape::float_type, comp_lens1};
+                // per-channel running position in pre_calc; calc_pooling returns the
+                // advanced position after each output bin
+                std::vector<int64_t> vec_index(channels, 0);
+                shape_for_each(comp_s1, [&](auto idx) {
+                    auto c  = idx[0];
+                    auto ph = idx[1];
+                    auto pw = idx[2];
+                    // start of the (batch, channel) plane of the input
+                    const auto offset_bottom_data =
+                        bottom_data + static_cast<int64_t>((roi_batch_ind * channels + c) *
+                                                           in_dims[0] * in_dims[1]);
+                    double output_val;
+                    // any mode other than average is handled as max pooling
+                    std::tie(output_val, vec_index[c]) =
+                        (mode == migraphx::op::pooling_mode::average)
+                            ? this->calc_pooling(offset_bottom_data,
+                                                 bin_grid_size,
+                                                 pre_calc,
+                                                 vec_index[c],
+                                                 avg_pool{})
+                            : this->calc_pooling(offset_bottom_data,
+                                                 bin_grid_size,
+                                                 pre_calc,
+                                                 vec_index[c],
+                                                 max_pool{});
+                    output(n, c, ph, pw) = output_val;
+                });
+            });
+        });
+        return result;
+    }
+};
+} // namespace op
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/include/migraphx/op/scalar.hpp
+++ b/src/include/migraphx/op/scalar.hpp
@@ -2,13 +2,13 @@
 #define MIGRAPHX_GUARD_OPERATORS_SCALAR_HPP
 #include <array>
-#include <migraphx/operation.hpp>
 #include <migraphx/check_shapes.hpp>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/streamutils.hpp>
 #include <migraphx/literal.hpp>
 #include <migraphx/shape_for_each.hpp>
 #include <migraphx/config.hpp>
+#include <migraphx/lifetime.hpp>
 #include <cmath>
 #include <utility>
@@ -30,7 +30,7 @@ struct scalar
    shape compute_shape(std::vector<shape> inputs) const
    {
-        assert(check_shapes{inputs}.has(1).only_dims(1).size() == 1);
+        check_shapes{inputs, *this}.has(1).only_dims(1).nelements(1);
        auto t = inputs.at(0).type();
        std::vector<std::size_t> strides(scalar_bcast_lens.size(), 0);
        return {t, scalar_bcast_lens, strides};
@@ -38,7 +38,7 @@ struct scalar
    argument compute(shape output_shape, std::vector<argument> args) const
    {
-        return {std::move(output_shape), std::move(args.at(0).data)};
+        return args[0].reshape(output_shape);
    }
    std::ptrdiff_t output_alias(const std::vector<shape>&) const { return 0; }
 };

--- a/src/include/migraphx/op/scatter.hpp
+++ b/src/include/migraphx/op/scatter.hpp
+#ifndef MIGRAPHX_GUARD_OPERATORS_SCATTER_HPP
+#define MIGRAPHX_GUARD_OPERATORS_SCATTER_HPP
+#include <array>
+#include <migraphx/check_shapes.hpp>
+#include <migraphx/stringutils.hpp>
+#include <migraphx/streamutils.hpp>
+#include <migraphx/shape_for_each.hpp>
+#include <migraphx/config.hpp>
+#include <migraphx/value.hpp>
+#include <migraphx/op/name.hpp>
+#include <migraphx/op/normalize_attribute.hpp>
+#include <cmath>
+#include <utility>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace op {
+// The scatter operator copies the data tensor to the output and then, for each entry of an index
+// array, performs a reduction operation (add, multiply, or just set the data) between the matching
+// update element and the output element that the index selects along the axis.  We implement
+// it as a separate derived struct for each of the three reduction methods.  The related operator
+// scatterND is a generalization that works on a set of 3 tensors of different ranks.  The
+// complementary operations are gather/gatherND.
+//
+// This is a template for deriving child structs from.  Each child needs to define
+// only a reduction() method.  Names are automatically handled by the op_name template.
+// CRTP base for the scatter operators.  Derived must provide a reduction()
+// method returning a callable that takes (output element, update element).
+template <class Derived>
+struct scatter : op_name<Derived>
+{
+    // axis along which the index values select the output position
+    int64_t axis = 0;
+    // expose the axis attribute under the name "axis"
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.axis, "axis"));
+    }
+    // declare under "normalize_axes" that the "axis" attribute uses the
+    // include_min normalization setting
+    value attributes() const
+    {
+        value normalize;
+        normalize["axis"] = value::array{normalize_attribute::include_min};
+        return {{"normalize_axes", normalize}};
+    }
+    // requires exactly three inputs with standard shapes; the output shape is
+    // built from the lens of the first input
+    shape normalize_compute_shape(std::vector<shape> inputs) const
+    {
+        check_shapes{inputs, *this}.has(3).standard();
+        // If non-packed, this converts to a packed output while preserving permutation of tensor
+        return inputs.front().with_lens(inputs.front().lens());
+    }
+    // args[0] is the data, args[1] the indices and args[2] the updates.  The data
+    // is copied to the result, then every update element is combined into the
+    // result at the position whose axis coordinate is replaced by the index value.
+    argument compute(const shape& output_shape, std::vector<argument> args) const
+    {
+        argument result{output_shape};
+        auto& self = static_cast<const Derived&>(*this);
+        // size of the output along the scatter axis
+        auto axis_dim_size = output_shape.lens()[axis];
+        // cast all arguments as correct type
+        visit_all(result, args[0], args[2])([&](auto output, auto data, auto update) {
+            // copy all of data to output
+            std::copy(data.begin(), data.end(), output.begin());
+            args[1].visit([&](auto indices) {
+                auto ind_s = indices.get_shape();
+                // iterate through items in shape
+                shape_for_each(ind_s, [&](const auto& idx) {
+                    auto out_idx = idx;
+                    // Overloaded tensor_view::() invokes indexing logic of
+                    // std::size_t shape::index(std::size_t i) const
+                    // which handles nonstandard shapes correctly
+                    auto index = indices(idx.begin(), idx.end());
+                    // normalize negative indexes (may be redundant after using
+                    // normalize_compute_shape())
+                    // NOTE(review): the index value is not range-checked against
+                    // axis_dim_size in this function
+                    index         = (index < 0) ? index + axis_dim_size : index;
+                    out_idx[axis] = index;
+                    // look up the appropriate locations in output, using idx and out_idx.
+                    // call reduction() method of derived struct to copy and reduce that element
+                    self.reduction()(output(out_idx.begin(), out_idx.end()),
+                                     update(idx.begin(), idx.end()));
+                });
+            });
+        });
+        return result;
+    }
+};
+} // namespace op
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif