Commit 7e297b13 authored by Paul

Merge

parents 86ea5e91 aa7ff911
#ifndef MIGRAPHX_GUARD_OPERATORS_ROIALIGN_HPP
#define MIGRAPHX_GUARD_OPERATORS_ROIALIGN_HPP
#include <limits>
#include <migraphx/check_shapes.hpp>
#include <migraphx/op/common.hpp>
#include <migraphx/config.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/shape_for_each.hpp>
#include <cmath>
#include <numeric>
#include <utility>
#include <vector>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
// RoiAlign: for each region of interest (ROI) over a 4-D NCHW feature map,
// pool the region into a fixed output_height x output_width tile using
// bilinear sampling. Presumably mirrors the ONNX RoiAlign operator --
// confirm against the ONNX spec.
struct roialign
{
// coordinate transformation mode; "output_half_pixel" shifts sample
// coordinates by -0.5 (see calc_pos_weight)
std::string coord_trans_mode = "half_pixel";
// reduction applied over the samples in each output bin (average or max)
pooling_mode mode = {pooling_mode::average};
// height/width of each pooled output tile
int64_t output_height = 1;
int64_t output_width = 1;
// samples per bin along each axis; <= 0 means derive from the bin size
int64_t sampling_ratio = 0;
// multiplier mapping ROI coordinates onto the feature-map coordinate space
float spatial_scale = 1.0f;
// reflection of the operator's serializable attributes
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.coord_trans_mode, "coordinate_transformation_mode"),
f(self.mode, "mode"),
f(self.output_height, "output_height"),
f(self.output_width, "output_width"),
f(self.sampling_ratio, "sampling_ratio"),
f(self.spatial_scale, "spatial_scale"));
}
std::string name() const { return "roialign"; }
// Validates the three inputs (feature map x, rois, batch indices) and
// returns {num_rois, channels, output_height, output_width} in x's type.
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(3);
auto x_lens = inputs.at(0).lens();
auto roi_lens = inputs.at(1).lens();
auto bi_lens = inputs.at(2).lens();
auto type = inputs.at(0).type();
// check inputs are well-formed
if(bi_lens.size() != 1)
{
MIGRAPHX_THROW("ROIALIGN: batch indices should be 1 dimension!");
}
if(roi_lens.size() != 2 or roi_lens.at(1) != 4)
{
MIGRAPHX_THROW(
"ROIALIGN: rois should be 2 dimensions, and the second dim should be 4!");
}
if(roi_lens.front() != bi_lens.front())
{
MIGRAPHX_THROW("ROIALIGN: rois and batch indices inputs should have the same number!");
}
// one output tile per ROI; channel count is preserved from x
std::vector<std::size_t> out_lens = x_lens;
out_lens[0] = roi_lens[0];
out_lens[2] = output_height;
out_lens[3] = output_width;
return {type, out_lens};
}
// Precomputed bilinear-interpolation data for one sample point
struct pos_weight
{
// neighbor indices for the bilinear interpolation
std::array<std::size_t, 4> pos = {0, 0, 0, 0};
// neighbor weights for the bilinear interpolation
std::array<float, 4> w = {0.0f, 0.0f, 0.0f, 0.0f};
};
// Precomputes, for every (output bin, sample point) pair, the four
// neighboring input offsets and their bilinear weights; the result is
// reused by every channel (see compute()). dims is the input
// height/width, comp_s iterates {ph, pw, iy, ix}, roi_start/bin_size are
// in (y, x) order, bin_grid_size is the sample count per bin per axis.
auto calc_pos_weight(const std::array<std::size_t, 2>& dims,
const shape& comp_s,
const std::array<float, 2>& roi_start,
const std::array<float, 2>& bin_size,
const std::array<std::size_t, 2>& bin_grid_size) const
{
std::vector<pos_weight> results(bin_grid_size[0] * bin_grid_size[1] * output_height *
output_width);
shape_for_each(comp_s, [&](auto idx) {
// p = output bin (ph, pw); i = sample point within the bin (iy, ix)
std::array<std::size_t, 2> p = {idx[0], idx[1]};
std::array<std::size_t, 2> i = {idx[2], idx[3]};
auto index = comp_s.index(idx);
std::array<float, 2> xy{};
std::array<int64_t, 2> low{};
std::array<int64_t, 2> high{};
for(auto ii : range(p.size()))
{
// continuous sample coordinate inside the ROI
xy[ii] = roi_start[ii] + p[ii] * bin_size[ii] +
(i[ii] + .5f) * bin_size[ii] / bin_grid_size[ii];
xy[ii] = (coord_trans_mode == "output_half_pixel") ? (xy[ii] - 0.5f) : xy[ii];
// samples fully outside the input contribute zero weight
// (default-constructed pos_weight)
if(xy[ii] < -1.0 or xy[ii] > dims[ii])
{
results[index] = pos_weight{};
return;
}
xy[ii] = std::max(xy[ii], 0.0f);
low[ii] = xy[ii];
high[ii] = low[ii] + 1;
// clamp to the last valid row/column
if(low[ii] >= dims[ii] - 1)
{
xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
}
}
// flattened offsets of the 4 neighbors: (lo,lo) (lo,hi) (hi,lo) (hi,hi)
results[index].pos = {low[0] * dims[1] + low[1],
low[0] * dims[1] + high[1],
high[0] * dims[1] + low[1],
high[0] * dims[1] + high[1]};
// fractional distances to the low neighbors
float ly = xy[0] - low[0];
float lx = xy[1] - low[1];
float hy = 1.0f - ly;
float hx = 1.0f - lx;
// save weights and indices
results[index].w = {hy * hx, hy * lx, ly * hx, ly * lx};
});
return results;
}
// Reduction policy: running maximum over a bin's samples
struct max_pool
{
double init() { return std::numeric_limits<double>::lowest(); }
double operator()(double x, double y) { return std::max(x, y); }
double final(double x, std::size_t) { return (x); }
};
// Reduction policy: arithmetic mean over a bin's samples
struct avg_pool
{
double init() { return 0.0; }
double operator()(double x, double y) { return x + y; }
double final(double x, std::size_t y) { return (y == 0) ? 0.0 : (x / y); }
};
// Reduces one output bin: bilinearly interpolates each of the bin's
// bin_grid_size[0] x bin_grid_size[1] samples from its 4 precomputed
// neighbors and folds them with op. data is an iterator at the start of
// the channel's plane; index is the running position in pos_weights and
// its advanced value is returned alongside the pooled result.
template <class T, class Op>
std::tuple<double, int64_t> calc_pooling(const T& data,
const std::array<std::size_t, 2>& bin_grid_size,
const std::vector<pos_weight>& pos_weights,
int64_t index,
Op op) const
{
double output_val = op.init();
const int64_t count = bin_grid_size[0] * bin_grid_size[1];
dfor(bin_grid_size[0], bin_grid_size[1])([&](auto, auto) {
const auto& pc = pos_weights[index];
std::array<double, 4> wv;
// weighted contribution of each of the 4 neighbors
std::transform(
pc.w.begin(), pc.w.end(), pc.pos.begin(), wv.begin(), [&](auto w, auto pos) {
return *(data + pos) * w;
});
output_val = std::accumulate(wv.begin(), wv.end(), output_val, op);
index += 1;
});
output_val = op.final(output_val, count);
return {output_val, index};
}
// For each ROI (parallelized over ROIs): precompute the sample
// positions/weights once, then pool every channel with them.
argument compute(const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
const auto& out_lens = output_shape.lens();
int64_t n_rois = out_lens[0];
std::size_t channels = out_lens[1];
// output dims of height and width, in all 2-dim arrays, the first dim
// is for height and second dim is for width
std::array<std::size_t, 2> out_dims = {out_lens[2], out_lens[3]};
const auto& x_lens = args.at(0).get_shape().lens();
// input dims of height and width
std::array<std::size_t, 2> in_dims = {x_lens[2], x_lens[3]};
auto roi_s = args.at(1).get_shape();
visit_all(result, args.at(0), args.at(1))([&](auto output, auto x, auto roi) {
const auto* batch_indices = args.at(2).cast<int64_t>();
par_for(n_rois, [&](auto n) {
const auto bottom_data = x.begin();
const auto roi_batch_ind = batch_indices[n];
// Do not use rounding; this implementation detail is critical.
// ROI columns are reordered so component 0 is the height (y)
// axis: starts read columns (1, 0), ends read columns (3, 2).
std::array<float, 2> roi_starts = {
static_cast<float>(roi[roi_s.index({n, 1})] * spatial_scale),
static_cast<float>(roi[roi_s.index({n, 0})] * spatial_scale)};
std::array<float, 2> roi_ends = {
static_cast<float>(roi[roi_s.index({n, 3})] * spatial_scale),
static_cast<float>(roi[roi_s.index({n, 2})] * spatial_scale)};
// Force malformed ROIs to be 1x1
std::array<float, 2> roi_size{};
std::array<float, 2> bin_size{};
std::array<std::size_t, 2> bin_grid_size{};
for(auto ii : range(roi_size.size()))
{
roi_size[ii] = roi_ends[ii] - roi_starts[ii];
roi_size[ii] = std::max(roi_size[ii], 1.0f);
bin_size[ii] = roi_size[ii] / out_dims[ii];
bin_grid_size[ii] = (sampling_ratio > 0)
? sampling_ratio
: std::ceil(roi_size[ii] / out_dims[ii]);
}
// we want to precalculate indices and weights shared by all channels,
// this is the key point of optimization
std::vector<std::size_t> comp_lens = {
out_dims[0], out_dims[1], bin_grid_size[0], bin_grid_size[1]};
shape comp_s{shape::float_type, comp_lens};
auto pre_calc =
this->calc_pos_weight(in_dims, comp_s, roi_starts, bin_size, bin_grid_size);
// iterate (channel, ph, pw); vec_index tracks each channel's
// running position in pre_calc (all channels share one table)
std::vector<std::size_t> comp_lens1 = {channels, out_dims[0], out_dims[1]};
shape comp_s1{migraphx::shape::float_type, comp_lens1};
std::vector<int64_t> vec_index(channels, 0);
shape_for_each(comp_s1, [&](auto idx) {
auto c = idx[0];
auto ph = idx[1];
auto pw = idx[2];
// start of this (batch, channel) plane in the input
const auto offset_bottom_data =
bottom_data + static_cast<int64_t>((roi_batch_ind * channels + c) *
in_dims[0] * in_dims[1]);
double output_val;
std::tie(output_val, vec_index[c]) =
(mode == migraphx::op::pooling_mode::average)
? this->calc_pooling(offset_bottom_data,
bin_grid_size,
pre_calc,
vec_index[c],
avg_pool{})
: this->calc_pooling(offset_bottom_data,
bin_grid_size,
pre_calc,
vec_index[c],
max_pool{});
output(n, c, ph, pw) = output_val;
});
});
});
return result;
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
......@@ -40,7 +40,6 @@ struct scalar
{
return args[0].reshape(output_shape);
}
lifetime get_lifetime() const { return lifetime::borrow; }
std::ptrdiff_t output_alias(const std::vector<shape>&) const { return 0; }
};
......
......@@ -8,6 +8,7 @@
#include <migraphx/shape_for_each.hpp>
#include <migraphx/config.hpp>
#include <migraphx/value.hpp>
#include <migraphx/op/name.hpp>
#include <migraphx/op/normalize_attribute.hpp>
#include <cmath>
#include <utility>
......@@ -16,7 +17,17 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
struct scatter
// The scatter operator fetches a subset of data given by an index array and then performs a
// reduction operation (add, multiply, or just set the data) on each element returned. We implement
// it as a separate derived struct for each of the three reduction methods. The related operator
// scatterND is a generalization that works on a set of 3 tensors of different ranks. The
// complementary operations are gather/gatherND.
//
// This is a template for deriving child structs from. Each child needs to define
// only a reduction() method. Names are automatically handled by the op_name template.
template <class Derived>
struct scatter : op_name<Derived>
{
int64_t axis = 0;
......@@ -33,29 +44,44 @@ struct scatter
return {{"normalize_axes", normalize}};
}
std::string name() const { return "scatter"; }
shape normalize_compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(3).standard();
return inputs.front();
// If non-packed, this converts to a packed output while preserving permutation of tensor
return inputs.front().with_lens(inputs.front().lens());
}
argument compute(const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
// max dimension in axis
auto& self = static_cast<const Derived&>(*this);
// max dimension in each axis
auto axis_dim_size = output_shape.lens()[axis];
// cast all arguments as correct type
visit_all(result, args[0], args[2])([&](auto output, auto data, auto update) {
// copy all of data to output
std::copy(data.begin(), data.end(), output.begin());
args[1].visit([&](auto indices) {
auto ind_s = indices.get_shape();
// iterate through items in shape
shape_for_each(ind_s, [&](const auto& idx) {
auto out_idx = idx;
auto index = indices[ind_s.index(idx)];
auto out_idx = idx;
// Overloaded tensor_view::() invokes indexing logic of
// std::size_t shape::index(std::size_t i) const
// which handles nonstandard shapes correctly
auto index = indices(idx.begin(), idx.end());
// normalize negative indexes (may be redundant after using
// normalize_compute_shape())
index = (index < 0) ? index + axis_dim_size : index;
out_idx[axis] = index;
output[output_shape.index(out_idx)] = update[ind_s.index(idx)];
// look up the appropriate locations in output, using idx and out_idx.
// call reduction() method of derived struct to copy and reduce that element
self.reduction()(output(out_idx.begin(), out_idx.end()),
update(idx.begin(), idx.end()));
});
});
});
......
#ifndef MIGRAPHX_GUARD_OPERATORS_SCATTER_ADD_HPP
#define MIGRAPHX_GUARD_OPERATORS_SCATTER_ADD_HPP
#include <array>
#include <migraphx/check_shapes.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/streamutils.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/config.hpp>
#include <migraphx/value.hpp>
#include <migraphx/op/normalize_attribute.hpp>
#include <cmath>
#include <utility>
#include <migraphx/op/scatter.hpp>
// Scatter op. with "add" function as reduction.
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
// Scatter with additive reduction: each update value is accumulated into
// the addressed output element.
struct scatter_add : scatter<scatter_add>
{
// Invoked by scatter<Derived>::compute() for every scattered element,
// playing the role of a virtual function. The sibling ops (scatter_mul,
// scatter_none) differ only in this callable.
auto reduction() const
{
return [](auto& out_elem, const auto& upd) { out_elem += upd; };
}
// the operator name ("scatter_add") is supplied by op_name<> via scatter<>
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_OPERATORS_SCATTER_MUL_HPP
#define MIGRAPHX_GUARD_OPERATORS_SCATTER_MUL_HPP
#include <array>
#include <migraphx/check_shapes.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/streamutils.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/config.hpp>
#include <migraphx/value.hpp>
#include <migraphx/op/normalize_attribute.hpp>
#include <cmath>
#include <utility>
#include <migraphx/op/scatter.hpp>
// Scatter op. with "multiply" as the reduction function.
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
// Scatter with multiplicative reduction: each update value is multiplied
// into the addressed output element.
struct scatter_mul : scatter<scatter_mul>
{
// Invoked by scatter<Derived>::compute() for every scattered element,
// playing the role of a virtual function. The sibling ops (scatter_add,
// scatter_none) differ only in this callable.
auto reduction() const
{
return [](auto& out_elem, const auto& upd) { out_elem *= upd; };
}
// the operator name ("scatter_mul") is supplied by op_name<> via scatter<>
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_OPERATORS_SCATTER_NONE_HPP
#define MIGRAPHX_GUARD_OPERATORS_SCATTER_NONE_HPP
#include <array>
#include <migraphx/check_shapes.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/streamutils.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/config.hpp>
#include <migraphx/value.hpp>
#include <migraphx/op/normalize_attribute.hpp>
#include <migraphx/op/scatter.hpp>
#include <cmath>
#include <utility>
// Scatter op. with "none" as the reduction function (just copies the value). This is identical to
// the previously existing Scatter op.
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
// Scatter with no reduction: the update value simply overwrites the
// addressed output element (the behavior of the original Scatter op).
struct scatter_none : scatter<scatter_none>
{
// Invoked by scatter<Derived>::compute() for every scattered element,
// playing the role of a virtual function. The sibling ops (scatter_add,
// scatter_mul) differ only in this callable.
auto reduction() const
{
return [](auto& out_elem, const auto& upd) { out_elem = upd; };
}
// the operator name ("scatter_none") is supplied by op_name<> via scatter<>
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_OPERATORS_SCATTERND_ADD_HPP
#define MIGRAPHX_GUARD_OPERATORS_SCATTERND_ADD_HPP
#include <migraphx/op/scatternd_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
// ScatterND with additive reduction: each update value is added into the
// output element addressed by its index tuple.
struct scatternd_add : scatternd_op<scatternd_add>
{
// prefer = default over a user-provided empty body (modernize-use-equals-default)
scatternd_add() = default;
// Called by scatternd_op<Derived>::compute() for every update element.
auto reduction() const
{
return [](auto& x, const auto& y) { x += y; };
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_OPERATORS_SCATTERND_MUL_HPP
#define MIGRAPHX_GUARD_OPERATORS_SCATTERND_MUL_HPP
#include <migraphx/op/scatternd_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
// ScatterND with multiplicative reduction: each update value is multiplied
// into the output element addressed by its index tuple.
struct scatternd_mul : scatternd_op<scatternd_mul>
{
// prefer = default over a user-provided empty body (modernize-use-equals-default)
scatternd_mul() = default;
// Called by scatternd_op<Derived>::compute() for every update element.
auto reduction() const
{
return [](auto& x, const auto& y) { x *= y; };
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_OPERATORS_SCATTERND_NONE_HPP
#define MIGRAPHX_GUARD_OPERATORS_SCATTERND_NONE_HPP
#include <migraphx/op/scatternd_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
// ScatterND with no reduction: the update value overwrites the output
// element addressed by its index tuple.
struct scatternd_none : scatternd_op<scatternd_none>
{
// prefer = default over a user-provided empty body (modernize-use-equals-default)
scatternd_none() = default;
// Called by scatternd_op<Derived>::compute() for every update element.
auto reduction() const
{
return [](auto& x, const auto& y) { x = y; };
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_OPERATORS_SCATTERND_OP_HPP
#define MIGRAPHX_GUARD_OPERATORS_SCATTERND_OP_HPP
#include <migraphx/op/name.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/par_for.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
// CRTP base for the scatterND family (scatternd_add / scatternd_mul /
// scatternd_none). Each derived struct supplies a reduction() callable that
// combines an update value into the addressed output element; the operator
// name is produced by op_name<Derived>.
template <class Derived>
struct scatternd_op : op_name<Derived>
{
// Inputs: data (rank r), indices (rank q, last dim of size k), updates.
// Requires updates.lens == indices.lens[0:q-1] ++ data.lens[k:r-1].
// Output has data's shape.
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(3);
auto r = inputs.front().lens().size();
auto q = inputs.at(1).lens().size();
auto k = inputs.at(1).lens().back();
auto ind_lens = inputs.at(1).lens();
auto upd_lens = inputs.back().lens();
auto data_lens = inputs.front().lens();
// an index tuple addresses a slice of data, so it cannot exceed r
if(k > r)
MIGRAPHX_THROW("ScatterND: index of size " + std::to_string(k) +
" is too large for tensor of rank " + std::to_string(r));
if(not(std::equal(ind_lens.begin(), ind_lens.begin() + q - 1, upd_lens.begin()) and
std::equal(data_lens.begin() + k, data_lens.end(), upd_lens.begin() + q - 1)))
MIGRAPHX_THROW("ScatterND: incorrect update shape. update.lens != indices.lens[0:q-1] "
"++ data.lens[k:r-1]");
// keep a packed layout when the input is not broadcasted
auto s = inputs.front();
if(s.broadcasted())
{
return {s.type(), s.lens()};
}
else
{
return s.with_lens(s.lens());
}
}
// Copies data into the output, then for every element of updates applies
// the derived reduction at the output location its index tuple addresses.
// NOTE(review): updates are applied under par_for; duplicate index tuples
// would race. Presumably indices are required to be unique -- confirm
// against the ONNX ScatterND contract.
argument compute(const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
auto& self = static_cast<const Derived&>(*this);
visit_all(result, args[0], args[2])([&](auto output, auto data, auto updates) {
// start from a copy of data; scattered elements are reduced in place
std::copy(data.begin(), data.end(), output.begin());
args[1].visit([&](auto indices) {
auto updates_shape = updates.get_shape();
// standard-layout view used to convert flat i -> multi-index
auto updates_std = shape{updates_shape.type(), updates_shape.lens()};
auto indices_shape = indices.get_shape();
auto k = indices_shape.lens().back();
auto q = indices_shape.lens().size();
auto r = output_shape.lens().size();
par_for(updates_shape.elements(), [&](const auto i) {
auto updates_idx = updates_std.multi(i);
// locate the k-tuple in indices for this update element:
// its first q-1 coordinates, with the last coordinate 0
std::vector<std::size_t> indices_idx(q, 0);
std::copy(
updates_idx.begin(), updates_idx.begin() + q - 1, indices_idx.begin());
auto index_start = indices.begin() +
indices_shape.index(indices_idx.begin(), indices_idx.end());
auto index_end = index_start + k;
// output coordinate = index tuple ++ trailing update coords
std::vector<std::size_t> out_idx(r, 0);
std::copy(index_start, index_end, out_idx.begin());
std::copy(updates_idx.begin() + q - 1, updates_idx.end(), out_idx.begin() + k);
self.reduction()(output[output_shape.index(out_idx)], updates[i]);
});
});
});
return result;
}
// no state to initialize; presumably kept for interface symmetry -- TODO confirm
auto init() const {}
scatternd_op() {}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
......@@ -18,6 +18,7 @@ namespace op {
struct sigmoid : unary<sigmoid>
{
std::string point_op() const { return "1.f / (1.f + ${function:exp}(-${0}))"; }
auto apply() const
{
return [](auto x) { return 1.f / (1.f + std::exp(-x)); };
......
......@@ -18,6 +18,7 @@ namespace op {
struct sign : unary<sign>
{
std::string point_op() const { return "(${0} > 0 ? 1 : ((${0} < 0) ? -1 : 0))"; }
auto apply() const
{
return [](auto x) { return (x > 0 ? 1 : ((x < 0) ? -1 : 0)); };
......
......@@ -37,48 +37,53 @@ struct squeeze
std::string name() const { return "squeeze"; }
shape normalize_compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(1).standard();
check_shapes{inputs, *this}.has(1);
auto input_shape = inputs[0];
auto type = input_shape.type();
auto old_lens = input_shape.lens();
auto old_strides = input_shape.strides();
if(std::any_of(axes.begin(), axes.end(), [&](auto axis) { return old_lens[axis] != 1; }))
{
MIGRAPHX_THROW("squeeze axis dimension should be equal to 1");
}
std::vector<std::size_t> new_lens;
std::vector<std::size_t> new_strides;
if(axes.empty())
{
std::copy_if(old_lens.begin(),
old_lens.end(),
std::back_inserter(new_lens),
[](auto len) { return len != 1; });
for(auto i : range(old_lens.size()))
{
if(old_lens[i] != 1)
{
new_lens.push_back(old_lens[i]);
new_strides.push_back(old_strides[i]);
}
}
}
else
{
for(std::size_t i = 0; i < old_lens.size(); i++)
for(auto i : range(old_lens.size()))
{
if(std::find(axes.begin(), axes.end(), i) == axes.end())
{
new_lens.push_back(old_lens[i]);
new_strides.push_back(old_strides[i]);
}
}
}
if(new_lens.empty())
{
return shape{type};
}
else
{
return shape{type, new_lens};
return shape{type, new_lens, new_strides};
}
}
argument compute(shape output_shape, std::vector<argument> args) const
{
return args[0].reshape(output_shape);
}
lifetime get_lifetime() const { return lifetime::borrow; }
std::ptrdiff_t output_alias(const std::vector<shape>&) const { return 0; }
};
......
......@@ -72,8 +72,6 @@ struct step
return args[0].reshape(output_shape);
}
lifetime get_lifetime() const { return lifetime::borrow; }
std::ptrdiff_t output_alias(const std::vector<shape>&) const { return 0; }
};
......
#ifndef MIGRAPHX_GUARD_OPERATORS_GATHER_HPP
#define MIGRAPHX_GUARD_OPERATORS_GATHER_HPP
#include <algorithm>
#include <migraphx/check_shapes.hpp>
#include <migraphx/config.hpp>
#include <migraphx/op/normalize_attribute.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/value.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
// Top-k: along the given axis, select the k largest (or smallest with
// largest=false) elements. Produces a tuple of two outputs: the selected
// values and their int64 source indices along the axis, ordered best-first.
struct topk
{
// number of elements to keep along the axis
int64_t k = 1;
// axis to select along; negative values normalized via attributes()
int64_t axis = 0;
// true -> keep largest values, false -> keep smallest
bool largest = true;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.k, "k"), f(self.axis, "axis"), f(self.largest, "largest"));
}
// request framework normalization of negative axis values
value attributes() const
{
value normalize;
normalize["axis"] = value::array{normalize_attribute::include_min};
return {{"normalize_axes", normalize}};
}
std::string name() const { return "topk"; }
// Output dims equal the input dims with lens[axis] replaced by k; the
// result is a two-element tuple shape (values, indices).
shape normalize_compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(1).standard();
auto lens = inputs.at(0).lens();
auto type = inputs.at(0).type();
lens[axis] = k;
shape s_val{type, lens};
shape s_ind{shape::int64_type, lens};
return {{s_val, s_ind}};
}
// Fixed-size heap of the k best candidates seen so far. compare orders
// "better" candidates first, so std::make_heap on compare leaves the
// WORST kept candidate at the front, ready to be displaced.
template <class T, class Compare>
struct heap_vector
{
std::vector<T> data;
Compare compare;
heap_vector(const std::vector<T>& val, Compare comp) : data(val), compare(std::move(comp))
{
std::make_heap(data.begin(), data.end(), compare);
}
// replace the worst kept candidate when val ranks ahead of it
void try_push(T val)
{
if(not compare(val, data.front()))
return;
std::pop_heap(data.begin(), data.end(), compare);
data.back() = val;
std::push_heap(data.begin(), data.end(), compare);
}
// return the kept candidates ordered best-first
std::vector<T> sort()
{
auto sorted_data = data;
std::sort_heap(sorted_data.begin(), sorted_data.end(), compare);
return sorted_data;
}
};
// helper so the element type and comparator are deduced
template <class T, class Compare>
heap_vector<T, Compare> make_heap(std::vector<T> val, Compare compare) const
{
return {std::move(val), std::move(compare)};
}
argument compute(const shape& output_shape, std::vector<argument> args) const
{
auto vec_ss = output_shape.sub_shapes();
argument res_val{vec_ss.front()};
argument res_ind{vec_ss.back()};
auto in_s = args.front().get_shape();
auto out_s = vec_ss.front();
auto comp_lens = in_s.lens();
auto axis_dim = comp_lens[axis];
// iteration shape: one 1-D slice per position, axis collapsed to 1
comp_lens[axis] = 1;
shape comp_s{in_s.type(), comp_lens};
visit_all(res_val, args.front())([&](auto out_val, auto input) {
auto* out_ind = res_ind.cast<int64_t>();
// each parallel task selects top-k within one slice along axis
par_for(comp_s.elements(), [&](auto i) {
auto idx = comp_s.multi(i);
// seed the heap with the first k positions along the axis
std::vector<std::size_t> indices(k);
std::iota(indices.begin(), indices.end(), 0);
// ranks axis positions by their value; direction follows `largest`
auto comp = [&](auto i1, auto i2) {
auto idx1 = idx;
auto idx2 = idx;
idx1[axis] = i1;
idx2[axis] = i2;
return this->largest
? std::greater<>{}(input[in_s.index(idx1)], input[in_s.index(idx2)])
: std::less<>{}(input[in_s.index(idx1)], input[in_s.index(idx2)]);
};
auto hp = this->make_heap(indices, comp);
// stream the remaining axis positions through the heap
for(std::size_t ii = indices.size(); ii < axis_dim; ++ii)
{
hp.try_push(ii);
}
auto sorted_indices = hp.sort();
// write values and their source indices, best-first along axis
auto out_idx = idx;
auto in_idx = idx;
for(auto j : range(sorted_indices.size()))
{
out_idx[axis] = j;
in_idx[axis] = sorted_indices[j];
out_val[out_s.index(out_idx)] = input[in_s.index(in_idx)];
out_ind[out_s.index(out_idx)] = sorted_indices[j];
}
});
});
return {{res_val, res_ind}};
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
......@@ -21,7 +21,7 @@ struct transpose
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.dims, "dims"));
return pack(f(self.dims, "permutation"));
}
std::string name() const { return "transpose"; }
......@@ -32,31 +32,23 @@ struct transpose
auto input_lens = input.lens();
auto input_strides = input.strides();
auto t = input.type();
auto tuned_dims = dims;
// if not perm provided, reverse the dims
if(tuned_dims.empty())
{
tuned_dims.resize(input_lens.size());
std::iota(tuned_dims.begin(), tuned_dims.end(), 0);
std::reverse(tuned_dims.begin(), tuned_dims.end());
}
if(tuned_dims.size() != input_lens.size())
if(dims.size() != input_lens.size())
{
MIGRAPHX_THROW("Permutation has wrong number of axes");
}
std::vector<int64_t> axes(tuned_dims.size());
std::vector<int64_t> axes(dims.size());
std::iota(axes.begin(), axes.end(), 0);
if(!std::is_permutation(axes.begin(), axes.end(), tuned_dims.begin()))
if(!std::is_permutation(axes.begin(), axes.end(), dims.begin()))
{
MIGRAPHX_THROW("Invalid permutation");
MIGRAPHX_THROW("TRANSPOSE: Invalid permutation");
}
std::vector<size_t> output_lens(input_lens.size());
std::vector<size_t> output_strides(input_lens.size());
for(std::size_t i = 0; i < output_lens.size(); i++)
{
output_lens[i] = input_lens[tuned_dims[i]];
output_strides[i] = input_strides[tuned_dims[i]];
output_lens[i] = input_lens[dims[i]];
output_strides[i] = input_strides[dims[i]];
}
return {t, output_lens, output_strides};
}
......@@ -64,7 +56,6 @@ struct transpose
{
return args[0].reshape(output_shape);
}
lifetime get_lifetime() const { return lifetime::borrow; }
std::ptrdiff_t output_alias(const std::vector<shape>&) const { return 0; }
};
......
......@@ -41,7 +41,11 @@ struct unary : op_name<Derived>
{
check_shapes{inputs, static_cast<const Derived&>(*this)}.has(1);
auto s = inputs.at(0);
if(s.broadcasted())
if(s.scalar())
{
return s;
}
else if(s.broadcasted())
{
return {s.type(), s.lens()};
}
......@@ -60,7 +64,6 @@ struct unary : op_name<Derived>
input.end(),
output.begin(),
static_cast<const Derived&>(*this).apply());
});
});
return result;
......
......@@ -37,11 +37,11 @@ struct unsqueeze
std::string name() const { return "unsqueeze"; }
shape normalize_compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(1).standard_or_scalar();
check_shapes{inputs, *this}.has(1);
auto input_shape = inputs[0];
auto type = input_shape.type();
auto old_lens = input_shape.lens();
auto old_strides = input_shape.strides();
if(input_shape.scalar())
{
if(old_lens.size() == 1 and old_lens.front() == 1)
......@@ -53,25 +53,34 @@ struct unsqueeze
std::size_t new_size = old_lens.size() + axes.size();
std::vector<std::size_t> new_lens(new_size);
std::vector<std::size_t> new_strides(new_size);
std::size_t p = 0;
for(std::size_t i = 0; i < new_size; i++)
for(auto i : range(new_size))
{
if(std::find(axes.begin(), axes.end(), i) != axes.end())
{
new_lens[i] = 1;
if(p == 0) // unsqueeze on the first axes
{
new_strides[i] = old_lens[0] * old_strides[0];
}
else // unsqueeze on middle or last axes
{
new_strides[i] = (p < old_strides.size()) ? old_strides[p - 1] : 1;
}
}
else
{
new_lens[i] = old_lens[p++];
new_lens[i] = old_lens[p];
new_strides[i] = old_strides[p++];
}
}
return shape{type, new_lens};
return shape{type, new_lens, new_strides};
}
argument compute(shape output_shape, std::vector<argument> args) const
{
return args[0].reshape(output_shape);
}
lifetime get_lifetime() const { return lifetime::borrow; }
std::ptrdiff_t output_alias(const std::vector<shape>&) const { return 0; }
};
......
#ifndef MIGRAPHX_GUARD_OPERATORS_WHERE_HPP
#define MIGRAPHX_GUARD_OPERATORS_WHERE_HPP
#include <array>
#include <migraphx/argument.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/streamutils.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/config.hpp>
#include <migraphx/value.hpp>
#include <migraphx/op/normalize_attribute.hpp>
#include <cmath>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace op {
// Elementwise select: out[i] = condition[i] ? x[i] : y[i].
// Inputs: {condition, x, y}, all with the same dimensions.
struct where
{
std::string name() const { return "where"; }
// pointwise attribute allows this op to participate in pointwise fusion
value attributes() const { return {{"pointwise", true}, {"point_op", "${0} ? ${1} : ${2}"}}; }
// The result shape comes from the two data inputs (the condition only
// supplies the mask), preferring a packed / non-broadcast layout.
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(3).same_dims();
auto xs = inputs.at(1);
auto ys = inputs.at(2);
// identical packed shapes: reuse directly
if(xs == ys and xs.packed())
return xs;
// exactly one is packed: prefer the packed one
if(xs.packed() != ys.packed())
return xs.packed() ? xs : ys;
// exactly one is broadcasted: prefer the non-broadcast layout
if(xs.broadcasted() != ys.broadcasted())
return xs.broadcasted() ? ys.with_lens(xs.lens()) : xs.with_lens(xs.lens());
// otherwise fall back to a fresh standard shape
return {xs.type(), xs.lens()};
}
argument compute(const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[1], args[2])([&](auto out, const auto tval, const auto fval) {
args[0].visit([&](const auto cond) {
// elementwise select, parallelized over the flat output
par_for(output_shape.elements(),
[&](auto idx) { out[idx] = cond[idx] ? tval[idx] : fval[idx]; });
});
});
return result;
}
};
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
......@@ -103,79 +103,69 @@ auto operator==(const T& x, const U& y) -> decltype(x.name() == y.name())
} // namespace operation_operators
template <class T>
auto normalize_compute_shape_op(rank<1>, const T& x, const std::vector<shape>& inputs)
-> decltype(x.normalize_compute_shape(inputs))
{
dependent_type<operation, T> y = x;
normalize_attributes(y, inputs[0].lens());
return any_cast<T>(y).normalize_compute_shape(inputs);
}
template <class T>
shape normalize_compute_shape_op(rank<0>, const T& x, const std::vector<shape>&)
auto compute_shape_op(rank<3>, const T& x, const std::vector<shape>& inputs)
-> decltype(x.compute_shape(inputs))
{
std::string name = x.name();
MIGRAPHX_THROW("Shape not computable: " + name);
return x.compute_shape(inputs);
}
template <class T>
shape normalize_compute_shape_op(const T& x, const std::vector<shape>& inputs)
auto compute_shape_op(rank<2>, const T& x, const std::vector<shape>& inputs)
-> decltype(x.normalize_compute_shape(inputs))
{
return normalize_compute_shape_op(rank<1>{}, x, inputs);
dependent_type<operation, T> y = x;
normalize_attributes(y, inputs[0].lens());
return any_cast<T>(y).normalize_compute_shape(inputs);
}
template <class T>
auto compute_shape_op(rank<1>,
const T& x,
const std::vector<shape>& inputs,
const std::vector<module_ref>& mod_args)
-> decltype(x.compute_shape(inputs, mod_args))
auto compute_shape_op(rank<1>, const T& x, const std::vector<shape>& inputs)
-> decltype(x.compute_shape(inputs, {}))
{
return x.compute_shape(inputs, mod_args);
return x.compute_shape(inputs, {});
}
template <class T>
shape
compute_shape_op(rank<0>, const T& x, const std::vector<shape>&, const std::vector<module_ref>&)
shape compute_shape_op(rank<0>, const T& x, const std::vector<shape>&)
{
std::string name = x.name();
MIGRAPHX_THROW("Shape not computable: " + name);
}
template <class T>
shape compute_shape_op(const T& x,
const std::vector<shape>& inputs,
const std::vector<module_ref>& mod_args)
shape compute_shape_op(const T& x, const std::vector<shape>& inputs)
{
return compute_shape_op(rank<1>{}, x, inputs, mod_args);
return compute_shape_op(rank<3>{}, x, inputs);
}
template <class T>
auto normalize_compute_shape_op(rank<1>,
const T& x,
const std::vector<shape>& inputs,
std::vector<module_ref>& mod_args)
-> decltype(x.normalize_compute_shape(inputs, mod_args))
auto mod_compute_shape_op(rank<1>,
const T& x,
const std::vector<shape>& inputs,
const std::vector<module_ref>& mod_args)
-> decltype(x.compute_shape(inputs, mod_args))
{
return x.normalize_compute_shape(inputs, mod_args);
return x.compute_shape(inputs, mod_args);
}
template <class T>
shape normalize_compute_shape_op(rank<0>,
const T& x,
const std::vector<shape>&,
const std::vector<module_ref>&)
shape mod_compute_shape_op(rank<0>,
const T& x,
const std::vector<shape>& inputs,
const std::vector<module_ref>& mod_args)
{
if(mod_args.empty())
return compute_shape_op(x, inputs);
std::string name = x.name();
MIGRAPHX_THROW("Shape not computable: " + name);
}
template <class T>
shape normalize_compute_shape_op(const T& x,
const std::vector<shape>& inputs,
std::vector<module_ref>& mod_args)
shape mod_compute_shape_op(const T& x,
const std::vector<shape>& inputs,
const std::vector<module_ref>& mod_args)
{
return normalize_compute_shape_op(rank<1>{}, x, inputs, mod_args);
return mod_compute_shape_op(rank<1>{}, x, inputs, mod_args);
}
template <class T>
......@@ -256,6 +246,18 @@ argument compute_op(const T& x,
return compute_op(rank<1>{}, x, output, inputs, module_args, f);
}
template <class T, class F>
auto compute_op(rank<4>,
const T& x,
context& ctx,
const shape& output,
const std::vector<argument>& inputs,
const std::vector<module_ref>& module_args,
F f) -> decltype(x.compute(auto_any_cast(ctx), output, inputs, module_args, f))
{
return x.compute(auto_any_cast(ctx), output, inputs, module_args, f);
}
template <class T, class F>
auto compute_op(rank<3>,
const T& x,
......@@ -313,7 +315,7 @@ argument compute_op(const T& x,
const std::vector<module_ref>& module_args,
F f)
{
return compute_op(rank<3>{}, x, ctx, output, inputs, module_args, f);
return compute_op(rank<4>{}, x, ctx, output, inputs, module_args, f);
}
template <class T>
......@@ -443,35 +445,62 @@ lifetime get_lifetime_op(const T&)
} // namespace detail
/*
 * Type-erased interface for:
 *
 * struct operation
 * {
 *     std::string name() const;
 *     bool is_context_free() const;
 *     bool need_normalization() const;
 *     bool has_finalize() const;
 *     lifetime get_lifetime() const;
 *     std::ptrdiff_t output_alias(const std::vector<shape>& input) const;
 *     value compile(context& ctx, const shape& output, const std::vector<shape>& input);
 *     void finalize(context& ctx, const shape& output, const std::vector<shape>& input);
 *     shape compute_shape(const std::vector<shape>& input) const;
 *     shape compute_shape(const std::vector<shape>& inputs,
 *                         const std::vector<module_ref>& mod_args) const;
 *     argument compute(context& ctx, const shape& output,
 *                      const std::vector<argument>& input) const;
 *     argument compute(const shape& output, const std::vector<argument>& input) const;
 *     argument compute(const shape& output,
 *                      const std::vector<argument>& input,
 *                      const std::vector<module_ref>& module_args,
 *                      std::function<std::vector<argument>(
 *                          module_ref&, const std::unordered_map<std::string, argument>&)> run)
 *                      const;
 *     argument compute(context& ctx,
 *                      const shape& output,
 *                      const std::vector<argument>& input,
 *                      const std::vector<module_ref>& module_args,
 *                      std::function<std::vector<argument>(
 *                          module_ref&, const std::unordered_map<std::string, argument>&)> run)
 *                      const;
 *     value to_value() const;
 *     void from_value(const value& v);
 *     value attributes() const;
 *     friend std::ostream& operator<<(std::ostream& os, const operation& op);
 *     friend bool operator==(const operation& x, const operation& y);
 * };
 *
 */
#ifdef TYPE_ERASED_DECLARATION
// Type-erased interface for: any operator type providing these members.
// Members marked "(optional)" need not be implemented by every operator type.
struct operation
{
    // The name identifying this operator (e.g. "roialign")
    std::string name() const;
    // (optional) True when compute() can run without a context argument
    bool is_context_free() const;
    // (optional)
    bool need_normalization() const;
    // (optional) True when finalize() should be invoked before computing
    bool has_finalize() const;
    // (optional)
    lifetime get_lifetime() const;
    // (optional) NOTE(review): presumably the index of the input aliased by
    // the output — confirm against callers
    std::ptrdiff_t output_alias(const std::vector<shape>& input) const;
    // (optional)
    value compile(context& ctx, const shape& output, const std::vector<shape>& input);
    // (optional)
    void finalize(context& ctx, const shape& output, const std::vector<shape>& input);
    // (optional) Computes the output shape from the input shapes
    shape compute_shape(const std::vector<shape>& input) const;
    // (optional) Computes the output shape for an operator that carries
    // module arguments (e.g. control-flow operators referencing submodules)
    shape compute_shape(const std::vector<shape>& inputs,
                        const std::vector<module_ref>& mod_args) const;
    // (optional) Context-dependent computation
    argument compute(context& ctx, const shape& output, const std::vector<argument>& input) const;
    // (optional) Context-free computation
    argument compute(const shape& output, const std::vector<argument>& input) const;
    // (optional) Context-free computation with module arguments; `run`
    // evaluates a referenced module with the given parameter map
    argument compute(const shape& output,
                     const std::vector<argument>& input,
                     const std::vector<module_ref>& module_args,
                     std::function<std::vector<argument>(
                         module_ref&, const std::unordered_map<std::string, argument>&)> run) const;
    // (optional) Context-dependent computation with module arguments
    argument compute(context& ctx,
                     const shape& output,
                     const std::vector<argument>& input,
                     const std::vector<module_ref>& module_args,
                     std::function<std::vector<argument>(
                         module_ref&, const std::unordered_map<std::string, argument>&)> run) const;
    // (optional) Serializes the operator's state to a value
    value to_value() const;
    // (optional) Restores the operator's state from a value
    void from_value(const value& v);
    // (optional)
    value attributes() const;
    // Prints the operation to a stream
    friend std::ostream& operator<<(std::ostream& os, const operation& op);
    // Equality comparison between two type-erased operations
    friend bool operator==(const operation& x, const operation& y);
};
#else
struct operation
{
......@@ -836,7 +865,7 @@ struct operation
T&& private_detail_te_self,
const std::vector<shape>& input)
{
return detail::normalize_compute_shape_op(private_detail_te_self, input);
return detail::compute_shape_op(private_detail_te_self, input);
}
template <class T>
......@@ -855,7 +884,7 @@ struct operation
const std::vector<shape>& inputs,
const std::vector<module_ref>& mod_args)
{
return detail::compute_shape_op(private_detail_te_self, inputs, mod_args);
return detail::mod_compute_shape_op(private_detail_te_self, inputs, mod_args);
}
template <class T>
......@@ -1220,6 +1249,7 @@ inline const ValueType& any_cast(const operation& x)
throw std::bad_cast();
return *y;
}
#endif
inline bool operator!=(const operation& x, const operation& y) { return !(x == y); }
......@@ -1257,7 +1287,7 @@ template <class T>
// Computes the output shape of a concrete operator. Participates in overload
// resolution only when the operator provides normalize_compute_shape (see the
// trailing decltype); the actual dispatch goes through detail::compute_shape_op.
inline auto compute_shape(const T& op, const std::vector<shape>& inputs)
    -> decltype(op.normalize_compute_shape(inputs))
{
    return detail::compute_shape_op(op, inputs);
}
inline shape compute_shape(const operation& op,
......@@ -1282,7 +1312,7 @@ inline auto compute_shape(const T& op,
const std::vector<module_ref>& mod_args)
-> decltype(op.normalize_compute_shape(inputs, mod_args))
{
return detail::normalize_compute_shape_op(op, inputs, mod_args);
return detail::compute_shape_op(op, inputs, mod_args);
}
inline bool is_context_free(const operation& op) { return op.is_context_free(); }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment