"include/vscode:/vscode.git/clone" did not exist on "efc6207bf23e2258ddb573b9a7b3e965bdb29c62"
Commit 992666e6 authored by Shucai Xiao, committed by mvermeulen

Improve operators for onnxruntime (#405)



* improve unsqueeze to support negative axis and scalar parsing

* clang format

* add a test example for the negative axis of unsqueeze

* improve the squeeze operator to support negative axis

* clang format

* fixed a small bug in the lrn implementation

* clang format

* support negative axis in argmax and argmin

* clang format

* improve flatten to support negative axis

* clang format

* change softmax/logsoftmax to support negative axis

* clang format

* improve transpose by adding default perm

* clang format

* add one more dimension for tensor size

* add one more dimension for tensor size

* disable conv ops fusion for non-symmetric cases

* clang format

* fixed review comments

* move computing axis from the device function to the compute function

* clang format

* move computing axis from device function to the operator computing function

* clang format
Co-authored-by: mvermeulen <5479696+mvermeulen@users.noreply.github.com>
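
The recurring pattern in this change is ONNX-style axis normalization: an axis attribute in [-n_dim, n_dim) is mapped to its non-negative equivalent before use. A minimal standalone sketch of the idiom (illustrative only; normalize_axis is a hypothetical helper name, not MIGraphX source):

    #include <cstdint>
    #include <stdexcept>

    // Map an ONNX axis in [-n_dim, n_dim) to its non-negative equivalent.
    int64_t normalize_axis(int64_t axis, int64_t n_dim)
    {
        if(axis >= n_dim || axis < -n_dim)
            throw std::out_of_range("axis is out of range");
        return (axis < 0) ? axis + n_dim : axis;
    }
    // e.g. for a rank-4 tensor: normalize_axis(-1, 4) == 3, normalize_axis(2, 4) == 2.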
parent 2ee0f9e8
@@ -27,24 +27,29 @@ struct argmax
         check_shapes{inputs, *this}.has(1).standard();
         auto lens     = inputs[0].lens();
         int64_t n_dim = static_cast<int64_t>(lens.size());
-        if(axis >= n_dim || axis < 0)
+        if(axis >= n_dim || axis < -n_dim)
         {
             MIGRAPHX_THROW("ARGMAX: axis is out of range.");
         }
-        lens[axis] = 1;
+        int64_t tuned_axis = (axis < 0) ? axis + n_dim : axis;
+        lens[tuned_axis]   = 1;
         return {shape::int64_type, lens};
     }

     template <class T>
-    int64_t calc_argmax(T& input, std::vector<std::size_t>& indices, size_t item_num) const
+    int64_t calc_argmax(T& input,
+                        int64_t tuned_axis,
+                        std::vector<std::size_t>& indices,
+                        size_t item_num) const
     {
         auto max_val      = input(indices.begin(), indices.end());
         int64_t max_index = 0;
         for(std::size_t i = 1; i < item_num; ++i)
         {
-            indices[axis] = i;
+            indices[tuned_axis] = i;
             auto cur_val = input(indices.begin(), indices.end());
             if(max_val < cur_val)
             {
...
@@ -59,13 +64,15 @@ struct argmax
     argument compute(const shape& output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
-        auto batch_item_num = args.front().get_shape().lens()[axis];
+        auto n_dim          = args.front().get_shape().lens().size();
+        auto tuned_axis     = axis < 0 ? axis + n_dim : axis;
+        auto batch_item_num = args.front().get_shape().lens()[tuned_axis];
         result.visit([&](auto output) {
             args[0].visit([&](auto input) {
                 par_for(output_shape.elements(), [&](auto i) {
                     auto data_idx = output_shape.multi(i);
-                    output[i] = this->calc_argmax(input, data_idx, batch_item_num);
+                    output[i] = this->calc_argmax(input, tuned_axis, data_idx, batch_item_num);
                 });
             });
         });
...
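
The same reduction pattern, shown standalone: walk the multi-index along the tuned axis while holding the other coordinates fixed. A sketch under the assumption of a dense strided tensor (argmax_axis is a hypothetical helper, not MIGraphX code):

    #include <cstdint>
    #include <vector>

    // Sketch: argmax along one axis of a dense strided tensor, mirroring the
    // loop above but with a plain vector instead of a MIGraphX tensor view.
    int64_t argmax_axis(const std::vector<float>& data,
                        const std::vector<std::size_t>& lens,
                        const std::vector<std::size_t>& strides,
                        std::vector<std::size_t> idx, // multi-index with idx[axis] == 0
                        std::size_t axis)
    {
        auto offset = [&] {
            std::size_t o = 0;
            for(std::size_t d = 0; d < lens.size(); ++d)
                o += idx[d] * strides[d];
            return o;
        };
        float max_val     = data[offset()];
        int64_t max_index = 0;
        for(std::size_t i = 1; i < lens[axis]; ++i)
        {
            idx[axis] = i; // advance only along the reduction axis
            if(data[offset()] > max_val)
            {
                max_val   = data[offset()];
                max_index = static_cast<int64_t>(i);
            }
        }
        return max_index;
    }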
@@ -27,24 +27,28 @@ struct argmin
         check_shapes{inputs, *this}.has(1).standard();
         auto lens     = inputs[0].lens();
         int64_t n_dim = static_cast<int64_t>(lens.size());
-        if(axis >= n_dim || axis < 0)
+        if(axis >= n_dim || axis < -n_dim)
         {
             MIGRAPHX_THROW("ARGMIN: axis is out of range.");
         }
-        lens[axis] = 1;
+        int64_t tuned_axis = (axis < 0) ? axis + n_dim : axis;
+        lens[tuned_axis]   = 1;
         return {shape::int64_type, lens};
     }

     template <class T>
-    int64_t calc_argmin(T& input, std::vector<std::size_t>& indices, size_t item_num) const
+    int64_t calc_argmin(T& input,
+                        int64_t tuned_axis,
+                        std::vector<std::size_t>& indices,
+                        size_t item_num) const
     {
         auto min_val      = input(indices.begin(), indices.end());
         int64_t min_index = 0;
         for(std::size_t i = 1; i < item_num; ++i)
         {
-            indices[axis] = i;
+            indices[tuned_axis] = i;
             auto cur_val = input(indices.begin(), indices.end());
             if(min_val > cur_val)
             {
...
@@ -59,13 +63,15 @@ struct argmin
     argument compute(const shape& output_shape, std::vector<argument> args) const
     {
         argument result{output_shape};
-        std::size_t batch_item_num = args.front().get_shape().lens()[axis];
+        auto n_dim                 = args.front().get_shape().lens().size();
+        auto tuned_axis            = axis < 0 ? axis + n_dim : axis;
+        std::size_t batch_item_num = args.front().get_shape().lens()[tuned_axis];
         result.visit([&](auto output) {
             args[0].visit([&](auto input) {
                 par_for(output_shape.elements(), [&](auto i) {
                     auto data_idx = output_shape.multi(i);
-                    output[i] = this->calc_argmin(input, data_idx, batch_item_num);
+                    output[i] = this->calc_argmin(input, tuned_axis, data_idx, batch_item_num);
                 });
             });
         });
...
@@ -18,7 +18,7 @@ namespace op {

 struct flatten
 {
-    uint64_t axis = 0;
+    int64_t axis = 1;

     template <class Self, class F>
     static auto reflect(Self& self, F f)
...
@@ -31,15 +31,18 @@ struct flatten
     {
         check_shapes{inputs}.has(1);
         auto&& lens = inputs.front().lens();
-        if(axis > lens.size())
+        int64_t n_dim = static_cast<int64_t>(lens.size());
+        if(axis > n_dim or axis < -n_dim)
         {
-            MIGRAPHX_THROW("axis for flatten must be less than tensor rank");
+            MIGRAPHX_THROW("FLATTEN: axis for flatten is out of range");
         }
-        auto x =
-            std::accumulate(lens.begin(), lens.begin() + axis, std::size_t{1}, std::multiplies<>{});
-        auto y =
-            std::accumulate(lens.begin() + axis, lens.end(), std::size_t{1}, std::multiplies<>{});
+        auto tuned_axis = (axis < 0) ? axis + n_dim : axis;
+        auto x = std::accumulate(
+            lens.begin(), lens.begin() + tuned_axis, std::size_t{1}, std::multiplies<>{});
+        auto y = std::accumulate(
+            lens.begin() + tuned_axis, lens.end(), std::size_t{1}, std::multiplies<>{});
         return {inputs.at(0).type(), {x, y}};
     }

     argument compute(shape output_shape, std::vector<argument> args) const
...
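
A worked example of the new flatten rule: with lens = {2, 3, 4, 5} and axis = -2, tuned_axis is 2, so the output shape is {2*3, 4*5} = {6, 20}. A minimal sketch of the computation:

    #include <cassert>
    #include <cstdint>
    #include <functional>
    #include <numeric>
    #include <vector>

    int main()
    {
        std::vector<std::size_t> lens = {2, 3, 4, 5};
        int64_t axis  = -2; // negative axis, as now allowed
        int64_t n_dim = static_cast<int64_t>(lens.size());
        auto tuned_axis = (axis < 0) ? axis + n_dim : axis;
        auto x = std::accumulate(
            lens.begin(), lens.begin() + tuned_axis, std::size_t{1}, std::multiplies<>{});
        auto y = std::accumulate(
            lens.begin() + tuned_axis, lens.end(), std::size_t{1}, std::multiplies<>{});
        assert(x == 6 && y == 20); // flatten({2,3,4,5}, axis=-2) -> {6, 20}
        return 0;
    }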
@@ -11,7 +11,7 @@ namespace op {

 struct logsoftmax
 {
-    int axis = 1;
+    int64_t axis = 1;

     template <class Self, class F>
     static auto reflect(Self& self, F f)
...
@@ -23,7 +23,8 @@ struct logsoftmax
     shape compute_shape(std::vector<shape> inputs) const
     {
         check_shapes{inputs}.has(1).standard();
-        if(axis < 0 || axis >= inputs[0].lens().size())
+        int64_t n_dim = static_cast<int64_t>(inputs[0].lens().size());
+        if(axis < -n_dim || axis >= n_dim)
         {
             MIGRAPHX_THROW("LogSoftMax: input axis value " + std::to_string(axis) +
                            " is out of range");
...
@@ -11,7 +11,7 @@ namespace op {

 struct softmax
 {
-    int axis = 1;
+    int64_t axis = 1;

     template <class Self, class F>
     static auto reflect(Self& self, F f)
...
@@ -23,7 +23,8 @@ struct softmax
     shape compute_shape(std::vector<shape> inputs) const
     {
         check_shapes{inputs}.has(1).standard();
-        if(axis < 0 || axis >= inputs[0].lens().size())
+        int64_t n_dim = inputs[0].lens().size();
+        if(axis < -n_dim || axis >= n_dim)
         {
             MIGRAPHX_THROW("SoftMax: input axis value " + std::to_string(axis) +
                            " is out of range");
...
@@ -33,13 +33,21 @@ struct squeeze
         auto input_shape = inputs[0];
         auto type        = input_shape.type();
         auto old_lens    = input_shape.lens();
-        if(std::any_of(
-               axes.begin(), axes.end(), [&](auto axis) { return input_shape.lens()[axis] != 1; }))
+        // change to support negative axis value
+        std::vector<int64_t> tuned_axes(axes.size());
+        std::transform(axes.begin(), axes.end(), tuned_axes.begin(), [&](auto i) {
+            return i >= 0 ? i : i + old_lens.size();
+        });
+        if(std::any_of(tuned_axes.begin(), tuned_axes.end(), [&](auto axis) {
+               return old_lens[axis] != 1;
+           }))
         {
             MIGRAPHX_THROW("squeeze axis dimension should be equal to 1");
         }
         std::vector<std::size_t> new_lens;
-        if(axes.empty())
+        if(tuned_axes.empty())
         {
             std::copy_if(old_lens.begin(),
                          old_lens.end(),
...
@@ -50,7 +58,7 @@ struct squeeze
         {
             for(std::size_t i = 0; i < old_lens.size(); i++)
             {
-                if(std::find(axes.begin(), axes.end(), i) == axes.end())
+                if(std::find(tuned_axes.begin(), tuned_axes.end(), i) == tuned_axes.end())
                 {
                     new_lens.push_back(old_lens[i]);
                 }
...
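
For squeeze, a negative axis is offset by the input rank before the size-1 check. For example, old_lens = {2, 1, 3, 1} with axes = {-1} tunes to {3} and yields {2, 1, 3}; a standalone sketch:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main()
    {
        std::vector<std::size_t> old_lens = {2, 1, 3, 1};
        std::vector<int64_t> axes         = {-1}; // refers to the last dimension
        std::vector<int64_t> tuned_axes(axes.size());
        std::transform(axes.begin(), axes.end(), tuned_axes.begin(), [&](auto i) {
            return i >= 0 ? i : i + static_cast<int64_t>(old_lens.size());
        });
        std::vector<std::size_t> new_lens;
        for(std::size_t i = 0; i < old_lens.size(); i++)
            if(std::find(tuned_axes.begin(), tuned_axes.end(), static_cast<int64_t>(i)) ==
               tuned_axes.end())
                new_lens.push_back(old_lens[i]); // keep every non-squeezed dimension
        assert((new_lens == std::vector<std::size_t>{2, 1, 3}));
        return 0;
    }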
@@ -34,13 +34,22 @@ struct transpose
         auto input_lens    = input.lens();
         auto input_strides = input.strides();
         auto t             = input.type();
-        if(dims.size() != input_lens.size())
+        auto tuned_dims = dims;
+        // if no perm is provided, reverse the dims
+        if(tuned_dims.empty())
+        {
+            tuned_dims.resize(input_lens.size());
+            std::iota(tuned_dims.begin(), tuned_dims.end(), 0);
+            std::reverse(tuned_dims.begin(), tuned_dims.end());
+        }
+        if(tuned_dims.size() != input_lens.size())
         {
             MIGRAPHX_THROW("Permutation has wrong number of axes");
         }
-        std::vector<int64_t> axes(dims.size());
+        std::vector<int64_t> axes(tuned_dims.size());
         std::iota(axes.begin(), axes.end(), 0);
-        if(!std::is_permutation(axes.begin(), axes.end(), dims.begin()))
+        if(!std::is_permutation(axes.begin(), axes.end(), tuned_dims.begin()))
         {
             MIGRAPHX_THROW("Invalid permutation");
         }
...
@@ -48,8 +57,8 @@ struct transpose
         std::vector<size_t> output_strides(input_lens.size());
         for(std::size_t i = 0; i < output_lens.size(); i++)
         {
-            output_lens[i]    = input_lens[dims[i]];
-            output_strides[i] = input_strides[dims[i]];
+            output_lens[i]    = input_lens[tuned_dims[i]];
+            output_strides[i] = input_strides[tuned_dims[i]];
         }
         return {t, output_lens, output_strides};
     }
...
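
With an empty perm, transpose now defaults to reversing the dimensions, which matches the ONNX Transpose default; a {2, 3, 4} input becomes {4, 3, 2}. A sketch:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    int main()
    {
        std::vector<std::size_t> input_lens = {2, 3, 4};
        std::vector<int64_t> tuned_dims(input_lens.size());
        std::iota(tuned_dims.begin(), tuned_dims.end(), 0);
        std::reverse(tuned_dims.begin(), tuned_dims.end()); // default perm: {2, 1, 0}
        std::vector<std::size_t> output_lens(input_lens.size());
        for(std::size_t i = 0; i < output_lens.size(); i++)
            output_lens[i] = input_lens[tuned_dims[i]];
        assert((output_lens == std::vector<std::size_t>{4, 3, 2}));
        return 0;
    }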
@@ -38,11 +38,18 @@ struct unsqueeze
             return shape{type, old_lens};
         std::size_t new_size = old_lens.size() + axes.size();
+        // if an axis is negative, tune it to the equivalent positive value
+        std::vector<int64_t> tuned_axes(axes.size());
+        std::transform(axes.begin(), axes.end(), tuned_axes.begin(), [new_size](auto i) {
+            return i >= 0 ? i : i + new_size;
+        });
+
         std::vector<std::size_t> new_lens(new_size);
         std::size_t p = 0;
         for(std::size_t i = 0; i < new_size; i++)
         {
-            if(std::find(axes.begin(), axes.end(), i) != axes.end())
+            if(std::find(tuned_axes.begin(), tuned_axes.end(), i) != tuned_axes.end())
             {
                 new_lens[i] = 1;
             }
...
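
Note that unsqueeze tunes a negative axis against new_size, the output rank (input rank plus the number of inserted axes), since the axis names a position in the output shape. With old_lens = {3, 4} and axes = {-1}, new_size is 3, tuned_axes is {2}, and the result is {3, 4, 1}; a sketch:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main()
    {
        std::vector<std::size_t> old_lens = {3, 4};
        std::vector<int64_t> axes         = {-1};
        std::size_t new_size = old_lens.size() + axes.size(); // output rank = 3
        std::vector<int64_t> tuned_axes(axes.size());
        std::transform(axes.begin(), axes.end(), tuned_axes.begin(), [new_size](auto i) {
            return i >= 0 ? i : i + static_cast<int64_t>(new_size);
        });
        std::vector<std::size_t> new_lens(new_size);
        std::size_t p = 0;
        for(std::size_t i = 0; i < new_size; i++)
            new_lens[i] = (std::find(tuned_axes.begin(), tuned_axes.end(),
                                     static_cast<int64_t>(i)) != tuned_axes.end())
                              ? 1             // inserted axis gets length 1
                              : old_lens[p++]; // otherwise copy the next input length
        assert((new_lens == std::vector<std::size_t>{3, 4, 1}));
        return 0;
    }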
@@ -231,8 +231,15 @@ struct onnx_parser
             auto s0       = arg0->get_shape().lens();
             auto s1       = arg1->get_shape().lens();
             auto out_lens = compute_broadcasted_lens(s0, s1);
-            auto l0       = prog.add_instruction(op::multibroadcast{out_lens}, arg0);
-            auto l1       = prog.add_instruction(op::multibroadcast{out_lens}, arg1);
+            auto l0 = arg0;
+            if(arg0->get_shape().lens() != out_lens)
+                l0 = prog.add_instruction(op::multibroadcast{out_lens}, arg0);
+
+            auto l1 = arg1;
+            if(arg1->get_shape().lens() != out_lens)
+                l1 = prog.add_instruction(op::multibroadcast{out_lens}, arg1);
+
             return prog.add_instruction(x, l0, l1);
         }
         else
...
@@ -283,7 +290,7 @@ struct onnx_parser
                    const attribute_map& attributes,
                    std::vector<instruction_ref> args)
     {
-        int axis = 1;
+        int64_t axis = 1;
         if(contains(attributes, "axis"))
         {
             axis = parse_value(attributes.at("axis")).at<int>();
...
@@ -463,7 +470,7 @@ struct onnx_parser
     instruction_ref
     parse_flatten(const std::string&, attribute_map attributes, std::vector<instruction_ref> args)
     {
-        uint64_t axis = 1;
+        int64_t axis = 1;
         if(contains(attributes, "axis"))
         {
             axis = parse_value(attributes.at("axis")).at<int>();
...
@@ -1696,6 +1703,9 @@ struct onnx_parser
             }
             return batch_size;
         });
+
+        if(dims.empty())
+            return {shape_type};
+
         return {shape_type, dims};
     }
...
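
The first hunk above avoids inserting a multibroadcast when an argument already has the broadcasted shape, so already-matching operands pass through unchanged. A minimal sketch of just that guard (broadcast_count is a hypothetical stand-in for counting the inserted instructions, not parser code):

    #include <cassert>
    #include <vector>

    // Count how many explicit multibroadcast instructions the guard would insert.
    int broadcast_count(const std::vector<std::size_t>& s0,
                        const std::vector<std::size_t>& s1,
                        const std::vector<std::size_t>& out_lens)
    {
        int n = 0;
        if(s0 != out_lens) ++n; // arg0 needs an explicit multibroadcast
        if(s1 != out_lens) ++n; // arg1 needs one too
        return n;
    }

    int main()
    {
        // {2,3} + {2,3}: shapes already match, no broadcast instructions at all.
        assert(broadcast_count({2, 3}, {2, 3}, {2, 3}) == 0);
        // {2,3} + {3}: only the second argument is broadcast up to {2,3}.
        assert(broadcast_count({2, 3}, {3}, {2, 3}) == 1);
        return 0;
    }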
@@ -144,13 +144,14 @@ struct cpu_lrn
         int height          = output_shape.lens()[2];
         int width           = output_shape.lens()[3];
         float alphaoverarea = op.alpha / float(op.size);
-        int radius          = (op.size - 1) / 2;
+        int radius_lower    = (op.size - 1) / 2;
+        int radius_upper    = op.size / 2 + 1;

         par_dfor(n_batch, height, width)([&](int b, int h, int w) {
             float scale = 0;
             dfor(channels)([&](int c) {
-                auto start = (c - radius) < 0 ? 0 : (c - radius);
-                auto end   = (c + radius) > channels ? channels : (c + radius);
+                auto start = (c - radius_lower) < 0 ? 0 : (c - radius_lower);
+                auto end   = (c + radius_upper) > channels ? channels : (c + radius_upper);
                 for(auto k = start; k < end; ++k)
                 {
                     scale += std::pow(input(b, k, h, w), 2);
...
@@ -599,8 +600,9 @@ struct cpu_softmax
     {
         argument result{output_shape};
         auto batch_lens = output_shape.lens();
-        std::size_t n_dims  = batch_lens[op.axis];
-        batch_lens[op.axis] = 1;
+        int64_t tuned_axis = (op.axis < 0) ? op.axis + args[0].get_shape().lens().size() : op.axis;
+        std::size_t n_dims     = batch_lens[tuned_axis];
+        batch_lens[tuned_axis] = 1;
         shape batch_shape{shape::int32_type, batch_lens};
         visit_all(result, args[0])([&](auto output, auto input) {
...
@@ -612,26 +614,26 @@ struct cpu_softmax
                 auto idx = batch_shape.multi(i);
                 for(std::size_t j = 0; j < n_dims; ++j)
                 {
-                    idx[op.axis] = j;
+                    idx[tuned_axis] = j;
                     batch_max[i] = std::max(batch_max[i], input(idx.begin(), idx.end()));
                 }
                 for(std::size_t j = 0; j < n_dims; ++j)
                 {
-                    idx[op.axis] = j;
+                    idx[tuned_axis] = j;
                     std::size_t index = output_shape.index(idx);
                     output[index]     = std::exp(input[index] - batch_max[i]);
                 }
                 for(std::size_t j = 0; j < n_dims; ++j)
                 {
-                    idx[op.axis] = j;
+                    idx[tuned_axis] = j;
                     batch_sum[i] += output(idx.begin(), idx.end());
                 }
                 for(std::size_t j = 0; j < n_dims; ++j)
                 {
-                    idx[op.axis] = j;
+                    idx[tuned_axis] = j;
                     output(idx.begin(), idx.end()) =
                         op.output()(output(idx.begin(), idx.end()), batch_sum[i]);
                 }
...
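
The LRN fix replaces the single symmetric radius with an asymmetric window: start = c - radius_lower (clamped at 0) and an exclusive end = c + radius_upper (clamped at channels). Unclamped, the window spans radius_lower + radius_upper = (size-1)/2 + size/2 + 1 = size channels, whereas the old code spanned only 2*radius = size-1 channels for odd size. A quick check:

    #include <cassert>

    int main()
    {
        for(int size = 1; size <= 7; ++size)
        {
            int radius_lower = (size - 1) / 2;
            int radius_upper = size / 2 + 1;
            // Exclusive end: the window is [c - radius_lower, c + radius_upper),
            // i.e. exactly `size` channels when no clamping occurs.
            assert(radius_lower + radius_upper == size);
        }
        return 0;
    }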
@@ -14,7 +14,9 @@ shape hip_argmax::compute_shape(const std::vector<shape>& inputs) const

 argument hip_argmax::compute(context& ctx, const shape&, const std::vector<argument>& args) const
 {
-    device::argmax(ctx.get_stream().get(), args.back(), args.front(), op.axis);
+    auto n_dim         = args.front().get_shape().lens().size();
+    int64_t tuned_axis = (op.axis < 0) ? op.axis + n_dim : op.axis;
+    device::argmax(ctx.get_stream().get(), args.back(), args.front(), tuned_axis);
     return args.back();
 }
...
@@ -14,7 +14,9 @@ shape hip_argmin::compute_shape(const std::vector<shape>& inputs) const

 argument hip_argmin::compute(context& ctx, const shape&, const std::vector<argument>& args) const
 {
-    device::argmin(ctx.get_stream().get(), args.back(), args.front(), op.axis);
+    auto n_dim         = args.front().get_shape().lens().size();
+    int64_t tuned_axis = (op.axis < 0) ? op.axis + n_dim : op.axis;
+    device::argmin(ctx.get_stream().get(), args.back(), args.front(), tuned_axis);
     return args.back();
 }
...
@@ -39,7 +39,12 @@ constexpr void visit_tensor_size(index_int n, F f)
         f(std::integral_constant<index_int, 5>{});
         break;
     }
-    default: throw std::runtime_error("Unknown tensor size");
+    case 6:
+    {
+        f(std::integral_constant<index_int, 6>{});
+        break;
+    }
+    default: throw std::runtime_error("Tensor size dim out of range");
     }
 }
...
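
visit_tensor_size dispatches a runtime rank to a compile-time constant so kernels can be instantiated per rank; the new case extends this to rank-6 tensors. A self-contained sketch of the same technique (visit_rank is a hypothetical simplified version, ranks 1-3 only):

    #include <cstddef>
    #include <iostream>
    #include <stdexcept>
    #include <type_traits>

    // Map a runtime rank onto a compile-time std::integral_constant so the
    // callback can be instantiated with the rank as a template parameter.
    template <class F>
    void visit_rank(std::size_t n, F f)
    {
        switch(n)
        {
        case 1: f(std::integral_constant<std::size_t, 1>{}); break;
        case 2: f(std::integral_constant<std::size_t, 2>{}); break;
        case 3: f(std::integral_constant<std::size_t, 3>{}); break;
        default: throw std::runtime_error("rank out of range");
        }
    }

    int main()
    {
        visit_rank(2, [](auto rank) {
            // rank() is a constant expression here, usable as a template argument.
            std::cout << "compiled for rank " << rank() << "\n";
        });
        return 0;
    }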
@@ -11,11 +11,10 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-void logsoftmax(hipStream_t stream, const argument& result, const argument& arg, int axis)
+void logsoftmax(hipStream_t stream, const argument& result, const argument& arg, int64_t axis)
 {
-    auto lens                = result.get_shape().lens();
-    auto batch_lens          = lens;
-    index_int batch_item_num = lens[axis];
+    auto batch_lens          = result.get_shape().lens();
+    index_int batch_item_num = batch_lens[axis];
     batch_lens[axis]         = 1;
     migraphx::shape batch_shape{result.get_shape().type(), batch_lens};
...
@@ -12,11 +12,10 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-void softmax(hipStream_t stream, const argument& result, const argument& arg, int axis)
+void softmax(hipStream_t stream, const argument& result, const argument& arg, int64_t axis)
 {
-    auto lens                = result.get_shape().lens();
-    auto batch_lens          = lens;
-    index_int batch_item_num = lens[axis];
+    auto batch_lens          = result.get_shape().lens();
+    index_int batch_item_num = batch_lens[axis];
     batch_lens[axis]         = 1;
     migraphx::shape batch_shape{result.get_shape().type(), batch_lens};
...
@@ -148,6 +148,12 @@ MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins)
         return false;
     if(wei.lens()[1] > 512 and conv.algo != miopenConvolutionFwdAlgoWinograd)
         return false;
+
+    // Do not fuse non-symmetric input
+    auto input_lens = ins->inputs().at(0)->get_shape().lens();
+    if(input_lens[2] != input_lens[3] or wei.lens()[2] != wei.lens()[3])
+        return false;
+
     auto op = conv.op;
     // Don't fuse winograd for non-3x3s since there is no fused winograd for those configs
     if(conv.algo == miopenConvolutionFwdAlgoWinograd and wei.lens()[2] != 3 and
...
@@ -72,9 +72,8 @@ template <class Op>
 void arg_op(Op op, hipStream_t stream, const argument& result, const argument& arg, int64_t axis)
 {
     auto arg_shape = arg.get_shape();
-    auto lens             = arg_shape.lens();
-    auto batch_lens       = lens;
-    size_t batch_item_num = lens[axis];
+    auto batch_lens       = arg_shape.lens();
+    size_t batch_item_num = batch_lens[axis];
     batch_lens[axis] = 1;
     migraphx::shape batch_shape{arg_shape.type(), batch_lens};
...
@@ -10,7 +10,7 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-void logsoftmax(hipStream_t stream, const argument& result, const argument& arg, int axis);
+void logsoftmax(hipStream_t stream, const argument& result, const argument& arg, int64_t axis);

 } // namespace device
 } // namespace gpu
...
@@ -10,7 +10,7 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

-void softmax(hipStream_t stream, const argument& result, const argument& arg, int axis);
+void softmax(hipStream_t stream, const argument& result, const argument& arg, int64_t axis);

 } // namespace device
 } // namespace gpu
...
@@ -18,7 +18,9 @@ shape hip_logsoftmax::compute_shape(const std::vector<shape>& inputs) const

 argument
 hip_logsoftmax::compute(context& ctx, const shape&, const std::vector<argument>& args) const
 {
-    device::logsoftmax(ctx.get_stream().get(), args.back(), args.front(), op.axis);
+    auto n_dim      = args.front().get_shape().lens().size();
+    auto tuned_axis = (op.axis < 0) ? op.axis + n_dim : op.axis;
+    device::logsoftmax(ctx.get_stream().get(), args.back(), args.front(), tuned_axis);
     return args.back();
 }
...