gaoqiong / MIGraphX / Commits

Commit dc18031e, authored May 27, 2022 by turneram

    Merge remote-tracking branch 'origin/develop' into bert-attention

Parents: 375c7b8d, d436a723

Changes: 46 files in the commit; showing 20 changed files with 226 additions and 159 deletions (+226 -159).
src/targets/cpu/gather.cpp                                           +0  -1
src/targets/cpu/include/migraphx/cpu/pointwise.hpp                   +0  -2
src/targets/gpu/compile_hip.cpp                                      +0  -1
src/targets/gpu/fuse_ops.cpp                                        +67 -22
src/targets/gpu/gemm_impl.cpp                                       +38  -6
src/targets/gpu/include/migraphx/gpu/gemm.hpp                        +7 -25
src/targets/gpu/jit/gathernd.cpp                                     +1  -1
src/targets/gpu/jit/roialign.cpp                                     +0  -1
src/targets/gpu/jit/scatternd.cpp                                    +0  -1
src/targets/gpu/kernels/include/migraphx/kernels/array.hpp           +2  -2
src/targets/gpu/kernels/include/migraphx/kernels/basic_ops.hpp       +0 -84
src/targets/gpu/kernels/include/migraphx/kernels/functional.hpp      +1  -1
src/targets/gpu/kernels/include/migraphx/kernels/iota_iterator.hpp   +1  -1
src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp        +9  -7
src/targets/gpu/kernels/include/migraphx/kernels/tensor_view.hpp     +1  -1
src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp    +15  -0
src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp             +1  -1
src/targets/gpu/pack_int8_args.cpp                                   +2  -2
test/onnx/gen_onnx.py                                               +14  -0
test/onnx/mean_integral_test.onnx                                   +67  -0
src/targets/cpu/gather.cpp
@@ -26,7 +26,6 @@ struct cpu_gather : auto_register_op<cpu_gather>
     }
 
     argument
-    // cppcheck-suppress constParameter
     compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
     {
         std::size_t nelements = output_shape.elements();
src/targets/cpu/include/migraphx/cpu/pointwise.hpp
@@ -323,7 +323,6 @@ struct cpu_unary : reduce_dims_base, auto_register_op<cpu_unary<Op>>
         return {s.type(), s.lens()};
     }
     argument
-    // cppcheck-suppress constParameter
     compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
     {
         argument result = get_arg(args, args.size() - 1);
@@ -362,7 +361,6 @@ struct cpu_binary : reduce_dims_base, auto_register_op<cpu_binary<Op>>
     }
     argument
-    // cppcheck-suppress constParameter
     compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
     {
         argument result = get_arg(args, args.size() - 1);
src/targets/gpu/compile_hip.cpp
@@ -134,7 +134,6 @@ struct hiprtc_program
         std::vector<char> buffer(n);
         MIGRAPHX_HIPRTC(hiprtcGetProgramLog(prog.get(), buffer.data()));
         assert(buffer.back() == 0);
-        // cppcheck-suppress returnDanglingLifetime
         return {buffer.begin(), buffer.end() - 1};
     }
src/targets/gpu/fuse_ops.cpp
@@ -681,7 +681,7 @@ struct miopen_fusion
 struct miopen_conv_bias
 {
     op::convolution op;
-    fusion f          = {};
+    fusion fp         = {};
     fusion::op_t conv = {};
     fusion::op_t bias = {};
@@ -705,19 +705,19 @@ struct miopen_conv_bias
         float beta  = 0;
         miopenSetOpArgsConvForward(fargs.get(), conv, &alpha, &beta, args[1].implicit());
         miopenSetOpArgsBiasForward(fargs.get(), bias, &alpha, &beta, args[3].implicit());
-        return f.execute(ctx, fargs, args[0], args[4]);
+        return fp.execute(ctx, fargs, args[0], args[4]);
     }
     void finalize(context& ctx, const shape&, const std::vector<shape>& inputs)
     {
-        f    = fusion(inputs[0]);
-        conv = f.create_conv(op, inputs[1]);
-        bias = f.create_bias(inputs[3]);
-        if(not f.compile(ctx))
+        fp   = fusion(inputs[0]);
+        conv = fp.create_conv(op, inputs[1]);
+        bias = fp.create_bias(inputs[3]);
+        if(not fp.compile(ctx))
             MIGRAPHX_THROW("Failed to compile fusion plan");
     }
-    shape get_workspace(context& ctx) { return f.get_workspace(ctx); }
+    shape get_workspace(context& ctx) { return fp.get_workspace(ctx); }
     std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
     {
         return shapes.size() - 1;
@@ -728,7 +728,7 @@ MIGRAPHX_REGISTER_OP(miopen_conv_bias)
 struct miopen_conv_bias_relu
 {
     op::convolution op;
-    fusion f          = {};
+    fusion fp         = {};
     fusion::op_t conv = {};
     fusion::op_t bias = {};
     fusion::op_t relu = {};
@@ -754,18 +754,18 @@ struct miopen_conv_bias_relu
         miopenSetOpArgsConvForward(fargs.get(), conv, &alpha, &beta, args[1].implicit());
         miopenSetOpArgsBiasForward(fargs.get(), bias, &alpha, &beta, args[3].implicit());
         miopenSetOpArgsActivForward(fargs.get(), relu, &alpha, &beta, 0, 0, 0);
-        return f.execute(ctx, fargs, args[0], args[4]);
+        return fp.execute(ctx, fargs, args[0], args[4]);
     }
     void finalize(context& ctx, const shape&, const std::vector<shape>& inputs)
     {
-        f    = fusion(inputs[0]);
-        conv = f.create_conv(op, inputs[1]);
-        bias = f.create_bias(inputs[3]);
-        relu = f.create_relu();
-        f.compile(ctx);
+        fp   = fusion(inputs[0]);
+        conv = fp.create_conv(op, inputs[1]);
+        bias = fp.create_bias(inputs[3]);
+        relu = fp.create_relu();
+        fp.compile(ctx);
     }
-    shape get_workspace(context& ctx) { return f.get_workspace(ctx); }
+    shape get_workspace(context& ctx) { return fp.get_workspace(ctx); }
     std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
     {
         return shapes.size() - 1;
@@ -875,7 +875,6 @@ struct find_conv_pointwise
         {
             if(i.name()[0] == '@')
                 continue;
-            auto inputs = to_shapes(i.inputs());
             op.ops.push_back({{i.get_operator()}});
         }
         std::vector<instruction_ref> inputs = {input_ins, weights_ins, bias_ins, alloc_ins};
@@ -908,11 +907,6 @@ struct find_gemm_add
         if(not float_equal(gemm.beta, 0))
             return;
-        if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [](auto i) {
-               return not i->get_shape().standard();
-           }))
-            return;
-
         auto inputs = gemm_ins->inputs();
         inputs.pop_back();
@@ -931,6 +925,53 @@ struct find_gemm_add
     }
 };
 
+auto pointwise_name(const std::string& s)
+{
+    return precompile_name("pointwise")(match::make_basic_pred_matcher([=](auto ins) {
+        module_ref pm = ins->module_inputs().front();
+        auto n = std::count_if(pm->begin(), pm->end(), [&](auto& i) { return i.name() == s; });
+        if(n != 1)
+            return false;
+        return std::all_of(pm->begin(), pm->end(), [&](auto& i) {
+            return starts_with(i.name(), "@") or i.name() == s;
+        });
+    }));
+}
+
+struct find_gemm_pointwise
+{
+    auto matcher() const
+    {
+        return pointwise_name("add")(
+            match::nargs(3),
+            match::all_of[match::inputs()](match::standard_shape()),
+            match::either_arg(0, 1)(match::used_once().bind("c"),
+                                    match::name("gpu::gemm")(match::nargs(3)).bind("gemm")));
+    }
+
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins      = r.result;
+        auto gemm_ins = r.instructions["gemm"];
+        auto c_ins    = r.instructions["c"];
+        auto gemm     = any_cast<rocblas_gemm<op::dot>>(gemm_ins->get_operator());
+        // Already fused gemm
+        if(not float_equal(gemm.beta, 0))
+            return;
+        auto inputs = gemm_ins->inputs();
+        inputs.pop_back();
+        inputs.push_back(c_ins);
+        inputs.push_back(gemm_ins->inputs().back());
+        gemm.beta = 1;
+        m.replace_instruction(ins, gemm, inputs);
+    }
+};
+
 struct find_commutative_broadcast
 {
     auto matcher() const
@@ -967,7 +1008,11 @@ void fuse_ops::apply(module& m) const
                         find_add_unary{"gpu::tanh", hip_add_tanh{}, hip_triadd_tanh{}},
                         find_add_clip{});
     run_passes(m, {dead_code_elimination{}});
-    match::find_matches(m, find_triadd_layernorm{}, find_gemm_add{}, find_commutative_broadcast{});
+    match::find_matches(m,
+                        find_triadd_layernorm{},
+                        find_gemm_add{},
+                        find_gemm_pointwise{},
+                        find_commutative_broadcast{});
 }
 
 } // namespace gpu
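Note: the main addition above is find_gemm_pointwise, which matches a pointwise add fed by a gpu::gemm and folds the add into the GEMM itself by setting beta = 1 and passing the add operand as the C matrix. A minimal sketch of why that rewrite preserves semantics, written against a naive reference GEMM rather than the MIGraphX/rocBLAS API (all names below are illustrative):

#include <cassert>
#include <vector>

// Naive reference: D = alpha * A(mxk) * B(kxn) + beta * C(mxn), row-major.
// This is the contract of gemm_ex-style BLAS calls; setting beta = 1 and
// passing the add operand as C folds add(gemm(A, B), C) into one call.
void gemm(int m, int n, int k, float alpha, const std::vector<float>& a,
          const std::vector<float>& b, float beta, const std::vector<float>& c,
          std::vector<float>& d)
{
    for(int i = 0; i < m; i++)
        for(int j = 0; j < n; j++)
        {
            float acc = 0;
            for(int p = 0; p < k; p++)
                acc += a[i * k + p] * b[p * n + j];
            d[i * n + j] = alpha * acc + beta * c[i * n + j];
        }
}

int main()
{
    int m = 2, n = 2, k = 2;
    std::vector<float> a = {1, 2, 3, 4}, b = {5, 6, 7, 8}, c = {1, 1, 1, 1};
    std::vector<float> unfused(m * n), fused(m * n);

    // Unfused: gemm with beta = 0, then a separate elementwise add of c.
    gemm(m, n, k, 1.0f, a, b, 0.0f, c, unfused);
    for(int i = 0; i < m * n; i++)
        unfused[i] += c[i];

    // Fused: one gemm with beta = 1 and c supplied as the C matrix.
    gemm(m, n, k, 1.0f, a, b, 1.0f, c, fused);

    assert(unfused == fused);
    return 0;
}

The matcher's used_once() and standard_shape() constraints presumably exist so the C operand can be consumed in place by the BLAS call without aliasing or layout surprises.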
src/targets/gpu/gemm_impl.cpp
 #include <rocblas.h>
 #include <migraphx/gpu/gemm_impl.hpp>
+#include <migraphx/reduce_dims.hpp>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -27,6 +28,22 @@ rocblas_datatype get_type(shape::type_t type)
     MIGRAPHX_THROW("ROCBLAS_GEMM: data type not supported!");
 }
 
+void blas_shape(const shape& s)
+{
+    if(s.lens().size() < 2)
+        return;
+    if(std::none_of(s.strides().end() - 2, s.strides().end(), [&](auto i) { return i == 1; }))
+        MIGRAPHX_THROW("GPU_GEMM: needs to have one matrix stride as 1");
+    if(s.lens().size() < 3)
+        return;
+    shape batch_shape{s.type(),
+                      {s.lens().begin(), s.lens().end() - 2},
+                      {s.strides().begin(), s.strides().end() - 2}};
+    auto batch_shapes = reduce_dims({batch_shape});
+    if(batch_shapes.front().lens().size() != 1)
+        MIGRAPHX_THROW("GPU_GEMM: Batch dimension is not collapsible");
+}
+
 template <class R, class... Ts, class... Us>
 R rocblas_invoke(R (*f)(Ts...), Us... xs)
 {
@@ -36,6 +53,18 @@ R rocblas_invoke(R (*f)(Ts...), Us... xs)
         return f(xs..., nullptr, nullptr);
 }
 
+static bool is_transposed(const shape& s)
+{
+    if(not s.transposed())
+        return false;
+    return s.strides().back() != 1;
+}
+
+static rocblas_int get_batch_stride(const argument& a)
+{
+    return a.get_shape().strides()[a.get_shape().strides().size() - 3];
+}
+
 template <class T>
 void gemm_impl(context& ctx,
                const shape& output_shape,
@@ -45,8 +74,8 @@ void gemm_impl(context& ctx,
                bool int8_x4_format,
                bool compute_fp32)
 {
-    bool transa = args[0].get_shape().transposed();
-    bool transb = args[1].get_shape().transposed();
+    bool transa = is_transposed(args[0].get_shape());
+    bool transb = is_transposed(args[1].get_shape());
     auto n_dim  = output_shape.lens().size();
     auto dim_1  = n_dim - 1;
     auto dim_0  = n_dim - 2;
@@ -142,6 +171,9 @@ void gemm_impl(context& ctx,
     }
     else
     {
+        auto a_stride = get_batch_stride(args[0]);
+        auto b_stride = get_batch_stride(args[1]);
+        auto c_stride = get_batch_stride(args[2]);
         rocblas_invoke(&rocblas_gemm_strided_batched_ex,
                        ctx.get_stream().get_rocblas(),
                        transb ? rocblas_operation_transpose : rocblas_operation_none,
@@ -153,20 +185,20 @@ void gemm_impl(context& ctx,
                        to_pointer(args.at(1)),
                        arg_type,
                        ldb,
-                       k * n,
+                       b_stride,
                        to_pointer(args.at(0)),
                        arg_type,
                        lda,
-                       m * k,
+                       a_stride,
                        beta_v,
                        to_pointer(args[2]),
                        output_type,
                        ldc,
-                       m * n,
+                       c_stride,
                        is_3inputs ? to_pointer(args[3]) : to_pointer(args[2]),
                        output_type,
                        ldc,
-                       m * n,
+                       c_stride,
                        num_matrices,
                        compute_type,
                        rocblas_gemm_algo_standard,
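Note: the hard-coded batch strides k * n, m * k, and m * n assumed densely packed batches; get_batch_stride instead reads the actual stride of the third-from-last dimension, which also covers inputs whose batch dimension is not packed (e.g. a strided view into a larger buffer). A small sketch of the difference, assuming MIGraphX-style row-major lens/strides (names illustrative):

#include <cstdio>
#include <vector>

// The batch stride of a rank>=3 tensor is the stride of the dim just before
// the 2-D matrix dims; for packed tensors it equals rows * cols, but a
// batched slice of a larger buffer has a bigger stride.
long batch_stride(const std::vector<long>& strides)
{
    return strides[strides.size() - 3];
}

int main()
{
    // Packed {batch=4, m=2, k=3}: strides {6, 3, 1}; batch stride == m * k.
    std::vector<long> packed = {6, 3, 1};
    // The same 4x2x3 view taken out of a 4x2x8 buffer: strides {16, 8, 1};
    // the batch stride is 16, not m * k = 6.
    std::vector<long> sliced = {16, 8, 1};

    std::printf("packed: %ld, sliced: %ld\n",
                batch_stride(packed), batch_stride(sliced));
    return 0;
}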
src/targets/gpu/include/migraphx/gpu/gemm.hpp
@@ -18,6 +18,8 @@ namespace gpu {
 struct context;
 
+void blas_shape(const shape& s);
+
 template <class Op>
 struct rocblas_gemm
 {
@@ -50,13 +52,14 @@ struct rocblas_gemm
         std::vector<shape> in_shapes(inputs);
         in_shapes.pop_back();
         check_shapes{in_shapes, *this}.not_broadcasted();
-        batch_not_transposed(inputs[0].strides());
-        batch_not_transposed(inputs[1].strides());
+        blas_shape(inputs[0]);
+        blas_shape(inputs[1]);
 
         // if gemm and add are fused
-        if(not float_equal(beta, 0))
+        if(in_shapes.size() > 2)
         {
             auto cmat_shape = in_shapes.back();
             in_shapes.pop_back();
+            blas_shape(cmat_shape);
             auto op_out_shape = op.compute_shape(in_shapes);
             if(cmat_shape.lens() != op_out_shape.lens())
             {
@@ -71,6 +74,7 @@ struct rocblas_gemm
                                to_string(cmat_shape.type()) +
                                ", it must be: " + to_string(op_out_shape.type()));
             }
+            return op_out_shape;
         }
 
         return op.compute_shape(in_shapes);
@@ -96,28 +100,6 @@ struct rocblas_gemm
         return args.back();
     }
 
-    void batch_not_transposed(const std::vector<std::size_t>& strides) const
-    {
-        if(strides.size() <= 2)
-            return;
-        auto dim_0       = strides.size() - 2;
-        auto matrix_size = std::max(strides[dim_0], strides[dim_0 + 1]);
-        std::vector<std::size_t> batch(strides.begin(), strides.begin() + dim_0);
-        if(std::all_of(batch.begin(), batch.end(), [&](auto i) { return (i < matrix_size); }))
-        {
-            MIGRAPHX_THROW("GPU_GEMM: matrix size and batch size {" +
-                           to_string_range(strides) + "} are transposed!");
-        }
-
-        if(std::adjacent_find(batch.begin(), batch.end(), [&](auto i, auto j) {
-               return (i < j or i < matrix_size or j < matrix_size);
-           }) != batch.end())
-        {
-            MIGRAPHX_THROW("GPU_GEMM: batch size {" + to_string_range(strides) +
-                           "} is transposed!");
-        }
-    }
-
     std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
     {
         return shapes.size() - 1;
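Note: blas_shape (defined in gemm_impl.cpp above and now called here in place of batch_not_transposed) requires that all leading batch dimensions collapse, via reduce_dims, into a single strided batch, since rocblas_gemm_strided_batched_ex takes exactly one batch count and one batch stride. A standalone sketch of that collapsibility condition (illustrative, not the reduce_dims implementation):

#include <cstddef>
#include <cstdio>
#include <vector>

// Two adjacent dims can be merged when the outer stride walks exactly one
// full inner extent: strides[i] == strides[i + 1] * lens[i + 1].
bool collapsible(const std::vector<long>& lens, const std::vector<long>& strides)
{
    for(std::size_t i = 0; i + 1 < lens.size(); i++)
        if(strides[i] != strides[i + 1] * lens[i + 1])
            return false;
    return true;
}

int main()
{
    // Batch dims {2, 3} with strides {30, 10}: 30 == 10 * 3, collapses to {6}.
    std::printf("%d\n", collapsible({2, 3}, {30, 10})); // prints 1
    // Batch dims {2, 3} with strides {40, 10}: a gap between batches, so no
    // single batch stride exists and the GEMM shape would be rejected.
    std::printf("%d\n", collapsible({2, 3}, {40, 10})); // prints 0
    return 0;
}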
src/targets/gpu/jit/gathernd.cpp
@@ -19,7 +19,7 @@ namespace gpu {
 // NOLINTNEXTLINE
 static const char* const gathernd_kernel = R"__migraphx__(
 #include <migraphx/kernels/gathernd.hpp>
-#include <migraphx/kernels/basic_ops.hpp>
+#include <migraphx/kernels/ops.hpp>
 #include <migraphx/kernels/integral_constant.hpp>
 #include <migraphx/kernels/generic_constant.hpp>
 #include <args.hpp>
src/targets/gpu/jit/roialign.cpp
@@ -19,7 +19,6 @@ namespace gpu {
 // NOLINTNEXTLINE
 static const char* const roialign_kernel = R"__migraphx__(
 #include <migraphx/kernels/roialign.hpp>
-#include <migraphx/kernels/basic_ops.hpp>
 #include <migraphx/kernels/integral_constant.hpp>
 #include <migraphx/kernels/generic_constant.hpp>
 #include <args.hpp>
src/targets/gpu/jit/scatternd.cpp
@@ -19,7 +19,6 @@ namespace gpu {
 // NOLINTNEXTLINE
 static const char* const scatternd_kernel = R"__migraphx__(
 #include <migraphx/kernels/scatternd.hpp>
-#include <migraphx/kernels/basic_ops.hpp>
 #include <migraphx/kernels/integral_constant.hpp>
 #include <migraphx/kernels/generic_constant.hpp>
 #include <args.hpp>
src/targets/gpu/kernels/include/migraphx/kernels/array.hpp
@@ -146,8 +146,8 @@ struct array
     constexpr array carry(array result) const
     {
-        uint32_t overflow = 0;
-        for(std::ptrdiff_t i = result.size() - 1; i > 0; i--)
+        index_int overflow = 0;
+        for(diff_int i = result.size() - 1; i > 0; i--)
         {
             auto z = result[i] + overflow;
             // Reset overflow
src/targets/gpu/kernels/include/migraphx/kernels/basic_ops.hpp (deleted, 100755 → 0)
-#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_BASIC_OPS_HPP
-#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_BASIC_OPS_HPP
-
-#include <migraphx/kernels/types.hpp>
-
-namespace migraphx {
-
-struct sum
-{
-    template <class T, class U>
-    constexpr auto operator()(T x, U y) const
-    {
-        return x + y;
-    }
-};
-
-struct product
-{
-    template <class T, class U>
-    constexpr auto operator()(T x, U y) const
-    {
-        return x * y;
-    }
-};
-
-struct id
-{
-    template <class T>
-    constexpr auto operator()(T x) const
-    {
-        return x;
-    }
-};
-
-struct mean
-{
-    size_t item_num = 1;
-
-    template <class T>
-    constexpr auto operator()(T x) const
-    {
-        return x / static_cast<T>(item_num);
-    }
-};
-
-struct max_f
-{
-    template <class T, class U>
-    constexpr auto operator()(T x, U y) const
-    {
-        return (x > y) ? x : y;
-    }
-};
-inline constexpr auto max = max_f{};
-
-struct min_f
-{
-    template <class T, class U>
-    constexpr auto operator()(T x, U y) const
-    {
-        return (x < y) ? x : y;
-    }
-};
-inline constexpr auto min = min_f{};
-
-struct lowest
-{
-    template <class T>
-    constexpr operator T() const
-    {
-        return std::numeric_limits<T>::lowest();
-    }
-};
-
-struct highest
-{
-    template <class T>
-    constexpr operator T() const
-    {
-        return std::numeric_limits<T>::max();
-    }
-};
-
-} // namespace migraphx
-#endif // MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_BASIC_OPS_HPP
src/targets/gpu/kernels/include/migraphx/kernels/functional.hpp
@@ -137,7 +137,7 @@ constexpr auto by(F f)
 template <class F, class... Ts>
 constexpr void each_args(F f, Ts&&... xs)
 {
-    swallow{(f(std::forward<Ts>(xs)), 0)...};
+    swallow{(f(static_cast<Ts&&>(xs)), 0)...};
 }
 
 template <class F>
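Note: static_cast<Ts&&>(xs) is exactly what std::forward<Ts>(xs) is specified to do; spelling the cast directly lets this kernel header avoid depending on a host standard-library header. A host-side check that the two spellings agree in value category:

#include <type_traits>
#include <utility>

// For a forwarding reference parameter T&& x, std::forward<T>(x) returns
// static_cast<T&&>(x); reference collapsing makes both yield an lvalue
// reference when T deduces as U&, and an rvalue reference otherwise.
template <class T>
void check(T&& x)
{
    static_assert(std::is_same<decltype(std::forward<T>(x)),
                               decltype(static_cast<T&&>(x))>::value,
                  "both spellings produce the same value category");
}

int main()
{
    int i = 0;
    check(i);  // T = int&  -> both are int&
    check(42); // T = int   -> both are int&&
    return 0;
}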
src/targets/gpu/kernels/include/migraphx/kernels/iota_iterator.hpp
@@ -13,7 +13,7 @@ struct basic_iota_iterator
     F f;
 
     using difference_type = diff_int;
-    using reference       = decltype(f(std::declval<Iterator>()));
+    using reference       = decltype(f(declval<Iterator>()));
     using value_type      = remove_reference_t<reference>;
     using pointer         = add_pointer_t<value_type>;
src/targets/gpu/kernels/include/migraphx/kernels/roialign.hpp
@@ -3,14 +3,15 @@
 #include <migraphx/kernels/index.hpp>
 #include <migraphx/kernels/dfor.hpp>
-#include <migraphx/kernels/basic_ops.hpp>
+#include <migraphx/kernels/ops.hpp>
+#include <migraphx/kernels/math.hpp>
 #include <migraphx/kernels/array.hpp>
 
 namespace migraphx {
 
 struct max_pool
 {
-    MIGRAPHX_DEVICE_CONSTEXPR auto init() { return lowest(); }
+    MIGRAPHX_DEVICE_CONSTEXPR auto init() { return lowest{}; }
 
     template <class T>
     MIGRAPHX_DEVICE_CONSTEXPR T operator()(T x, T y)
@@ -55,7 +56,7 @@ MIGRAPHX_DEVICE_CONSTEXPR typename Iterator::value_type bilinear_interpolate(
             return 0;
         }
-        xy[ii]   = max(xy[ii], 0.0f);
+        xy[ii]   = migraphx::max(xy[ii], 0.0f);
         low[ii]  = xy[ii];
         high[ii] = low[ii] + 1;
         if(low[ii] >= dims[ii] - 1)
@@ -164,11 +165,12 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t,
     for(index_int ii = 0; ii < roi_size.size(); ++ii)
     {
         roi_size[ii] = roi_ends[ii] - roi_starts[ii];
-        roi_size[ii] = max(roi_size[ii], 1.0f);
+        roi_size[ii] = migraphx::max(roi_size[ii], 1.0f);
 
         bin_size[ii]      = roi_size[ii] / out_dims[ii];
         bin_grid_size[ii] = (s.sampling_ratio > 0)
                                 ? s.sampling_ratio
-                                : std::ceil(roi_size[ii] / out_dims[ii]);
+                                : migraphx::ceil(roi_size[ii] / out_dims[ii]);
     }
     const auto offset_x = x + ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]);
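Note: a guess at the motivation for qualifying these calls: with the kernel max/ceil now coming from ops.hpp and math.hpp, an unqualified name could collide with overloads that other headers (for instance HIP's device built-ins) place in scope, so the call sites are pinned to the migraphx namespace. A host-side illustration of the kind of ambiguity that qualification sidesteps (both namespaces here are hypothetical stand-ins):

#include <cstdio>

namespace migraphx {
template <class T>
constexpr T max(T x, T y) { return x < y ? y : x; }
} // namespace migraphx

namespace other { // stands in for another header's overload set
template <class T>
constexpr T max(T x, T y) { return x < y ? y : x; }
} // namespace other

int main()
{
    using namespace migraphx;
    using namespace other;
    // auto a = max(1.0f, 2.0f);        // error: ambiguous between the two
    auto b = migraphx::max(1.0f, 2.0f); // qualified call is unambiguous
    std::printf("%f\n", b);
    return 0;
}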
src/targets/gpu/kernels/include/migraphx/kernels/tensor_view.hpp
@@ -11,7 +11,7 @@ template <class T>
 struct tensor_view_iterator_read
 {
     T* view;
-    constexpr auto& operator()(std::size_t n) const
+    constexpr auto& operator()(index_int n) const
     {
         MIGRAPHX_ASSERT(view != nullptr);
         return (*view)[n];
src/targets/gpu/kernels/include/migraphx/kernels/type_traits.hpp
@@ -35,6 +35,21 @@ struct enable_if<true, T>
 template <bool B, class T = void>
 using enable_if_t = typename enable_if<B, T>::type;
 
+template <bool B, class T, class F>
+struct conditional
+{
+    using type = T;
+};
+
+template <class T, class F>
+struct conditional<false, T, F>
+{
+    using type = F;
+};
+
+template <bool B, class T, class F>
+using conditional_t = typename conditional<B, T, F>::type;
+
 // NOLINTNEXTLINE
 #define MIGRAPHX_BUILTIN_TYPE_TRAIT1(name) \
     template <class T>                     \
src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp
@@ -79,7 +79,7 @@ __device__ __host__ auto as_vec(T* x)
 }
 
 template <class T, index_int N>
-using safe_vec = vec<std::conditional_t<std::is_same<T, bool>{}, uint8_t, T>, N>;
+using safe_vec = vec<conditional_t<is_same<T, bool>{}, uint8_t, T>, N>;
 
 template <class... Ts>
 constexpr auto vec_transform(Ts... xs)
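Note: the conditional/conditional_t added to type_traits.hpp above is a freestanding replacement for std::conditional_t, so safe_vec no longer pulls in the host <type_traits>. Usage mirrors the standard trait; a minimal host-side sketch (std::is_same is used here only to verify the result):

#include <cstdint>
#include <type_traits>

// Same shape as the trait added in type_traits.hpp: the primary template
// picks T, and the false specialization picks F.
template <bool B, class T, class F>
struct conditional
{
    using type = T;
};

template <class T, class F>
struct conditional<false, T, F>
{
    using type = F;
};

template <bool B, class T, class F>
using conditional_t = typename conditional<B, T, F>::type;

// As used by safe_vec: bool lanes are stored as uint8_t, anything else as-is.
static_assert(std::is_same<conditional_t<true, std::uint8_t, float>, std::uint8_t>::value, "");
static_assert(std::is_same<conditional_t<false, std::uint8_t, float>, float>::value, "");

int main() { return 0; }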
src/targets/gpu/pack_int8_args.cpp
@@ -22,10 +22,10 @@ static instruction_ref pad_ins(module& m, instruction_ref ins, int offset)
     auto pad_k    = (k + 3) / 4 * 4;
     auto pad_lens = lens;
     pad_lens[lens.size() + offset] = pad_k;
-    std::vector<int64_t> pad_dims(lens.size() * 2, 0);
     auto ret_ins = ins;
     if(pad_k != k)
     {
+        std::vector<int64_t> pad_dims(lens.size() * 2, 0);
         pad_dims[lens.size() + offset] = pad_k - k;
         shape ps{s.type(), pad_lens};
         auto ins_out =
test/onnx/gen_onnx.py
@@ -3215,6 +3215,20 @@ def mean_test():
     return ([node], data, [mean])
 
 
+@onnx_test
+def mean_integral_test():
+    data = [
+        helper.make_tensor_value_info(str(i), TensorProto.INT32, [2, 2, 2])
+        for i in range(10)
+    ]
+    data_names = [str(i) for i in range(10)]
+    mean = helper.make_tensor_value_info('mean', TensorProto.INT32, [2, 2, 2])
+
+    node = onnx.helper.make_node("Mean", inputs=data_names, outputs=["mean"])
+
+    return ([node], data, [mean])
+
+
 @onnx_test
 def min_test():
     a = helper.make_tensor_value_info('0', TensorProto.FLOAT, [3])
test/onnx/mean_integral_test.onnx (new file, 0 → 100644)
Binary ONNX protobuf (raw bytes omitted): graph "mean_integral_test" containing a single Mean node that takes inputs "0" through "9" and produces the output "mean". No newline at end of file.