Merge branch 'develop' into resnet50_partition

70fe454f · Umang Yadav · ea0b8059 · f47e0b5b · 70fe454f · 70fe454f
Commit 70fe454f authored Oct 20, 2023 by Umang Yadav
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -76,7 +76,7 @@ include(ROCMSetupVersion)
 option(BUILD_DEV "Build for development purpose only" OFF)

 rocm_setup_version(VERSION 2.8.0)
-set(MIGRAPHX_SO_VERSION ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH})
+set(MIGRAPHX_SO_VERSION ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR})

 option( BUILD_SHARED_LIBS "Build as a shared library" ON )


--- a/docs/.sphinx/requirements.txt
+++ b/docs/.sphinx/requirements.txt
@@ -35,7 +35,7 @@ fastjsonschema==2.16.3
    # via rocm-docs-core
 gitdb==4.0.10
    # via gitpython
-gitpython==3.1.32
+gitpython==3.1.37
    # via rocm-docs-core
 idna==3.4
    # via requests
@@ -87,7 +87,7 @@ requests==2.28.2
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==0.24.2
+rocm-docs-core==0.26.0
    # via -r requirements.in
 smmap==5.0.0
    # via gitdb
@@ -130,7 +130,7 @@ sphinxcontrib-serializinghtml==1.1.5
    # via sphinx
 typing-extensions==4.5.0
    # via pydata-sphinx-theme
-urllib3==1.26.15
+urllib3==1.26.18
    # via requests
 wrapt==1.15.0
    # via deprecated
--- a/src/include/migraphx/argument.hpp
+++ b/src/include/migraphx/argument.hpp
@@ -46,7 +46,7 @@ struct MIGRAPHX_EXPORT argument : raw_data<argument>
 {
    argument() = default;

-    argument(const shape& s);
+    explicit argument(const shape& s);

    template <class F, MIGRAPHX_REQUIRES(std::is_pointer<decltype(std::declval<F>()())>{})>
    argument(shape s, F d)

--- a/src/include/migraphx/op/allocate.hpp
+++ b/src/include/migraphx/op/allocate.hpp
@@ -88,13 +88,13 @@ struct allocate
    {
        if(args.empty())
        {
-            return {output_shape};
+            return argument{output_shape};
        }
        else
        {
            std::vector<std::size_t> output_dims(output_shape.ndim());
            args.at(0).visit([&](auto a) { output_dims.assign(a.begin(), a.end()); });
-            return {shape{buf_type, output_dims}};
+            return argument{shape{buf_type, output_dims}};
        }
    }
 };

--- a/src/include/migraphx/op/pooling.hpp
+++ b/src/include/migraphx/op/pooling.hpp
@@ -411,7 +411,7 @@ struct pooling
            // for dynamic GlobalPooling, there's no padding
            kernel_dims.insert(kernel_dims.end(), input_lens.begin() + 2, input_lens.end());
            output_shape = dyn_out.computed_shape;
-            result       = dyn_out.computed_shape;
+            result       = argument{dyn_out.computed_shape};
        }
        else if((padding_mode != op::padding_mode_t::default_))
        {
@@ -439,7 +439,7 @@ struct pooling
        {
            kernel_dims  = this->lengths;
            output_shape = dyn_out.computed_shape;
-            result       = dyn_out.computed_shape;
+            result       = argument{dyn_out.computed_shape};
        }

        // Perform the computation and populate result

--- a/src/onnx/parse_mean_variance_normalization.cpp
+++ b/src/onnx/parse_mean_variance_normalization.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/onnx/op_parser.hpp>
+#include <migraphx/ranges.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/onnx/checks.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace onnx {
+
+// Lowers the ONNX MeanVarianceNormalization operator to primitive instructions that compute
+// (X - E[X]) / (sqrt(E[X^2] - E[X]^2) + epsilon), with both means reduced over `axes`.
+struct parse_mean_variance_normalization : op_parser<parse_mean_variance_normalization>
+{
+    std::vector<op_desc> operators() const { return {{"MeanVarianceNormalization"}}; }
+
+    // Builds the normalization sub-graph for a single node.
+    // args.front() is the input tensor. The optional "axes" attribute selects the reduction
+    // axes; when absent, {0, 2, 3} is used and the input must be rank 4.
+    // Throws when axes is absent on a non-rank-4 input, or when the number of axes differs
+    // from rank - 1.
+    instruction_ref parse(const op_desc& /*opd*/,
+                          const onnx_parser& /*parser*/,
+                          onnx_parser::node_info info,
+                          std::vector<instruction_ref> args) const
+    {
+        auto&& input          = args.front();
+        const auto input_type = input->get_shape().type();
+        const auto input_rank = input->get_shape().ndim();
+        const bool has_axes   = contains(info.attributes, "axes");
+
+        // The default axes only make sense for a rank-4 input.
+        if(not has_axes and input_rank != 4)
+        {
+            MIGRAPHX_THROW(
+                "Input tensor needs to be rank 4 when axes is not specified. Instead it is rank " +
+                std::to_string(input_rank));
+        }
+
+        std::vector<int64_t> axes{0, 2, 3};
+        if(has_axes)
+        {
+            const auto& axes_attr = info.attributes.at("axes").ints();
+            axes.assign(axes_attr.begin(), axes_attr.end());
+        }
+
+        if(axes.size() != input_rank - 1)
+        {
+            MIGRAPHX_THROW("Length of axes array needs to be equal to input tensor rank - 1");
+        }
+
+        const auto mean_op = make_op("reduce_mean", {{"axes", axes}});
+
+        // E[X] and E[X]^2
+        auto mean         = info.add_instruction(mean_op, input);
+        auto mean_squared = info.add_common_op("mul", mean, mean);
+
+        // X^2 and E[X^2]
+        auto squared      = info.add_common_op("mul", input, input);
+        auto squared_mean = info.add_instruction(mean_op, squared);
+
+        // Standard deviation: sqrt(E[X^2] - E[X]^2)
+        auto variance = info.add_common_op("sub", squared_mean, mean_squared);
+        auto std_dev  = info.add_common_op("sqrt", variance);
+
+        auto centered = info.add_common_op("sub", input, mean);
+        // A larger epsilon is used for half-precision inputs than for the other types.
+        auto epsilon =
+            info.add_literal({input_type, {input_type == shape::half_type ? 1e-7 : 1e-9}});
+        auto denominator = info.add_common_op("add", std_dev, epsilon);
+
+        return info.add_common_op("div", centered, denominator);
+    }
+};
+
+} // namespace onnx
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/onnx/parse_trilu.cpp
+++ b/src/onnx/parse_trilu.cpp
@@ -56,9 +56,6 @@ struct parse_trilu : op_parser<parse_trilu>
            k = arg_k.at<int>();
        }

-        if(k < 0)
-            MIGRAPHX_THROW("PARSE_TRILU: negative k values not supported");
-
        if(contains(info.attributes, "upper"))
        {
            upper = static_cast<bool>(info.attributes.at("upper").i());
@@ -69,9 +66,12 @@ struct parse_trilu : op_parser<parse_trilu>
        // when creating the mask, if upper == 1,
        // the inner triangle will have values set to 0
        std::vector<bool> mask_mat(num_rows * num_cols, upper);
+        // if upper == 0, kth diagonal must also be masked
+        if(not upper)
+            k++;
        for(size_t i = 0; i < num_rows; i++)
        {
-            for(size_t j = 0; j < std::min(k, static_cast<int>(num_cols)); j++)
+            for(int j = 0; j < std::min(k, static_cast<int>(num_cols)); j++)
            {
                mask_mat[i * num_cols + j] = not upper;
            }

--- a/src/rewrite_quantization.cpp
+++ b/src/rewrite_quantization.cpp
@@ -33,6 +33,8 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK_WORKAROUNDS);
+
 void apply_quantizelinear(module& m, instruction_ref ins)
 {
    assert(ins->name() == "quantizelinear");
@@ -62,9 +64,22 @@ void apply_quantizelinear(module& m, instruction_ref ins)
        max_quant = qt.max();
        min_quant = qt.min();
    });
-    auto s        = add_zero_point->get_shape();
-    auto min_arg  = m.add_literal(literal{shape{s.type()}, {min_quant}});
-    auto max_arg  = m.add_literal(literal{shape{s.type()}, {max_quant}});
+    auto s = add_zero_point->get_shape();
+    instruction_ref min_arg;
+    instruction_ref max_arg;
+
+    if(enabled(MIGRAPHX_ENABLE_CK_WORKAROUNDS{}))
+    {
+        std::vector<int> min_data(s.elements(), min_quant);
+        std::vector<int> max_data(s.elements(), max_quant);
+        min_arg = m.add_literal(literal(s, min_data));
+        max_arg = m.add_literal(literal(s, max_data));
+    }
+    else
+    {
+        min_arg = m.add_literal(literal{shape{s.type()}, {min_quant}});
+        max_arg = m.add_literal(literal{shape{s.type()}, {max_quant}});
+    }
    auto saturate = insert_common_op(m, ins, make_op("clip"), {add_zero_point, min_arg, max_arg});
    m.replace_instruction(
        ins, make_op("convert", {{"target_type", ins->get_shape().type()}}), saturate);

--- a/src/targets/gpu/compile_hip_code_object.cpp
+++ b/src/targets/gpu/compile_hip_code_object.cpp
@@ -139,6 +139,12 @@ void hip_compile_options::set_launch_params(
        global = compute_global(local);
 }

+// Reports whether hip_has_flags() accepts "-fno-offload-uniform-block".
+// The answer is computed on the first call and reused afterwards via the function-local static.
+static bool hip_accept_non_uniform_wg()
+{
+    static const bool accepted = hip_has_flags({"-fno-offload-uniform-block"});
+    return accepted;
+}
+
 std::function<std::size_t(std::size_t local)>
 compute_global_for(context& ctx, std::size_t n, std::size_t over)
 {
@@ -146,13 +152,14 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over)
    std::size_t max_global = ctx.get_current_device().get_cu_count() *
                             ctx.get_current_device().get_max_workitems_per_cu();
    return [n, over, max_global](std::size_t local) {
-        // hip require global workitems multiple of local workitems. It may degrade performance.
-        // [TODO]: consider adding "fno-hip-uniform-block" flag when it becomes available.
-        // https://reviews.llvm.org/D155213
-        std::size_t num_elements = ((n + local - 1) / local) * local;
-        std::size_t groups       = (num_elements + local - 1) / local;
-        std::size_t max_blocks   = max_global / local;
-        std::size_t nglobal      = std::min(max_blocks * over, groups) * local;
+        std::size_t num_elements = n;
+        if(not hip_accept_non_uniform_wg())
+        {
+            num_elements = (1 + (n - 1) / local) * local;
+        }
+        std::size_t groups     = 1 + (num_elements - 1) / local;
+        std::size_t max_blocks = max_global / local;
+        std::size_t nglobal    = std::min(max_blocks * over, groups) * local;
        return std::min(nglobal, num_elements);
    };
 }
@@ -183,6 +190,11 @@ operation compile_hip_code_object(const std::string& content, hip_compile_option
        generate_args_hpp(options.virtual_inputs.empty() ? options.inputs : options.virtual_inputs);
    srcs.emplace_back("args.hpp", args_hpp);

+    if(options.global % options.local != 0 and hip_accept_non_uniform_wg())
+        options.params += " -fno-offload-uniform-block";
+    else
+        assert(options.global % options.local == 0);
+
    options.params += " -DMIGRAPHX_NGLOBAL=" + std::to_string(options.global);
    options.params += " -DMIGRAPHX_NLOCAL=" + std::to_string(options.local);
    options.params += " " + join_strings(compiler_warnings(), " ");

--- a/src/targets/gpu/fuse_ck.cpp
+++ b/src/targets/gpu/fuse_ck.cpp
@@ -26,6 +26,7 @@
 #include <migraphx/matcher.hpp>
 #include <migraphx/pass_manager.hpp>
 #include <migraphx/register_op.hpp>
+#include <migraphx/gpu/device_name.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -92,6 +93,8 @@ MIGRAPHX_PRED_MATCHER(is_ck_gemm, instruction_ref ins)
    auto m = a.lens()[a.lens().size() - 2];
    auto n = b.lens().back();
    auto k = a.lens().back();
+    auto batch_size = std::accumulate(
+        a.lens().rbegin() + 2, a.lens().rend(), std::size_t{1}, std::multiplies<std::size_t>());
    // Integer gemms must be divisible by 4 in ck
    if(contains({shape::int8_type, shape::int32_type}, ins->get_shape().type()))
    {
@@ -102,9 +105,17 @@ MIGRAPHX_PRED_MATCHER(is_ck_gemm, instruction_ref ins)
        if(k % 4 != 0)
            return false;
    }
-    // Skipping GEMMs with a K dimension greater than 2048 is a course-grained strategy
-    // to avoid poor-performing GEMM kernels from CK
-    // To-do: Investigate a more precise strategy
+    auto device_name = trim(split_string(get_device_name(), ':').front());
+    if(device_name == "gfx940")
+    {
+        if(ins->get_shape().type() == shape::half_type)
+        {
+            if(batch_size >= 64)
+                return m < 2048 or k <= 64 or n <= 384 or n >= 2048;
+            return true;
+        }
+        return true;
+    }
    return k <= 2048;
 }

@@ -140,6 +151,10 @@ struct find_ck_gemm_pointwise
               return not input->inputs().empty() and input->inputs().front()->name() == "capture";
           }))
            return;
+        if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [](auto input) {
+               return not input->inputs().empty() and input->inputs().front()->name() == "capture";
+           }))
+            return;
        assert(gemm_it != inputs.end());
        if(gemm_idx != 0)
        {

--- a/src/targets/gpu/include/migraphx/gpu/convolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/convolution.hpp
@@ -199,9 +199,9 @@ struct miopen_convolution
            // MIOpen has APIs to pass pre-allocated buffers starting from rocm-5.6
            preallocate = true;
 #endif
-            auto x = preallocate ? to_gpu(generate_argument(x_shape)) : inputs[0];
-            auto w = preallocate ? to_gpu(generate_argument(w_shape)) : inputs[1];
-            auto y = preallocate ? allocate_gpu(output_shape) : inputs[2];
+            auto x = preallocate ? to_gpu(generate_argument(x_shape)) : argument{inputs[0]};
+            auto w = preallocate ? to_gpu(generate_argument(w_shape)) : argument{inputs[1]};
+            auto y = preallocate ? allocate_gpu(output_shape) : argument{inputs[2]};
            auto workspace =
                preallocate ? allocate_gpu(workspace_shape) : migraphx::argument(workspace_shape);


--- a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
@@ -31,6 +31,14 @@
 #include <migraphx/kernels/debug.hpp>
 #include <migraphx/kernels/functional.hpp>

+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-identifier"
+extern "C" __device__ size_t __ockl_get_enqueued_local_size(uint); // NOLINT
+extern "C" __device__ size_t __ockl_get_local_size(uint);          // NOLINT
+#pragma clang diagnostic pop
+#endif
+
 namespace migraphx {

 #if defined(MIGRAPHX_NGLOBAL) && defined(MIGRAPHX_NLOCAL)
@@ -45,43 +53,37 @@ inline __device__ __attribute__((const)) index_int compute_global_size()
    // This actually works even when global is not divisible by local size.
    // This doesn't actually do a multiplication. Instead it calls a device
    // function to get the global size, which is why it works.
-    return blockDim.x * gridDim.x;  // NOLINT
+    return blockDim.x * gridDim.x; // NOLINT
 #endif
 }

-// We cant just use blockDim.x to get the local size since its broken on hip
-// when global is not divisible by local size. In this case, we calulate the
-// size for the last group.
+#ifdef MIGRAPHX_NGROUP
+// If global is divisible by local then local can be a const
+#if(MIGRAPHX_NGLOBAL % MIGRAPHX_NLOCAL == 0) || (MIGRAPHX_NGROUP == 1)
+#define MIGRAPHX_HAS_CONST_LOCAL 1
+#endif
+#endif
+
 inline __device__ __attribute__((const)) index_int compute_local_size()
 {
-#ifdef MIGRAPHX_NLOCAL
-    const auto nlocal = MIGRAPHX_NLOCAL;
-#else
-    const auto nlocal = blockDim.x; // NOLINT
-#endif
-#ifdef MIGRAPHX_NGROUP
-    const auto ngroup = MIGRAPHX_NGROUP;
+#ifdef MIGRAPHX_HAS_CONST_LOCAL
+    return MIGRAPHX_NLOCAL;
 #else
-    const auto ngroup = gridDim.x;  // NOLINT
+    // Returns block size. For the non-uniform block it returns the size of the non-uniform block.
+    return __ockl_get_local_size(0); // NOLINT
 #endif
-    const auto group_id = blockIdx.x; // NOLINT
-    const auto nglobal  = compute_global_size();
-    if(group_id == ngroup - 1)
-    {
-        return 1 + (nglobal - 1) % nlocal;
-    }
-    else
-    {
-        return nlocal; // NOLINT
-    }
 }

-#ifdef MIGRAPHX_NGROUP
-// If global is divisible by local then local can be a const
-#if(MIGRAPHX_NGLOBAL % MIGRAPHX_NLOCAL == 0) || (MIGRAPHX_NGROUP == 1)
-#define MIGRAPHX_HAS_CONST_LOCAL 1
-#endif
+// Returns the size of the uniform (enqueued) workgroup. When the kernel is compiled with
+// MIGRAPHX_NLOCAL defined, that compile-time constant is returned directly; otherwise the
+// value is queried from the device library.
+inline __device__ __attribute__((const)) index_int compute_max_local_size()
+{
+#ifdef MIGRAPHX_NLOCAL
+    return MIGRAPHX_NLOCAL;
+#else
+    // Returns the block size. When the workgroup has a non-uniform block, this returns the size
+    // of the uniform block.
+    return __ockl_get_enqueued_local_size(0); // NOLINT
 #endif
+}

 struct index
 {
@@ -126,8 +128,8 @@ struct index
 #else
    __device__ index_int max_nlocal() const
    {
-        MIGRAPHX_ASSERT(blockDim.x > 0);
-        return blockDim.x;
+        MIGRAPHX_ASSERT(compute_max_local_size() > 0);
+        return compute_max_local_size();
    }
 #endif

@@ -249,7 +251,8 @@ struct index
 #endif
 inline __device__ __attribute__((const)) index make_index()
 {
-    return index{blockIdx.x * blockDim.x + threadIdx.x, threadIdx.x, blockIdx.x}; // NOLINT
+    return index{
+        blockIdx.x * compute_max_local_size() + threadIdx.x, threadIdx.x, blockIdx.x}; // NOLINT
 }

 } // namespace migraphx

--- a/test/eliminate_allocation_test.cpp
+++ b/test/eliminate_allocation_test.cpp
@@ -55,7 +55,7 @@ struct allocate
                               const migraphx::shape& output_shape,
                               const std::vector<migraphx::argument>&) const
    {
-        return {output_shape};
+        return migraphx::argument{output_shape};
    }
 };


--- a/test/eliminate_concat_test.cpp
+++ b/test/eliminate_concat_test.cpp
@@ -60,7 +60,7 @@ struct concat
                               const migraphx::shape& output_shape,
                               const std::vector<migraphx::argument>&) const
    {
-        return {output_shape};
+        return migraphx::argument{output_shape};
    }
 };

@@ -104,7 +104,7 @@ struct allocate
                               const migraphx::shape& output_shape,
                               const std::vector<migraphx::argument>&) const
    {
-        return {output_shape};
+        return migraphx::argument{output_shape};
    }
 };


--- a/test/memory_coloring_test.cpp
+++ b/test/memory_coloring_test.cpp
@@ -55,7 +55,7 @@ struct allocate
                               const migraphx::shape& output_shape,
                               const std::vector<migraphx::argument>&) const
    {
-        return {output_shape};
+        return migraphx::argument{output_shape};
    }
 };


--- a/test/normalize_ops_test.cpp
+++ b/test/normalize_ops_test.cpp
@@ -57,7 +57,7 @@ struct normalize_test_op
                               const migraphx::shape& output_shape,
                               const std::vector<migraphx::argument>&) const
    {
-        return {output_shape};
+        return migraphx::argument{output_shape};
    }
 };


--- a/test/onnx/.onnxrt-commit
+++ b/test/onnx/.onnxrt-commit
-6d7bc2a097a1a08541cd0d4628831c79ab8092d5
+635d3faa3b3908d2806d009dc6872152cfcfcdda
--- a/test/onnx/gen_onnx.py
+++ b/test/onnx/gen_onnx.py
@@ -4681,6 +4681,77 @@ def mean_integral_test():
    return ([node], data, [mean])


+def mvn_default_axes_test_base(dims, type=TensorProto.FLOAT):
+    """Build a MeanVarianceNormalization graph that leaves the axes attribute unset.
+
+    dims is used as the shape of both the "data" input and the "out" output,
+    and type as their element type. Returns ([node], [input], [output]).
+    """
+    data_info, out_info = (helper.make_tensor_value_info(name, type, dims)
+                           for name in ("data", "out"))
+    mvn_node = helper.make_node("MeanVarianceNormalization",
+                                inputs=["data"],
+                                outputs=["out"])
+
+    return ([mvn_node], [data_info], [out_info])
+
+
+@onnx_test()
+def mvn_default_axes_test():
+    """MeanVarianceNormalization without axes on a rank-4 float input of shape [2, 2, 2, 2]."""
+    return mvn_default_axes_test_base([2, 2, 2, 2])
+
+
+@onnx_test()
+def mvn_default_axes_fp16_test():
+    """MeanVarianceNormalization without axes on a rank-4 FLOAT16 input of shape [2, 2, 2, 2]."""
+    return mvn_default_axes_test_base([2, 2, 2, 2], TensorProto.FLOAT16)
+
+
+@onnx_test()
+def mvn_default_axes_rank_too_small_test():
+    """MeanVarianceNormalization without axes on a rank-3 input ([2, 2, 2]), i.e. below rank 4."""
+    return mvn_default_axes_test_base([2, 2, 2])
+
+
+@onnx_test()
+def mvn_default_axes_rank_too_big_test():
+    """MeanVarianceNormalization without axes on a rank-5 input ([2, 2, 2, 2, 2]), i.e. above rank 4."""
+    return mvn_default_axes_test_base([2, 2, 2, 2, 2])
+
+
+def mvn_n_rank_test_base(axes, dims, type=TensorProto.FLOAT):
+    """Build a MeanVarianceNormalization graph with an explicit axes attribute.
+
+    dims is used as the shape of both the "data" input and the "out" output,
+    and type as their element type. Returns ([node], [input], [output]).
+    """
+    data_info, out_info = (helper.make_tensor_value_info(name, type, dims)
+                           for name in ("data", "out"))
+    mvn_node = helper.make_node("MeanVarianceNormalization",
+                                inputs=["data"],
+                                outputs=["out"],
+                                axes=axes)
+
+    return ([mvn_node], [data_info], [out_info])
+
+
+@onnx_test()
+def mvn_rank_2_test():
+    """MeanVarianceNormalization with axes [1] on a rank-2 float input of shape [2, 2]."""
+    return mvn_n_rank_test_base([1], [2, 2])
+
+
+@onnx_test()
+def mvn_rank_2_fp16_test():
+    """MeanVarianceNormalization with axes [1] on a rank-2 FLOAT16 input of shape [2, 2]."""
+    return mvn_n_rank_test_base([1], [2, 2], TensorProto.FLOAT16)
+
+
+@onnx_test()
+def mvn_rank_3_test():
+    """MeanVarianceNormalization with axes [0, 1] on a rank-3 float input of shape [2, 2, 2]."""
+    return mvn_n_rank_test_base([0, 1], [2, 2, 2])
+
+
+@onnx_test()
+def mvn_rank_3_fp16_test():
+    """MeanVarianceNormalization with axes [0, 1] on a rank-3 FLOAT16 input of shape [2, 2, 2]."""
+    return mvn_n_rank_test_base([0, 1], [2, 2, 2], TensorProto.FLOAT16)
+
+
+@onnx_test()
+def mvn_axes_rank_too_small_test():
+    """MeanVarianceNormalization with three axes ([0, 1, 2]) on a rank-3 input, so len(axes) != rank - 1."""
+    return mvn_n_rank_test_base([0, 1, 2], [2, 2, 2])
+
+
+@onnx_test()
+def mvn_axes_rank_too_big_test():
+    """MeanVarianceNormalization with a single axis ([0]) on a rank-3 input, so len(axes) != rank - 1."""
+    return mvn_n_rank_test_base([0], [2, 2, 2])
+
+
 @onnx_test()
 def min_test():
    a = helper.make_tensor_value_info('0', TensorProto.FLOAT, [3])
@@ -8502,7 +8573,7 @@ def transpose_gather_test():


 @onnx_test()
-def trilu_test():
+def triu_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [3, 4])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [3, 4])

@@ -8515,7 +8586,7 @@ def trilu_test():


 @onnx_test()
-def trilu_batch_diff_k_test():
+def triu_batch_diff_k_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [2, 2, 3])
    k = np.array([2])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 2, 3])
@@ -8533,7 +8604,24 @@ def trilu_batch_diff_k_test():


 @onnx_test()
-def trilu_lower_test():
+def tril_batch_diff_k_test():
+    """Trilu node with upper=0 on a batched [2, 2, 3] float input and initializer k = 2."""
+    shape = [2, 2, 3]
+    diagonal = np.array([2])
+    x_info = helper.make_tensor_value_info('x', TensorProto.FLOAT, shape)
+    y_info = helper.make_tensor_value_info('y', TensorProto.FLOAT, shape)
+    k_init = helper.make_tensor(name='k',
+                                data_type=TensorProto.INT64,
+                                dims=diagonal.shape,
+                                vals=diagonal.astype(np.int64))
+
+    tril_node = onnx.helper.make_node('Trilu',
+                                      inputs=['x', 'k'],
+                                      outputs=['y'],
+                                      upper=0)
+    return ([tril_node], [x_info], [y_info], [k_init])
+
+
+@onnx_test()
+def tril_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [3, 4])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [3, 4])

@@ -8542,7 +8630,7 @@ def trilu_lower_test():


 @onnx_test()
-def trilu_neg_k_test():
+def triu_neg_k_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [3, 4])
    k = np.array([-1])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [3, 4])
@@ -8556,7 +8644,23 @@ def trilu_neg_k_test():


 @onnx_test()
-def trilu_out_k_test():
+def tril_neg_k_test():
+    """Trilu node with upper=0 on a [3, 4] float input and a negative initializer k = -1."""
+    shape = [3, 4]
+    diagonal = np.array([-1])
+    x_info = helper.make_tensor_value_info('x', TensorProto.FLOAT, shape)
+    y_info = helper.make_tensor_value_info('y', TensorProto.FLOAT, shape)
+    k_init = helper.make_tensor(name='k',
+                                data_type=TensorProto.INT64,
+                                dims=diagonal.shape,
+                                vals=diagonal.astype(np.int64))
+    tril_node = onnx.helper.make_node('Trilu',
+                                      inputs=['x', 'k'],
+                                      outputs=['y'],
+                                      upper=0)
+    return ([tril_node], [x_info], [y_info], [k_init])
+
+
+@onnx_test()
+def triu_out_k_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [3, 4])
    k = np.array([5])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [3, 4])
@@ -8570,7 +8674,23 @@ def trilu_out_k_test():


 @onnx_test()
-def trilu_row_one_test():
+def tril_out_k_test():
+    """Trilu node with upper=0 on a [3, 4] float input and initializer k = 5 (beyond both dims)."""
+    shape = [3, 4]
+    diagonal = np.array([5])
+    x_info = helper.make_tensor_value_info('x', TensorProto.FLOAT, shape)
+    y_info = helper.make_tensor_value_info('y', TensorProto.FLOAT, shape)
+    k_init = helper.make_tensor(name='k',
+                                data_type=TensorProto.INT64,
+                                dims=diagonal.shape,
+                                vals=diagonal.astype(np.int64))
+    tril_node = onnx.helper.make_node('Trilu',
+                                      inputs=['x', 'k'],
+                                      outputs=['y'],
+                                      upper=0)
+    return ([tril_node], [x_info], [y_info], [k_init])
+
+
+@onnx_test()
+def triu_row_one_test():
    x = helper.make_tensor_value_info('x', TensorProto.FLOAT, [1, 4])
    k = np.array([1])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [1, 4])
@@ -8587,6 +8707,23 @@ def trilu_row_one_test():
    return ([node], [x], [y], [k_tensor])


+@onnx_test()
+def tril_row_one_test():
+    """Trilu node with upper=0 on a single-row [1, 4] float input and initializer k = 1."""
+    shape = [1, 4]
+    diagonal = np.array([1])
+    x_info = helper.make_tensor_value_info('x', TensorProto.FLOAT, shape)
+    y_info = helper.make_tensor_value_info('y', TensorProto.FLOAT, shape)
+    k_init = helper.make_tensor(name='k',
+                                data_type=TensorProto.INT64,
+                                dims=diagonal.shape,
+                                vals=diagonal.astype(np.int64))
+
+    tril_node = onnx.helper.make_node('Trilu',
+                                      inputs=['x', 'k'],
+                                      outputs=['y'],
+                                      upper=0)
+    return ([tril_node], [x_info], [y_info], [k_init])
+
+
 @onnx_test()
 def undefined_test():
    x = helper.make_tensor_value_info('0', TensorProto.FLOAT, [2, 3, 4, 5])

--- a/test/onnx/mvn_axes_rank_too_big_test.onnx
+++ b/test/onnx/mvn_axes_rank_too_big_test.onnx
--- a/test/onnx/mvn_axes_rank_too_small_test.onnx
+++ b/test/onnx/mvn_axes_rank_too_small_test.onnx