Unverified Commit 32d69e8e authored by Ted Themistokleous, committed by GitHub

Merge branch 'develop' into simplify_1_mul_div_ops

parents 8398fb19 bab9502a
......@@ -138,6 +138,8 @@ struct pointwise_compiler : compiler<pointwise_compiler>
g.add_point_op("less", "migraphx::abs(${0} < ${1})");
g.add_point_op("greater", "migraphx::abs(${0} > ${1})");
g.add_point_op("not", "migraphx::abs(not ${0})");
g.add_point_op("mod", "migraphx::mod(${0}, ${1})");
g.add_point_op("fmod", "migraphx::fmod(${0}, ${1})");
// Add explicit conversions
g.fresult([](const shape& s) {
return "migraphx::convert<" + shape::cpp_type(s.type()) + ">";
......
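For context, the two new point ops map to ONNX Mod's two behaviors: with fmod=0 the result takes the sign of the divisor (Python-style), while fmod=1 takes the sign of the dividend (C-style fmod). A minimal standalone sketch of the difference for the integer/float cases, assuming nothing about migraphx::mod's internals:

#include <cassert>
#include <cmath>

// Python-style modulo: result follows the sign of the divisor.
int py_mod(int a, int b) { return ((a % b) + b) % b; }

int main()
{
    assert(py_mod(-3, 2) == 1);           // "mod"  (ONNX Mod, fmod=0)
    assert(std::fmod(-3.0, 2.0) == -1.0); // "fmod" (ONNX Mod, fmod=1)
    return 0;
}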
......@@ -44,9 +44,14 @@
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device_name.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/gpu/perfdb.hpp>
#include <deque>
#include <variant>
#if defined(MLIR_MIGRAPHX_DIALECT_API_VERSION) && MLIR_MIGRAPHX_DIALECT_API_VERSION >= 2
#define MIGRAPHX_MLIR_BARE_POINTER
#endif
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
......@@ -145,6 +150,12 @@ std::string mlir_print(F f, T x)
return ss.str();
}
const std::unordered_set<std::string>& get_xdlops_archs()
{
static std::unordered_set<std::string> supported_archs{"gfx908", "gfx90a"};
return supported_archs;
}
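A quick sketch of how this set gates the xdlopsV2 attribute further down (arch names illustrative; migraphx::contains here is plain set membership):

#include <iostream>
#include <string>
#include <unordered_set>

int main()
{
    const std::unordered_set<std::string> xdlops_archs{"gfx908", "gfx90a"};
    for(const char* name : {"gfx908", "gfx906"})
        std::cout << name << " -> xdlopsV2 = "
                  << (xdlops_archs.count(name) > 0 ? "true" : "false") << '\n';
    return 0;
}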
struct mlir_program
{
mlir_program()
......@@ -487,6 +498,17 @@ struct mlir_program
ops.add_attribute_value(get_operator_value(ins->get_operator()));
if(ins->name() != "@return")
ops.add_results({get_shape(ins)});
if(ins->name() == "convolution")
{
pp =
problem_params{ins->get_operator(), to_shapes(ins->inputs()), ins->get_shape()};
std::string tuned = get_tune_params();
if(!tuned.empty())
ops.add_attributes({{"perf_config", tuned}});
// check if HW supports xdlops
if(contains(get_xdlops_archs(), target_name))
ops.add_attributes({{"xdlopsV2", true}});
}
std::vector<MlirValue> inputs;
transform(
......@@ -508,14 +530,7 @@ struct mlir_program
// 1st pipeline to call
mlirMIGraphXAddHighLevelPipeline(pm.get());
// 2nd pipeline to call
std::string tname = get_device_name();
// HACK: Since MLIR can't handle the full target name
auto hacked_tname = tname.substr(0, tname.find(':'));
if(tname.size() != hacked_tname.size())
std::cout
<< "*************** WARNING: MLIR may not compile the correct target features for: "
<< tname << std::endl;
mlirMIGraphXAddBackendPipeline(pm.get(), hacked_tname.c_str(), "amdgcn-amd-amdhsa", "");
mlirMIGraphXAddBackendPipeline(pm.get(), target_name.c_str(), "amdgcn-amd-amdhsa", "");
mlirPassManagerRun(pm.get(), mmodule.get());
code_object_op op{};
......@@ -525,6 +540,17 @@ struct mlir_program
return op;
}
void find_target()
{
std::string tname = get_device_name();
// HACK: Since MLIR can't handle the full target name
target_name = trim(split_string(tname, ':').front());
if(tname.size() != target_name.size())
std::cout
<< "*************** WARNING: MLIR may not compile the correct target features for: "
<< tname << std::endl;
}
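find_target() can be exercised in isolation: a full ROCm device name such as "gfx90a:sramecc+:xnack-" (an illustrative value) is trimmed to its base architecture, which is all MLIR's backend pipeline accepts. A minimal sketch mirroring the split:

#include <cassert>
#include <string>

// Mirrors find_target(): keep everything before the first ':'.
std::string base_arch(const std::string& tname)
{
    return tname.substr(0, tname.find(':'));
}

int main()
{
    assert(base_arch("gfx90a:sramecc+:xnack-") == "gfx90a");
    assert(base_arch("gfx908") == "gfx908"); // no feature suffix -> unchanged
    return 0;
}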
std::pair<std::size_t, std::size_t> get_launch_params() const
{
uint32_t attrs[2];
......@@ -545,10 +571,14 @@ struct mlir_program
MIGRAPHX_THROW("Failed to compile mlir program");
}
std::string get_tune_params() { return get_mlir_perf_for_conv(pp); }
mlir_context ctx;
MlirLocation location;
mlir_module mmodule;
problem_params pp;
std::deque<std::string> strings{};
std::string target_name;
};
std::string dump_mlir(const module& m)
......@@ -565,6 +595,7 @@ code_object_op compile_mlir(const context&, const module& m)
if(trace)
std::cout << m << std::endl;
mlir_program mp;
mp.find_target();
mp.parse(m);
auto mod_op = mlirModuleGetOperation(mp.mmodule.get());
if(trace)
......@@ -579,9 +610,15 @@ instruction_ref insert_mlir(module& m,
code_object_op co,
const std::vector<instruction_ref>& inputs)
{
std::vector<instruction_ref> refs;
std::size_t last = 0;
#ifdef MIGRAPHX_MLIR_BARE_POINTER
refs.reserve(inputs.size());
std::copy(inputs.begin(), inputs.end(), std::back_inserter(refs));
last = refs.size() - 1;
#else
refs.reserve(inputs.size() * 15);
std::unordered_map<uint64_t, instruction_ref> literal_map{};
auto get_literal = [&](uint64_t value) {
auto fi = literal_map.find(value);
......@@ -592,7 +629,6 @@ instruction_ref insert_mlir(module& m,
return lit;
};
std::size_t last = 0;
for(auto input : inputs)
{
const size_t offset = 0;
......@@ -616,6 +652,7 @@ instruction_ref insert_mlir(module& m,
[&](const auto& lval) { return get_literal(lval); });
// refs.push_back(get_literal(1)); // G
}
#endif
co.expected_inputs = to_shapes(refs);
co.output_arg = last;
return m.insert_instruction(ins, co, refs);
......
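For context on the MIGRAPHX_MLIR_BARE_POINTER branches in insert_mlir above: under the bare-pointer ABI each tensor is passed to the kernel as a single pointer, whereas MLIR's default memref calling convention expands a rank-n tensor into 2n+3 scalar arguments (allocated pointer, aligned pointer, offset, n sizes, n strides), which is why the fallback path materializes the extra literal refs and reserves roughly 15 slots per input. A rough count, assuming the standard memref lowering:

#include <cstddef>
#include <iostream>

// Arguments per tensor under MLIR's default memref descriptor lowering:
// allocated ptr + aligned ptr + offset + rank sizes + rank strides.
std::size_t memref_args(std::size_t rank) { return 2 * rank + 3; }

int main()
{
    std::cout << "rank-4 tensor -> " << memref_args(4) << " kernel args\n"; // 11
    std::cout << "bare pointer  -> 1 kernel arg\n";
    return 0;
}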
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/perfdb.hpp>
#include <migraphx/value.hpp>
#include <migraphx/sqlite.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/permutation.hpp>
#include <fstream>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace {
std::string get_layout(const shape& s, std::string labels)
{
auto result = labels;
auto p = find_permutation(s);
std::transform(p.begin(), p.end(), result.begin(), [&](auto i) { return labels[i]; });
return "'" + result + "'";
}
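For illustration, get_layout only shuffles the label string by the stride-order permutation; for a standard NCHW-contiguous shape the permutation is the identity and 'NCHW' passes through unchanged. A sketch with a hypothetical permutation such as find_permutation might report for an NHWC-strided tensor:

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

int main()
{
    const std::string labels = "NCHW";
    const std::vector<int> p{0, 2, 3, 1}; // hypothetical: axes slowest to fastest
    std::string result = labels;
    std::transform(p.begin(), p.end(), result.begin(),
                   [&](int i) { return labels[i]; });
    assert(result == "NHWC");
    return 0;
}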
std::string get_type(const shape& s)
{
static const std::unordered_map<shape::type_t, std::string> m = {
{shape::float_type, "'FP32'"},
{shape::half_type, "'FP16'"},
{shape::double_type, "'FP64'"},
{shape::int8_type, "'INT8'"},
{shape::int32_type, "'INT32'"},
};
auto it = m.find(s.type());
if(it == m.end())
return "UNKNOWN";
return it->second;
}
std::string generate_miopen_config(const problem_params& pp)
{
value v = pp.op.to_value();
auto input = pp.inputs[0].lens();
auto weights = pp.inputs[1].lens();
auto padding = v["padding"].to_vector<std::size_t>();
auto stride = v["stride"].to_vector<std::size_t>();
auto dilation = v["dilation"].to_vector<std::size_t>();
if(padding.size() != stride.size())
padding.erase(padding.begin() + padding.size() / 2, padding.end());
return to_string_range({std::string{" C.in_channels="}, to_string(input[1]),
std::string{" AND C.in_h="}, to_string(input[2]),
std::string{" AND C.in_w="}, to_string(input[3]),
std::string{" AND C.fil_h="}, to_string(weights[2]),
std::string{" AND C.fil_w="}, to_string(weights[3]),
std::string{" AND C.out_channels="}, to_string(weights[0]),
std::string{" AND C.batchsize="}, to_string(input[0]),
std::string{" AND C.pad_h="}, to_string(padding[0]),
std::string{" AND C.pad_w="}, to_string(padding[2]),
std::string{" AND C.dilation_h="}, to_string(dilation[0]),
std::string{" AND C.dilation_w="}, to_string(dilation[1]),
std::string{" AND C.conv_stride_h="}, to_string(stride[0]),
std::string{" AND C.conv_stride_w="}, to_string(stride[1]),
std::string{" AND C.layout="}, get_layout(pp.inputs[0], "NCHW"),
std::string{" AND C.data_type="}, get_type(pp.inputs[0]),
std::string{" AND C.direction="}, std::string{"'F'"}},
" ");
}
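To make the generated predicate concrete: for a hypothetical FP32 NCHW convolution with input 1x8x4x4, weights 2x8x3x3, unit stride and dilation, and zero padding, the function yields (modulo exact whitespace) a fragment along the lines of:

C.in_channels=8 AND C.in_h=4 AND C.in_w=4 AND C.fil_h=3 AND C.fil_w=3
AND C.out_channels=2 AND C.batchsize=1 AND C.pad_h=0 AND C.pad_w=0
AND C.dilation_h=1 AND C.dilation_w=1 AND C.conv_stride_h=1 AND C.conv_stride_w=1
AND C.layout='NCHW' AND C.data_type='FP32' AND C.direction='F'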
auto query_miopen_db(const std::string& query)
{
// TODO: Store db as a static variable
const auto dbpath = fs::path{"/opt"} / "rocm" / "share" / "miopen" / "db" / "miopen.db";
// Check if db file exists.
std::ifstream dbs(dbpath);
if(dbs.is_open())
{
dbs.close();
}
else
{
std::vector<std::unordered_map<std::string, std::string>> empty;
return empty;
}
auto db = sqlite::read(dbpath);
return db.execute(query);
}
} // namespace
std::string get_mlir_perf_for_conv(const problem_params& pp)
{
std::string query = "select P.* \
from perf_db P, config C \
where P.config = C.id AND \
P.solver = 'ConvMlirIgemmFwdXdlops' AND \
${config}";
auto results =
query_miopen_db(interpolate_string(query, {{"config", generate_miopen_config(pp)}}));
if(results.empty())
return "";
return results.front().at("params");
}
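After interpolate_string substitutes ${config}, the statement handed to SQLite reads, for the example fragment above (abridged):

select P.* from perf_db P, config C
where P.config = C.id AND P.solver = 'ConvMlirIgemmFwdXdlops'
AND C.in_channels=8 AND C.in_h=4 AND ... AND C.direction='F'

The "params" column of the first matching row is the perf_config string that gets attached to the MLIR convolution op.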
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -51,6 +51,8 @@
#include <migraphx/register_op.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/tune_axis.hpp>
#include <migraphx/pad_calc.hpp>
#include <unordered_map>
#include <utility>
#include <iostream>
......@@ -231,8 +233,31 @@ struct ref_convolution : auto_register_op<ref_convolution<Op>>
{
return op.normalize_compute_shape(inputs);
}
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
std::vector<std::size_t> padding;
if(op.use_dynamic_same_auto_pad)
{
auto input_lens = args[0].get_shape().lens();
std::vector<std::size_t> img_lens{input_lens.begin() + 2, input_lens.end()};
auto weights_lens = args[1].get_shape().lens();
std::vector<std::size_t> k_lens{weights_lens.begin() + 2, weights_lens.end()};
padding = calc_dyn_auto_pad(img_lens, k_lens, op.stride, op.dilation);
std::cout << "[ ";
output_shape =
compute_padded_shape({args.at(0).get_shape(), args.at(1).get_shape()}, padding);
}
else
{
padding = op.padding;
if(output_shape.dynamic())
{
output_shape =
op.normalize_compute_shape({args.at(0).get_shape(), args.at(1).get_shape()});
}
}
argument result{output_shape};
visit_quantize(result, args[0], args[1])([&](auto output, auto input, auto weights) {
auto in_lens = input.get_shape().lens();
......@@ -252,7 +277,7 @@ struct ref_convolution : auto_register_op<ref_convolution<Op>>
{
auto d_2 = dim - 2;
win_start.push_back(std::ptrdiff_t(idx_o[dim] * op.stride[d_2]) -
std::ptrdiff_t(op.padding[d_2]));
std::ptrdiff_t(padding[d_2]));
}
const auto group_id = w / (wei_n / op.group);
......@@ -289,6 +314,34 @@ struct ref_convolution : auto_register_op<ref_convolution<Op>>
});
return result;
}
private:
/*!
* Used for dynamic auto padding since padding needs to be computed at evaluation time.
* \param inputs two fixed shape inputs [input_tensor, weights]
* \param padding from auto_pad calculation
*/
shape compute_padded_shape(const std::vector<shape>& inputs,
const std::vector<std::size_t>& padding) const
{
const shape& input = inputs.at(0);
const shape& weights = inputs.at(1);
const size_t num_spatial_dims = input.lens().size() - 2;
std::vector<size_t> output_lens{input.lens()[0], weights.lens()[0]};
// calculate the output shape of the convolution: ((W - ((K - 1) * D + 1) + 2P) / S) + 1, with dilation D
for(size_t i = 0; i < num_spatial_dims; i++)
{
auto padding_factor = padding[i] + padding[i + num_spatial_dims];
output_lens.push_back(std::size_t(std::max<std::ptrdiff_t>(
1,
(input.lens()[i + 2] - (1 + op.dilation[i] * (weights.lens()[i + 2] - 1)) +
padding_factor) /
op.stride[i] +
1)));
}
return inputs[0].with_lens(output_lens);
}
};
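A quick numeric check of the formula in compute_padded_shape: with a 5x5 spatial input, 3x3 kernel, stride 1, dilation 1, and SAME-style padding totalling 2 per axis, (5 - (1 + 1*(3 - 1)) + 2) / 1 + 1 = 5, so the spatial size is preserved. The same arithmetic as a standalone sketch (the std::max clamp omitted):

#include <cassert>
#include <cstddef>

// Output length per spatial axis, matching the core of compute_padded_shape.
std::size_t out_len(std::size_t w, std::size_t k, std::size_t stride,
                    std::size_t dilation, std::size_t pad_total)
{
    return (w - (1 + dilation * (k - 1)) + pad_total) / stride + 1;
}

int main()
{
    assert(out_len(5, 3, 1, 1, 2) == 5); // SAME-style padding keeps 5x5
    assert(out_len(4, 3, 1, 1, 0) == 2); // no padding: 4 -> 2
    return 0;
}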
struct ref_im2col
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <iostream>
#include <vector>
#include <hip/hip_runtime_api.h>
#include <migraphx/gpu/target.hpp>
#include <migraphx/verify.hpp>
#include <test.hpp>
#include <basic_ops.hpp>
#include <migraphx/gpu/hip.hpp>
#include <migraphx/make_op.hpp>
#define MIGRAPHX_HIP_ASSERT(x) (EXPECT(x == hipSuccess))
TEST_CASE(host_same_buffer_copy)
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape ss{migraphx::shape::float_type, {4, 2}};
auto a = mm->add_parameter("a", ss);
auto b = mm->add_parameter("b", ss);
auto aa = mm->add_instruction(migraphx::make_op("add"), a, a);
auto gpu_out = mm->add_instruction(migraphx::make_op("hip::copy_from_gpu"), aa);
auto stream_sync = mm->add_instruction(migraphx::make_op("hip::sync_stream"), gpu_out);
auto pass = mm->add_instruction(unary_pass_op{}, stream_sync);
auto alloc = mm->add_instruction(
migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(ss)}}));
auto gpu_in = mm->add_instruction(migraphx::make_op("hip::copy_to_gpu"), pass, alloc);
auto aab = mm->add_instruction(migraphx::make_op("add"), gpu_in, b);
mm->add_return({aab});
migraphx::parameter_map pp;
std::vector<float> a_vec(ss.elements(), -1);
std::vector<float> b_vec(ss.elements(), 2);
std::vector<float> c_vec(ss.elements(), 0);
pp["a"] = migraphx::argument(ss, a_vec.data());
pp["b"] = migraphx::argument(ss, b_vec.data());
std::vector<float> gpu_result;
migraphx::target gpu_t = migraphx::gpu::target{};
migraphx::compile_options options;
options.offload_copy = true;
p.compile(gpu_t, options);
auto result = p.eval(pp).back();
std::vector<float> results_vector(ss.elements(), -1);
result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
EXPECT(migraphx::verify_range(c_vec, results_vector));
}
TEST_CASE(arguments_lifetime)
{
auto use_on_gpu = [](const migraphx::argument& arg, int c) {
auto* arg_ptr = arg.data();
MIGRAPHX_HIP_ASSERT(hipSetDevice(0));
MIGRAPHX_HIP_ASSERT(hipMemset(arg_ptr, c, arg.get_shape().bytes()));
MIGRAPHX_HIP_ASSERT(hipDeviceSynchronize());
return;
};
auto f = [use_on_gpu](const migraphx::argument& input) {
auto a = migraphx::gpu::register_on_gpu(input);
auto s = a.get_shape();
{
auto b = migraphx::gpu::register_on_gpu(input);
use_on_gpu(b, 0);
std::vector<float> expected_b(s.elements(), 0);
auto gold = migraphx::argument(s, expected_b.data());
}
use_on_gpu(a, 1);
return true;
};
migraphx::shape ss{migraphx::shape::float_type, {4, 2}};
std::vector<float> x_data(ss.elements(), -1);
migraphx::argument x{ss, x_data.data()};
EXPECT(f(x));
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }
......@@ -145,7 +145,7 @@ TEST_CASE(conv)
const std::string mlir_output = R"__migraphx__(
module {
func @main(%arg0: tensor<2x8x3x3xf32>, %arg1: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {kernel = "mixr"} {
%0 = migraphx.convolution(%arg1, %arg0) {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1]} : (tensor<1x8x4x4xf32>, tensor<2x8x3x3xf32>) -> tensor<1x2x2x2xf32>
%0 = migraphx.convolution(%arg1, %arg0) {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1], use_dynamic_same_auto_pad = 0 : i64} : (tensor<1x8x4x4xf32>, tensor<2x8x3x3xf32>) -> tensor<1x2x2x2xf32>
return %0 : tensor<1x2x2x2xf32>
}
}
......@@ -168,7 +168,7 @@ TEST_CASE(conv_add_relu)
const std::string mlir_output = R"__migraphx__(
module {
func @main(%arg0: tensor<1x2x2x2xf32>, %arg1: tensor<2x8x3x3xf32>, %arg2: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {kernel = "mixr"} {
%0 = migraphx.convolution(%arg2, %arg1) {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1]} : (tensor<1x8x4x4xf32>, tensor<2x8x3x3xf32>) -> tensor<1x2x2x2xf32>
%0 = migraphx.convolution(%arg2, %arg1) {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1], use_dynamic_same_auto_pad = 0 : i64} : (tensor<1x8x4x4xf32>, tensor<2x8x3x3xf32>) -> tensor<1x2x2x2xf32>
%1 = migraphx.add(%0, %arg0) : (tensor<1x2x2x2xf32>, tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32>
%2 = migraphx.relu(%1) : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32>
return %2 : tensor<1x2x2x2xf32>
......
......@@ -626,6 +626,46 @@ def constant_scalar_test():
return ([node], [], [y])
@onnx_test
def constant_empty_scalar_int64_test():
x = np.array([]).astype(np.int64)
y = helper.make_tensor_value_info('0', TensorProto.INT64, [0])
node = onnx.helper.make_node(
'Constant',
inputs=[],
outputs=['0'],
value=onnx.helper.make_tensor(
name='empty_tensor',
data_type=TensorProto.INT64,
dims=x.shape,
vals=x.flatten().astype(np.int64),
),
)
return ([node], [], [y])
@onnx_test
def constant_one_val_int64_test():
x = np.array([1]).astype(np.int64)
y = helper.make_tensor_value_info('0', TensorProto.INT64, [1])
node = onnx.helper.make_node(
'Constant',
inputs=[],
outputs=['0'],
value=onnx.helper.make_tensor(
name='one_element_tensor',
data_type=TensorProto.INT64,
dims=x.shape,
vals=x.flatten().astype(np.int64),
),
)
return ([node], [], [y])
@onnx_test
def const_of_shape_empty_input_test():
tensor_val = onnx.helper.make_tensor('value', onnx.TensorProto.INT64, [1],
......@@ -851,6 +891,96 @@ def conv_bn_relu_maxpool_test():
return ([node0, node1, node2, node3], [x, y, z, m, n, k, l], [out])
@onnx_test
def conv_dynamic_batch_test():
x = helper.make_tensor_value_info('0', TensorProto.FLOAT, [None, 3, 5, 5])
y = helper.make_tensor_value_info('1', TensorProto.FLOAT, [1, 3, 3, 3])
out = helper.make_tensor_value_info('2', TensorProto.FLOAT,
[None, 1, 3, 3])
node = onnx.helper.make_node('Conv', inputs=['0', '1'], outputs=['2'])
return ([node], [x, y], [out])
@onnx_test
def conv_dynamic_img_test():
x = helper.make_tensor_value_info('0', TensorProto.FLOAT,
[1, 3, None, None])
y = helper.make_tensor_value_info('1', TensorProto.FLOAT, [1, 3, 3, 3])
out = helper.make_tensor_value_info('2', TensorProto.FLOAT,
[1, 1, None, None])
node = onnx.helper.make_node('Conv', inputs=['0', '1'], outputs=['2'])
return ([node], [x, y], [out])
@onnx_test
def conv_dynamic_weights_test():
x = helper.make_tensor_value_info('0', TensorProto.FLOAT, [1, 3, 5, 5])
y = helper.make_tensor_value_info('1', TensorProto.FLOAT,
[1, 3, None, None])
out = helper.make_tensor_value_info('2', TensorProto.FLOAT,
[1, 1, None, None])
node = onnx.helper.make_node('Conv', inputs=['0', '1'], outputs=['2'])
return ([node], [x, y], [out])
@onnx_test
def conv_dynamic_img_and_weights_test():
x = helper.make_tensor_value_info('0', TensorProto.FLOAT,
[1, 3, None, None])
y = helper.make_tensor_value_info('1', TensorProto.FLOAT,
[1, 3, None, None])
out = helper.make_tensor_value_info('2', TensorProto.FLOAT,
[1, 1, None, None])
node = onnx.helper.make_node('Conv', inputs=['0', '1'], outputs=['2'])
return ([node], [x, y], [out])
@onnx_test
def conv_dynamic_batch_same_upper_test():
x = helper.make_tensor_value_info('0', TensorProto.FLOAT, [None, 3, 5, 5])
y = helper.make_tensor_value_info('1', TensorProto.FLOAT, [1, 3, 3, 3])
out = helper.make_tensor_value_info('2', TensorProto.FLOAT, [1, 1, 5, 5])
node = onnx.helper.make_node('Conv',
inputs=['0', '1'],
outputs=['2'],
auto_pad='SAME_UPPER')
return ([node], [x, y], [out])
@onnx_test
def conv_dynamic_img_same_upper_test():
x = helper.make_tensor_value_info('0', TensorProto.FLOAT,
[1, 3, None, None])
y = helper.make_tensor_value_info('1', TensorProto.FLOAT, [1, 3, 3, 3])
out = helper.make_tensor_value_info('2', TensorProto.FLOAT,
[1, 1, None, None])
node = onnx.helper.make_node('Conv',
inputs=['0', '1'],
outputs=['2'],
auto_pad='SAME_UPPER')
return ([node], [x, y], [out])
@onnx_test
def conv_dynamic_kernel_same_lower_test():
x = helper.make_tensor_value_info('0', TensorProto.FLOAT, [1, 3, 5, 5])
y = helper.make_tensor_value_info('1', TensorProto.FLOAT,
[1, 3, None, None])
out = helper.make_tensor_value_info('2', TensorProto.FLOAT, [1, 1, 5, 5])
node = onnx.helper.make_node('Conv',
inputs=['0', '1'],
outputs=['2'],
auto_pad='SAME_LOWER')
return ([node], [x, y], [out])
@onnx_test
def conv_relu_maxpool_test():
x = helper.make_tensor_value_info('0', TensorProto.FLOAT, [1, 3, 32, 32])
......@@ -2496,6 +2626,62 @@ def instance_norm_test():
return ([node], [x, scale, bias], [y])
@onnx_test
def instance_norm_half_test():
x = helper.make_tensor_value_info('0', TensorProto.FLOAT16, [1, 2, 3, 3])
scale = helper.make_tensor_value_info('1', TensorProto.FLOAT16, [2])
bias = helper.make_tensor_value_info('2', TensorProto.FLOAT16, [2])
y = helper.make_tensor_value_info('3', TensorProto.FLOAT16, [1, 2, 3, 3])
node = onnx.helper.make_node('InstanceNormalization',
inputs=['0', '1', '2'],
outputs=['3'])
return ([node], [x, scale, bias], [y])
@onnx_test
def instance_norm_type_mismatch_test():
x = helper.make_tensor_value_info('0', TensorProto.FLOAT, [1, 2, 3, 3])
scale = helper.make_tensor_value_info('1', TensorProto.FLOAT16, [2])
bias = helper.make_tensor_value_info('2', TensorProto.FLOAT16, [2])
y = helper.make_tensor_value_info('3', TensorProto.FLOAT, [1, 2, 3, 3])
node = onnx.helper.make_node('InstanceNormalization',
inputs=['0', '1', '2'],
outputs=['3'])
return ([node], [x, scale, bias], [y])
@onnx_test
def instance_norm_invalid_type_test():
x = helper.make_tensor_value_info('0', TensorProto.INT32, [1, 2, 3, 3])
scale = helper.make_tensor_value_info('1', TensorProto.FLOAT, [2])
bias = helper.make_tensor_value_info('2', TensorProto.FLOAT, [2])
y = helper.make_tensor_value_info('3', TensorProto.FLOAT, [1, 2, 3, 3])
node = onnx.helper.make_node('InstanceNormalization',
inputs=['0', '1', '2'],
outputs=['3'])
return ([node], [x, scale, bias], [y])
@onnx_test
def instance_norm_nonbroadcastable_test():
x = helper.make_tensor_value_info('0', TensorProto.FLOAT, [1, 2, 3, 3])
scale = helper.make_tensor_value_info('1', TensorProto.FLOAT, [4])
bias = helper.make_tensor_value_info('2', TensorProto.FLOAT, [4])
y = helper.make_tensor_value_info('3', TensorProto.FLOAT, [1, 2, 3, 3])
node = onnx.helper.make_node('InstanceNormalization',
inputs=['0', '1', '2'],
outputs=['3'])
return ([node], [x, scale, bias], [y])
@onnx_test
def instance_norm_val_test():
x = np.array([[[[0, 1, 2], [3, 4, 5], [6, 7, 8]],
......@@ -3231,6 +3417,89 @@ def min_test():
return ([node], [a, b, c], [y])
@onnx_test
def mod_test():
a = helper.make_tensor_value_info('0', TensorProto.INT32, [3, 3, 3])
b = helper.make_tensor_value_info('1', TensorProto.INT32, [3, 3, 3])
y = helper.make_tensor_value_info('2', TensorProto.INT32, [3, 3, 3])
node = onnx.helper.make_node('Mod', inputs=['0', '1'], outputs=['2'])
return ([node], [a, b], [y])
@onnx_test
def mod_test_half():
a = helper.make_tensor_value_info('0', TensorProto.FLOAT16, [3, 3, 3])
b = helper.make_tensor_value_info('1', TensorProto.FLOAT16, [3, 3, 3])
y = helper.make_tensor_value_info('2', TensorProto.FLOAT16, [3, 3, 3])
node = onnx.helper.make_node('Mod', inputs=['0', '1'], outputs=['2'])
return ([node], [a, b], [y])
@onnx_test
def mod_test_different_dtypes():
a = helper.make_tensor_value_info('0', TensorProto.INT16, [3, 3, 3])
b = helper.make_tensor_value_info('1', TensorProto.INT32, [3, 3, 3])
y = helper.make_tensor_value_info('2', TensorProto.INT32, [3, 3, 3])
node = onnx.helper.make_node(
'Mod',
inputs=['0', '1'],
outputs=['2'],
)
return ([node], [a, b], [y])
@onnx_test
def mod_test_fmod():
a = helper.make_tensor_value_info('0', TensorProto.FLOAT, [3, 3, 3])
b = helper.make_tensor_value_info('1', TensorProto.FLOAT, [3, 3, 3])
y = helper.make_tensor_value_info('2', TensorProto.FLOAT, [3, 3, 3])
node = onnx.helper.make_node(
'Mod',
inputs=['0', '1'],
outputs=['2'],
fmod=1  # fmod=1 selects C-style fmod (sign follows the dividend)
)
return ([node], [a, b], [y])
@onnx_test
def mod_test_fmod_half():
a = helper.make_tensor_value_info('0', TensorProto.FLOAT16, [3, 3, 3])
b = helper.make_tensor_value_info('1', TensorProto.FLOAT16, [3, 3, 3])
y = helper.make_tensor_value_info('2', TensorProto.FLOAT16, [3, 3, 3])
node = onnx.helper.make_node('Mod',
inputs=['0', '1'],
outputs=['2'],
fmod=1)
return ([node], [a, b], [y])
@onnx_test
def mod_test_fmod_different_dtypes():
a = helper.make_tensor_value_info('0', TensorProto.FLOAT, [3, 3, 3])
b = helper.make_tensor_value_info('1', TensorProto.INT32, [3, 3, 3])
y = helper.make_tensor_value_info('2', TensorProto.FLOAT, [3, 3, 3])
node = onnx.helper.make_node(
'Mod',
inputs=['0', '1'],
outputs=['2'],
fmod=1  # fmod=1 selects C-style fmod (sign follows the dividend)
)
return ([node], [a, b], [y])
@onnx_test
def multinomial_test():
sample_size = 10
......
Binary ONNX protobuf test files added (content not human-readable): instance_norm_half_test.onnx, instance_norm_invalid_type_test.onnx, instance_norm_nonbroadcastable_test.onnx, instance_norm_type_mismatch_test.onnx.