Merge branch 'develop' into workspace_size

5a14c0bf · umangyadav · cb01e280 · 5fa42993 · 5a14c0bf · 5a14c0bf
Commit 5a14c0bf authored Oct 19, 2022 by umangyadav
20 changed files
--- a/src/opt/memory_coloring_impl.cpp
+++ b/src/opt/memory_coloring_impl.cpp
@@ -72,7 +72,7 @@ bool memory_coloring_impl::allocate(interval_ptr interval)

    if(conflict_table.find(vn) != conflict_table.end())
    {
-        std::set<int>& vn_set = conflict_table[vn];
+        const std::set<int>& vn_set = conflict_table[vn];
        for(const auto& iter : vn_set)
        {
            live_range* range = live_ranges[iter];
@@ -267,8 +267,8 @@ void memory_coloring_impl::verify()
    {
        for(int i = 0; i < num_of_lives; ++i)
        {
-            live_interval& interval = live_intervals[i];
-            live_range& segment     = interval.segment;
+            const live_interval& interval = live_intervals[i];
+            const live_range& segment     = interval.segment;

            if(segment.begin == invalid_offset)
            {
@@ -284,7 +284,7 @@ void memory_coloring_impl::verify()
            int vn = segment.vn;
            if(conflict_table.find(vn) != conflict_table.end())
            {
-                std::set<int>& vn_set = conflict_table[vn];
+                const std::set<int>& vn_set = conflict_table[vn];
                for(const auto& iter : vn_set)
                {
                    live_range* range = live_ranges[iter];
@@ -319,8 +319,8 @@ void memory_coloring_impl::dump_intervals()
        {
            std::cout << " segment:" << i;
            std::cout << " =>";
-            std::set<int>& table = conflict_table[i];
-            for(auto& iter : table)
+            const std::set<int>& table = conflict_table[i];
+            for(const auto& iter : table)
            {
                std::cout << (iter) << ",";
            }
@@ -357,7 +357,7 @@ void live_interval::dump()
    std::cout << "id:" << id;
    segment.dump();
    std::cout << " uses:";
-    for(auto& iter : use_points)
+    for(const auto& iter : use_points)
    {
        std::cout << " " << get_ins_enum(iter) << ",";
    }

--- a/src/pad_calc.cpp
+++ b/src/pad_calc.cpp
@@ -52,19 +52,21 @@ void calculate_padding(int64_t idx,
    }
 }

-std::vector<std::size_t> calc_dyn_auto_pad(std::vector<std::size_t> tensor_lens,
-                                           std::vector<std::size_t> k_lens,
-                                           std::vector<std::size_t> strides,
-                                           std::vector<std::size_t> dilations,
+std::vector<std::size_t> calc_dyn_auto_pad(const std::vector<std::size_t>& input_lens,
+                                           const std::vector<std::size_t>& wei_lens,
+                                           const std::vector<std::size_t>& strides,
+                                           const std::vector<std::size_t>& dilations,
                                           bool use_upper)
 {
    std::vector<std::size_t> padding;
-    padding.resize(2 * k_lens.size());
-    for(std::size_t i = 0; i < padding.size() / 2; i++)
+    assert(input_lens.size() >= 3);
+    std::size_t num_spatial_dims = input_lens.size() - 2;
+    padding.resize(2 * num_spatial_dims);
+    for(std::size_t i = 0; i < num_spatial_dims; i++)
    {
-        std::ptrdiff_t input_dim      = tensor_lens[i];
+        std::ptrdiff_t input_dim      = input_lens[i + 2];
        std::ptrdiff_t stride         = strides[i];
-        std::ptrdiff_t weight_dim     = k_lens[i];
+        std::ptrdiff_t weight_dim     = wei_lens[i + 2];
        std::ptrdiff_t dilation       = dilations[i];
        std::ptrdiff_t output_dim     = (input_dim + stride - 1) / stride; // round up result
        std::ptrdiff_t new_weight_dim = weight_dim + (weight_dim - 1) * (dilation - 1);
@@ -86,5 +88,28 @@ std::vector<std::size_t> calc_dyn_auto_pad(std::vector<std::size_t> tensor_lens,
    return padding;
 }

+shape compute_padded_shape(const shape& input,
+                           const shape& weights,
+                           const std::vector<std::size_t>& padding,
+                           const std::vector<std::size_t>& stride,
+                           const std::vector<std::size_t>& dilation)
+{
+    const size_t num_spatial_dims = input.lens().size() - 2;
+
+    std::vector<size_t> output_lens{input.lens()[0], weights.lens()[0]};
+    // calculate the output shape of the convolution: ((W - K + 2P) / S) + 1
+    for(size_t i = 0; i < num_spatial_dims; ++i)
+    {
+        auto padding_factor = padding[i] + padding[i + num_spatial_dims];
+        output_lens.push_back(std::size_t(std::max<std::ptrdiff_t>(
+            1,
+            (input.lens()[i + 2] - (1 + dilation[i] * (weights.lens()[i + 2] - 1)) +
+             padding_factor) /
+                    stride[i] +
+                1)));
+    }
+    return input.with_lens(output_lens);
+}
+
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
--- a/src/program.cpp
+++ b/src/program.cpp
@@ -398,7 +398,7 @@ std::vector<argument> generic_eval(const program& p,
    return generic_eval(mm, ctx, params, {}, make_trace);
 }

-std::vector<argument> program::eval(parameter_map params) const
+std::vector<argument> program::eval(parameter_map params, execution_environment exec_env) const
 {
    auto& ctx = this->impl->ctx;
 #ifndef NDEBUG
@@ -423,6 +423,12 @@ std::vector<argument> program::eval(parameter_map params) const
 #endif

    auto trace_level = value_of(MIGRAPHX_TRACE_EVAL{});
+    std::vector<argument> ret;
+
+    if(exec_env.async)
+    {
+        ctx.wait_for(exec_env.queue);
+    }

    if(trace_level > 0)
    {
@@ -434,49 +440,56 @@ std::vector<argument> program::eval(parameter_map params) const
            ins_out[x] = ss.str();
        });

-        return generic_eval(*this,
-                            ctx,
-                            std::move(params),
-                            with_check_context([&](auto& ins, auto f, auto&& check_context) {
-                                ctx.finish();
-                                std::cout << "Run instruction: " << ins_out.at(ins) << std::endl;
-                                timer t{};
-                                auto result = check_context(f);
-                                double t1   = t.record<milliseconds>();
-                                ctx.finish();
-                                double t2 = t.record<milliseconds>();
-                                std::cout << "Time: " << t1 << "ms, " << t2 << "ms" << std::endl;
-                                if(trace_level > 1 and ins->name().front() != '@' and
-                                   ins->name() != "load" and not result.empty())
-                                {
-                                    target tgt  = make_target(this->impl->target_name);
-                                    auto buffer = tgt.copy_from(result);
-                                    if(trace_level == 2)
-                                    {
-                                        std::cout << "Output has "
-                                                  << to_string_range(classify_argument(buffer))
-                                                  << std::endl;
-                                        std::cout << "Output: ";
-                                        preview_argument(std::cout, buffer);
-                                        std::cout << std::endl;
-                                    }
-                                    else
-                                    {
-                                        std::cout << "Output: " << buffer << std::endl;
-                                    }
-                                }
-                                return result;
-                            }));
+        ret = generic_eval(*this,
+                           ctx,
+                           std::move(params),
+                           with_check_context([&](auto& ins, auto f, auto&& check_context) {
+                               ctx.finish();
+                               std::cout << "Run instruction: " << ins_out.at(ins) << std::endl;
+                               timer t{};
+                               auto result = check_context(f);
+                               double t1   = t.record<milliseconds>();
+                               ctx.finish();
+                               double t2 = t.record<milliseconds>();
+                               std::cout << "Time: " << t1 << "ms, " << t2 << "ms" << std::endl;
+                               if(trace_level > 1 and ins->name().front() != '@' and
+                                  ins->name() != "load" and not result.empty())
+                               {
+                                   target tgt  = make_target(this->impl->target_name);
+                                   auto buffer = tgt.copy_from(result);
+                                   if(trace_level == 2)
+                                   {
+                                       std::cout << "Output has "
+                                                 << to_string_range(classify_argument(buffer))
+                                                 << std::endl;
+                                       std::cout << "Output: ";
+                                       preview_argument(std::cout, buffer);
+                                       std::cout << std::endl;
+                                   }
+                                   else
+                                   {
+                                       std::cout << "Output: " << buffer << std::endl;
+                                   }
+                               }
+                               return result;
+                           }));
    }
    else
    {
-        return generic_eval(*this,
-                            ctx,
-                            std::move(params),
-                            with_check_context([&](auto&, auto f, auto&& check_context) {
-                                return check_context(f);
-                            }));
+        ret = generic_eval(*this,
+                           ctx,
+                           std::move(params),
+                           with_check_context([&](auto&, auto f, auto&& check_context) {
+                               return check_context(f);
+                           }));
    }
+
+    if(exec_env.async)
+    {
+        ctx.finish_on(exec_env.queue);
+    }
+
+    return ret;
 }

 const int program_file_version = 5;

--- a/src/py/migraphx_py.cpp
+++ b/src/py/migraphx_py.cpp
@@ -264,12 +264,13 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)

    py::class_<migraphx::argument>(m, "argument", py::buffer_protocol())
        .def_buffer([](migraphx::argument& x) -> py::buffer_info { return to_buffer_info(x); })
-        .def("__init__",
-             [](migraphx::argument& x, py::buffer b) {
-                 py::buffer_info info = b.request();
-                 new(&x) migraphx::argument(to_shape(info), info.ptr);
-             })
+        .def(py::init([](py::buffer b) {
+            py::buffer_info info = b.request();
+            return migraphx::argument(to_shape(info), info.ptr);
+        }))
        .def("get_shape", &migraphx::argument::get_shape)
+        .def("data_ptr",
+             [](migraphx::argument& x) { return reinterpret_cast<std::uintptr_t>(x.data()); })
        .def("tolist",
             [](migraphx::argument& x) {
                 py::list l{x.get_shape().elements()};
@@ -354,6 +355,23 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
                 }
                 return p.eval(pm);
             })
+        .def("run_async",
+             [](migraphx::program& p,
+                py::dict params,
+                std::uintptr_t stream,
+                std::string stream_name) {
+                 migraphx::parameter_map pm;
+                 for(auto x : params)
+                 {
+                     std::string key      = x.first.cast<std::string>();
+                     py::buffer b         = x.second.cast<py::buffer>();
+                     py::buffer_info info = b.request();
+                     pm[key]              = migraphx::argument(to_shape(info), info.ptr);
+                 }
+                 migraphx::execution_environment exec_env{
+                     migraphx::any_ptr(reinterpret_cast<void*>(stream), stream_name), true};
+                 return p.eval(pm, exec_env);
+             })
        .def("sort", &migraphx::program::sort)
        .def("print", [](const migraphx::program& p) { std::cout << p << std::endl; })
        .def("__eq__", std::equal_to<migraphx::program>{})

--- a/src/replace_allocate.cpp
+++ b/src/replace_allocate.cpp
@@ -73,7 +73,7 @@ void insert_submod_allocations(instruction_ref ins, module& mod, const allocatio
        name_shapes.insert(ps.begin(), ps.end());
    }

-    for(auto& pn : name_shapes)
+    for(const auto& pn : name_shapes)
    {
        const auto& s = pn.second;
        instruction_ref output{};

--- a/src/rewrite_batchnorm.cpp
+++ b/src/rewrite_batchnorm.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/rewrite_batchnorm.hpp>
-#include <migraphx/program.hpp>
-#include <migraphx/instruction.hpp>
-#include <migraphx/op/batch_norm_inference.hpp>
-#include <migraphx/op/broadcast.hpp>
-#include <migraphx/op/add.hpp>
-#include <migraphx/op/mul.hpp>
-#include <migraphx/iterator_for.hpp>
-#include <migraphx/ranges.hpp>
-#include <migraphx/make_op.hpp>
-
-#include <migraphx/dfor.hpp>
-
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-
-void rewrite_batchnorm::apply(module& m) const
-{
-    for(auto ins : iterator_for(m))
-    {
-        if(ins->name() != "batch_norm_inference")
-            continue;
-        // Get scale, bias, mean, variance from inputs
-        auto gamma    = ins->inputs()[1]->eval();
-        auto bias     = ins->inputs()[2]->eval();
-        auto mean     = ins->inputs()[3]->eval();
-        auto variance = ins->inputs()[4]->eval();
-        if(any_of({gamma, bias, mean, variance}, [](auto arg) { return arg.empty(); }))
-            continue;
-
-        std::vector<std::size_t> lens = ins->inputs()[1]->get_shape().lens();
-        shape s{ins->get_shape().type(), lens};
-        // Get epsilon
-        auto bn_op   = any_cast<op::batch_norm_inference>(ins->get_operator());
-        auto epsilon = bn_op.epsilon;
-
-        argument a{s};
-        argument b{s};
-        visit_all(gamma, bias, mean, variance, a, b)(
-            [&](auto gamma2, auto bias2, auto mean2, auto variance2, auto a2, auto b2) {
-                dfor(a.get_shape().elements())(
-                    [&](std::size_t c) { a2[c] = gamma2[c] / std::sqrt(variance2[c] + epsilon); });
-                dfor(b.get_shape().elements())([&](std::size_t c) {
-                    b2[c] = bias2[c] - (gamma2[c] * mean2[c] / std::sqrt(variance2[c] + epsilon));
-                });
-            });
-
-        auto broadcast   = op::broadcast{1, ins->get_shape().lens()};
-        auto a_ins       = m.add_literal({a.get_shape(), a.data()});
-        auto a_broadcast = m.insert_instruction(ins, broadcast, a_ins);
-        auto mul   = m.insert_instruction(ins, make_op("mul"), ins->inputs().front(), a_broadcast);
-        auto b_ins = m.add_literal({b.get_shape(), b.data()});
-        auto b_broadcast = m.insert_instruction(ins, broadcast, b_ins);
-        auto add         = m.insert_instruction(ins, make_op("add"), mul, b_broadcast);
-        m.replace_instruction(ins, add);
-    }
-}
-
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/simplify_algebra.cpp
+++ b/src/simplify_algebra.cpp
@@ -57,12 +57,14 @@ auto conv_const_weights()

 auto reduction() { return match::name_contains("reduce"); }

+// conv(x, w) * a => conv(x, a * w)
 struct find_mul_conv
 {
    auto matcher() const
    {
-        return match::name("mul")(match::either_arg(0, 1)(conv_const_weights().bind("conv"),
-                                                          match::name("broadcast").bind("a")));
+        return match::name("mul")(
+            match::either_arg(0, 1)(conv_const_weights().bind("conv"),
+                                    match::name("broadcast", "multibroadcast").bind("a")));
    }

    void apply(module& m, const match::matcher_result& r) const
@@ -72,14 +74,35 @@ struct find_mul_conv
        auto a_ins    = r.instructions["a"];
        auto w_ins    = r.instructions["w"];

-        auto broadcast_op = any_cast<op::broadcast>(a_ins->get_operator());
-        if(broadcast_op.axis != 1)
+        const auto& a_input_lens = a_ins->inputs().front()->get_shape().lens();
+
+        std::size_t num_not_one_dims = std::count_if(
+            a_input_lens.cbegin(), a_input_lens.cend(), [](auto dim) { return dim != 1; });
+        if(num_not_one_dims > 1)
+            return;
+
+        // check broadcasted along channels
+        const auto& a_lens    = a_ins->get_shape().lens();
+        const auto& a_strides = a_ins->get_shape().strides();
+
+        auto is_broadcasted_axis = [](auto len, auto stride) { return len == 1 or stride == 0; };
+
+        if(a_strides.at(1) != 1)
+            return;
+
+        if(not is_broadcasted_axis(a_lens.front(), a_strides.front()))
            return;

+        if(not std::equal(a_lens.begin() + 2,
+                          a_lens.end(),
+                          a_strides.begin() + 2,
+                          a_strides.end(),
+                          is_broadcasted_axis))
+            return;
+
+        auto sq    = m.insert_instruction(ins, make_op("squeeze"), a_ins->inputs().front());
        auto new_a = m.insert_instruction(
-            ins,
-            make_op("broadcast", {{"axis", 0}, {"out_lens", w_ins->get_shape().lens()}}),
-            a_ins->inputs().front());
+            ins, make_op("broadcast", {{"axis", 0}, {"out_lens", w_ins->get_shape().lens()}}), sq);
        auto new_mul  = m.insert_instruction(ins, make_op("mul"), new_a, w_ins);
        auto new_conv = m.insert_instruction(
            ins, conv_ins->get_operator(), conv_ins->inputs().front(), new_mul);
@@ -412,6 +435,24 @@ struct find_concat_op
    }
 };

+void move_instructions_back(module& m, instruction_ref pos, std::vector<instruction_ref> inss)
+{
+    auto start = range(m.begin(), pos);
+    for(auto ins : iterator_for(start))
+    {
+        auto it = std::find(inss.begin(), inss.end(), ins);
+        if(it != inss.end())
+            inss.erase(it);
+    }
+    for(auto ins : inss)
+    {
+        if(not m.has_instruction(ins))
+            continue;
+        move_instructions_back(m, pos, ins->inputs());
+        m.move_instruction(ins, pos);
+    }
+}
+
 std::vector<instruction_ref> get_splits(instruction_ref ins)
 {
    std::vector<instruction_ref> result;
@@ -587,8 +628,7 @@ struct find_splits
                   }))
                    return;

-                for(auto data : data_args)
-                    m.move_instructions(data, ins);
+                move_instructions_back(m, ins, data_args);

                auto slice_op = any_cast<op::slice>(splits.front()->get_operator());
                assert(not slice_op.axes.empty());
@@ -841,8 +881,7 @@ struct find_conv_dot_horiz_fusion
                concat_axis = axis;
            }

-            for(auto arg : args)
-                m.move_instructions(arg, input);
+            move_instructions_back(m, input, args);
            // TODO: Check if axes match
            auto concat =
                m.insert_instruction(input, make_op("concat", {{"axis", concat_axis}}), args);
@@ -894,6 +933,73 @@ struct find_div_const
    }
 };

+struct find_unit_ops
+{
+    auto matcher() const
+    {
+        auto mul_1 = match::name("mul")(
+            match::either_arg(0, 1)(match::has_value(1.0f), match::any().bind("x")));
+        auto div_1 =
+            match::name("div")(match::args(match::any().bind("x"), match::has_value(1.0f)));
+        auto add_0 = match::name("add")(
+            match::either_arg(0, 1)(match::has_value(0.0f, 1e-12), match::any().bind("x")));
+        auto sub_0 =
+            match::name("sub")(match::args(match::any().bind("x"), match::has_value(0.0f)));
+        return match::any_of(mul_1, div_1, add_0, sub_0);
+    }
+
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins  = r.result;
+        auto c_in = r.instructions["x"];
+
+        m.replace_instruction(ins, c_in);
+    }
+};
+
+struct find_neg_unit_ops
+{
+    auto matcher() const
+    {
+        auto mul_neg_1 = match::name("mul")(
+            match::either_arg(0, 1)(match::has_value(-1.0f), match::any().bind("x")));
+        auto div_neg_1 =
+            match::name("div")(match::args(match::any().bind("x"), match::has_value(-1.0f)));
+        auto sub_0 =
+            match::name("sub")(match::args(match::has_value(0.0f), match::any().bind("x")));
+        return match::any_of(mul_neg_1, div_neg_1, sub_0);
+    }
+
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins  = r.result;
+        auto c_in = r.instructions["x"];
+
+        auto neg = m.add_instruction(make_op("neg"), c_in);
+        m.replace_instruction(ins, neg);
+    }
+};
+
+struct find_zero_ops
+{
+    auto matcher() const
+    {
+        auto mul_zero = match::name("mul")(
+            match::either_arg(0, 1)(match::has_value(0.0f).bind("x"), match::any()));
+        auto div_zero =
+            match::name("div")(match::args(match::has_value(0.0f).bind("x"), match::any()));
+        return match::any_of(mul_zero, div_zero);
+    }
+
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins      = r.result;
+        auto zero_ins = r.instructions["x"];
+
+        m.replace_instruction(ins, zero_ins);
+    }
+};
+
 struct find_sub_const
 {
    auto matcher() const
@@ -985,20 +1091,35 @@ struct find_split_reshape
        auto rsp_lens    = rsp->get_shape().lens();
        auto rsp_strides = rsp->get_shape().strides();
        rsp_strides.insert(rsp_strides.begin(), rsp_strides[0] * rsp_lens[0]);
-        auto ait = std::find(rsp_strides.begin(), rsp_strides.end(), slc_dim_size);
+
+        auto ait     = std::find(rsp_strides.begin(), rsp_strides.end(), slc_dim_size);
+        int rsp_axis = -1;
        if(ait == rsp_strides.end())
        {
            return;
        }
-        int rsp_axis = std::distance(rsp_strides.begin(), ait);
-
+        else if(ait == rsp_strides.end() - 1)
+        {
+            // edge case
+            // slice_dim == 1, in that case it could match with last stride of 1.
+            // it should accumulate lengths from last dim in that case. discount 1 to avoid going
+            // out of bounds.
+            assert(slc_dim_size == 1);
+            rsp_axis = std::distance(rsp_strides.begin(), ait) - 1;
+        }
+        else
+        {
+            rsp_axis = std::distance(rsp_strides.begin(), ait);
+        }
        // calculate reshape output shape
        std::vector<int64_t> vec_dims(vec_rsp.size());
+
        std::transform(vec_rsp.begin(), vec_rsp.end(), vec_dims.begin(), [&](auto is) {
            return is->get_shape().lens()[rsp_axis];
        });

        std::vector<int64_t> rsp_out_lens(rsp_lens.begin(), rsp_lens.end());
+
        rsp_out_lens[rsp_axis] = std::accumulate(vec_dims.begin(), vec_dims.end(), std::int64_t{0});

        // insert the reshape instruction and add contiguous if needed
@@ -1095,6 +1216,9 @@ void simplify_algebra::apply(module& m) const
                            find_mul_conv{},
                            find_mul_slice_conv{},
                            find_mul_add{},
+                            find_unit_ops{},
+                            find_neg_unit_ops{},
+                            find_zero_ops{},
                            find_dot_add{},
                            find_div_const{},
                            find_sub_const{},

--- a/src/simplify_reshapes.cpp
+++ b/src/simplify_reshapes.cpp
@@ -271,6 +271,44 @@ struct find_nested_slice
    }
 };

+struct find_concat_multibroadcasts
+{
+    auto matcher() const
+    {
+        return match::name("concat")(match::all_of[match::inputs()](match::name("multibroadcast")));
+    }
+
+    void apply(module& m, const match::matcher_result& mr) const
+    {
+        auto ins        = mr.result;
+        auto op         = any_cast<op::concat>(ins->get_operator());
+        auto out_lens   = ins->get_shape().lens();
+        auto inputs     = ins->inputs();
+        auto in_strides = inputs.front()->get_shape().strides();
+
+        // Only apply when concat axis is not a broadcasted dimension
+        if(std::any_of(inputs.begin(), inputs.end(), [&](auto i) {
+               return i->get_shape().strides()[op.axis] == 0;
+           }))
+        {
+            return;
+        }
+
+        // Use inputs of multibroadcast ops as inputs to new concat op
+        std::transform(inputs.begin(), inputs.end(), inputs.begin(), [](auto i) {
+            return i->inputs().front();
+        });
+
+        // Reduce axis by number of leading broadcasted dimensions
+        if(inputs.front()->get_shape().lens().size() < out_lens.size())
+            op.axis -= std::count(in_strides.begin(), in_strides.begin() + op.axis, 0);
+
+        auto concat = m.insert_instruction(ins, op, inputs);
+        m.replace_instruction(
+            ins, migraphx::make_op("multibroadcast", {{"out_lens", out_lens}}), concat);
+    }
+};
+
 struct find_concat_transpose
 {
    auto matcher() const
@@ -764,6 +802,7 @@ void simplify_reshapes::apply(module& m) const
                            find_reshaper{},
                            find_transpose{},
                            find_concat_transpose{},
+                            find_concat_multibroadcasts{},
                            find_nested_convert{},
                            find_nested_slice{},
                            find_nested_concat{},

--- a/src/targets/cpu/CMakeLists.txt
+++ b/src/targets/cpu/CMakeLists.txt
@@ -35,6 +35,7 @@ add_library(migraphx_cpu
    dnnl.cpp
    eltwise.cpp
    erf.cpp
+    fmod.cpp
    fuse_ops.cpp
    gather.cpp
    gemm.cpp
@@ -42,6 +43,7 @@ add_library(migraphx_cpu
    logsoftmax.cpp
    lowering.cpp
    lrn.cpp
+    mod.cpp
    preallocate.cpp
    pooling.cpp
    reduction.cpp

--- a/src/targets/gpu/include/migraphx/gpu/cos.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/cos.hpp
@@ -21,22 +21,16 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_COS_HPP
-#define MIGRAPHX_GUARD_RTGLIB_COS_HPP
-
-#include <migraphx/gpu/oper.hpp>
-#include <migraphx/gpu/device/cos.hpp>
+#include <migraphx/config.hpp>
+#include <migraphx/cpu/pointwise.hpp>
+#include <migraphx/op/fmod.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
+namespace cpu {

-struct hip_cos : unary_device<hip_cos, device::cos>
-{
-};
+template struct cpu_binary<op::fmod>;

-} // namespace gpu
+} // namespace cpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
-
-#endif
--- a/src/targets/cpu/lowering.cpp
+++ b/src/targets/cpu/lowering.cpp
@@ -26,7 +26,6 @@
 #include <migraphx/instruction.hpp>
 #include <migraphx/dfor.hpp>
 #include <migraphx/op/identity.hpp>
-#include <migraphx/op/batch_norm_inference.hpp>
 #include <migraphx/op/convolution.hpp>
 #include <migraphx/op/deconvolution.hpp>
 #include <migraphx/op/quant_convolution.hpp>
@@ -43,6 +42,8 @@
 #include <migraphx/op/argmax.hpp>
 #include <migraphx/op/argmin.hpp>
 #include <migraphx/op/rnn_var_sl_last_output.hpp>
+#include <migraphx/op/mod.hpp>
+#include <migraphx/op/fmod.hpp>
 #include <migraphx/shape_for_each.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/par_dfor.hpp>

--- a/src/targets/gpu/include/migraphx/gpu/exp.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/exp.hpp
@@ -21,22 +21,16 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_EXP_HPP
-#define MIGRAPHX_GUARD_RTGLIB_EXP_HPP
-
-#include <migraphx/gpu/oper.hpp>
-#include <migraphx/gpu/device/exp.hpp>
+#include <migraphx/config.hpp>
+#include <migraphx/cpu/pointwise.hpp>
+#include <migraphx/op/mod.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
+namespace cpu {

-struct hip_exp : unary_device<hip_exp, device::exp>
-{
-};
+template struct cpu_binary<op::mod>;

-} // namespace gpu
+} // namespace cpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
-
-#endif
--- a/src/targets/cpu/target.cpp
+++ b/src/targets/cpu/target.cpp
@@ -37,7 +37,6 @@
 #include <migraphx/propagate_constant.hpp>
 #include <migraphx/register_target.hpp>
 #include <migraphx/replace_allocate.hpp>
-#include <migraphx/rewrite_batchnorm.hpp>
 #include <migraphx/rewrite_pooling.hpp>
 #include <migraphx/rewrite_quantization.hpp>
 #include <migraphx/rewrite_rnn.hpp>
@@ -78,8 +77,6 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
            eliminate_identity{},
            eliminate_pad{},
            dead_code_elimination{},
-            rewrite_batchnorm{},
-            dead_code_elimination{},
            rewrite_rnn{},
            dead_code_elimination{},
            eliminate_common_subexpression{},

--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -39,81 +39,9 @@ file(GLOB KERNEL_FILES ${CONFIGURE_DEPENDS}
 message(STATUS "KERNEL_FILES: ${KERNEL_FILES}")
 add_embed_library(migraphx_kernels ${KERNEL_FILES})

-add_library(migraphx_device
-    device/acos.cpp
-    device/acosh.cpp
-    device/add.cpp
-    device/add_clip.cpp
-    device/add_relu.cpp
-    device/add_sigmoid.cpp
-    device/add_tanh.cpp
-    device/argmax.cpp
-    device/argmin.cpp
-    device/asin.cpp
-    device/asinh.cpp
-    device/atan.cpp
-    device/atanh.cpp
-    device/ceil.cpp
-    device/clip.cpp
-    device/concat.cpp
-    device/contiguous.cpp
-    device/convert.cpp
-    device/cos.cpp
-    device/cosh.cpp
-    device/div.cpp
-    device/equal.cpp
-    device/erf.cpp
-    device/exp.cpp
-    device/fill.cpp
-    device/floor.cpp
-    device/gather.cpp
-    device/gelu.cpp
-    device/greater.cpp
-    device/int8_gemm_pack.cpp
-    device/layernorm.cpp
-    device/less.cpp
-    device/log.cpp
-    device/logical_and.cpp
-    device/logical_or.cpp
-    device/logical_xor.cpp
-    device/logsoftmax.cpp
-    device/max.cpp
-    device/min.cpp
-    device/mul.cpp
-    device/mul_add.cpp
-    device/mul_add_relu.cpp
-    device/multinomial.cpp
-    device/nonzero.cpp
-    device/pad.cpp
-    device/pow.cpp
-    device/prelu.cpp
-    device/prefix_scan_sum.cpp
-    device/recip.cpp
-    device/reduce_max.cpp
-    device/reduce_mean.cpp
-    device/reduce_min.cpp
-    device/reduce_sum.cpp
-    device/reduce_prod.cpp
-    device/relu.cpp
-    device/reverse.cpp
-    device/rnn_variable_seq_lens.cpp
-    device/round.cpp
-    device/rsqrt.cpp
-    device/scatter.cpp
-    device/sigmoid.cpp
-    device/sign.cpp
-    device/sin.cpp
-    device/sinh.cpp
-    device/softmax.cpp
-    device/sqdiff.cpp
-    device/sqrt.cpp
-    device/sub.cpp
-    device/tan.cpp
-    device/tanh.cpp
-    device/topk.cpp
-    device/unary_not.cpp
-    device/where.cpp
-)
+file(GLOB DEVICE_GPU_SRCS ${CONFIGURE_DEPENDS} ${CMAKE_CURRENT_SOURCE_DIR}/device/*.cpp)
+add_library(migraphx_device ${DEVICE_GPU_SRCS})
+
 add_library(compile_for_gpu INTERFACE)
 target_compile_options(compile_for_gpu INTERFACE -std=c++17 -fno-gpu-rdc -Wno-cuda-compat -Wno-unused-command-line-argument -Xclang -fallow-half-arguments-and-returns)
 target_link_libraries(compile_for_gpu INTERFACE hip::device -fno-gpu-rdc -Wno-invalid-command-line-argument -Wno-unused-command-line-argument -Wno-option-ignored)
@@ -150,18 +78,12 @@ add_library(migraphx_gpu
    allocation_model.cpp
    argmax.cpp
    argmin.cpp
-    batch_norm_inference.cpp
-    clip.cpp
    code_object_op.cpp
    compile_ops.cpp
    compile_gen.cpp
    compile_hip.cpp
    compile_hip_code_object.cpp
    compiler.cpp
-    concat.cpp
-    convert.cpp
-    convolution.cpp
-    deconvolution.cpp
    device_name.cpp
    elu.cpp
    fuse_mlir.cpp
@@ -186,13 +108,11 @@ add_library(migraphx_gpu
    pad.cpp
    perfdb.cpp
    pooling.cpp
-    quant_convolution.cpp
    reverse.cpp
    rnn_variable_seq_lens.cpp
    rocblas.cpp
    scatter.cpp
    schedule_model.cpp
-    softmax.cpp
    sync_device.cpp
    target.cpp
    topk.cpp
@@ -207,81 +127,27 @@ function(register_migraphx_gpu_ops PREFIX)
    endforeach()
 endfunction()
 register_migraphx_gpu_ops(hip_
-    acosh
-    acos
-    add
    argmax
    argmin
-    asinh
-    asin
-    atanh
-    atan
-    ceil
-    clip
-    concat
-    convert
-    cosh
-    cos
-    div
-    equal
-    erf
-    exp
-    floor
    gather
-    greater
-    less
-    log
    logsoftmax
-    logical_and
-    logical_or
-    logical_xor
    loop
-    max
-    min
-    mul
    multinomial
    nonzero
    pad
-    pow
-    prelu
    prefix_scan_sum
-    recip
-    reduce_max
-    reduce_mean
-    reduce_min
-    reduce_prod
-    reduce_sum
-    relu
    reverse
-    round
-    rsqrt
    scatter
-    sigmoid
-    sign
-    sinh
-    sin
-    softmax
-    sqdiff
-    sqrt
-    sub
-    tanh
-    tan
    topk
-    unary_not
-    where
 )
 register_migraphx_gpu_ops(miopen_
    abs
-    batch_norm_inference
    contiguous
-    convolution
-    deconvolution
    elu
    int8_conv_pack
    leaky_relu
    lrn
    pooling
-    quant_convolution
 )
 register_op(migraphx_gpu 
    HEADER migraphx/gpu/rnn_variable_seq_lens.hpp 
@@ -295,6 +161,9 @@ register_op(migraphx_gpu
    HEADER migraphx/gpu/gemm.hpp 
    OPERATORS gpu::rocblas_gemm<op::dot> gpu::rocblas_gemm<op::quant_dot>
    INCLUDES migraphx/gpu/context.hpp)
+register_op(migraphx_gpu HEADER migraphx/gpu/convolution.hpp 
+    OPERATORS gpu::miopen_convolution<op::convolution> gpu::miopen_convolution<op::deconvolution> gpu::miopen_convolution<op::quant_convolution>
+    INCLUDES migraphx/gpu/context.hpp)
 rocm_set_soversion(migraphx_gpu ${MIGRAPHX_SO_VERSION})
 rocm_clang_tidy_check(migraphx_gpu)

@@ -322,26 +191,11 @@ message(STATUS "extractkernel: ${MIGRAPHX_EXTRACT_KERNEL}")

 set(MIGRAPHX_ENABLE_MLIR OFF CACHE BOOL "")
 if(MIGRAPHX_ENABLE_MLIR)
-    find_library(MLIRAPI_LIBRARY MLIRMIOpen 
-        PATH_SUFFIXES
-        # Workaournd broken mlir install
-        lib/ lib/lib)
-    # REQUIRED is not supported before cmake 3.18
-    if(NOT MLIRAPI_LIBRARY)
-        message(FATAL_ERROR "libMLIRMIOpen not found")
-    else()
-        message(STATUS "Build with libMLIRMIOpen: " ${MLIRAPI_LIBRARY})
-    endif()
-
-    find_path(MLIRAPI_HEADERS NAMES mlir-c/Dialect/MIGraphX.h)
-    # Workaround MLIR broken installation
-    find_path(MLIRAPI_HEADERS2 NAMES mlir-c/Registration.h
-        PATH_SUFFIXES 
-        include/external/include external/include)
-
+    # Find package rocMLIR
+    find_package(rocMLIR 1.0.0 CONFIG REQUIRED)
+    message(STATUS "Build with rocMLIR::rockCompiler ${rocMLIR_VERSION}")
    target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_MLIR")
-    target_include_directories(migraphx_gpu SYSTEM PRIVATE ${MLIRAPI_HEADERS} ${MLIRAPI_HEADERS2})
-    target_link_libraries(migraphx_gpu PUBLIC ${MLIRAPI_LIBRARY})
+    target_link_libraries(migraphx_gpu PUBLIC rocMLIR::rockCompiler)
 endif()

 set(MIGRAPHX_USE_HIPRTC OFF CACHE BOOL "")
@@ -380,9 +234,18 @@ endif()
 include(CheckLibraryExists)
 get_target_property(MIOPEN_LOCATION MIOpen LOCATION)
 check_library_exists(MIOpen "miopenHiddenSetConvolutionFindMode" "${MIOPEN_LOCATION}" HAS_FIND_MODE_API)
+check_library_exists(MIOpen "miopenFindSolutions" "${MIOPEN_LOCATION}" HAS_FIND_2_API)
+
+if(HAS_FIND_2_API) 
+    target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API)
+    message(STATUS "MIGraphx is using Find-2.0 API of MIOpen")
+else()
+    message(STATUS "MIOpen does not have Find-2.0 API")
+endif()
+
 if(HAS_FIND_MODE_API)
    target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_MODE_API)
-    message(STATUS "MIOpen has find mode api")
+    message(STATUS "MIGraphx is using Find Mode API of MIOpen")
 else()
    message(STATUS "MIOpen does not have find mode api")
 endif()

--- a/src/targets/gpu/batch_norm_inference.cpp
+++ b/src/targets/gpu/batch_norm_inference.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/gpu/batch_norm_inference.hpp>
-#include <migraphx/gpu/context.hpp>
-
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-shape miopen_batch_norm_inference::compute_shape(const std::vector<shape>& inputs) const
-{
-    check_shapes{inputs, *this}.has(6);
-    check_shapes{inputs.data(), inputs.data() + 1, *this}.same_ndims().max_ndims(5);
-    return op.compute_shape({inputs.at(0), inputs.at(1), inputs.at(2), inputs.at(3), inputs.at(4)});
-}
-
-inline shape reshape_to_2d(const shape& input)
-{
-    auto dims = input.lens();
-    if(dims.size() >= 4)
-        return input;
-
-    std::vector<size_t> new_dims(dims.begin(), dims.end());
-    std::size_t num = 4 - dims.size();
-    new_dims.insert(new_dims.end(), num, 1);
-    return {input.type(), new_dims};
-}
-
-argument miopen_batch_norm_inference::compute(context& ctx,
-                                              const shape& output_shape,
-                                              const std::vector<argument>& args) const
-{
-    shape x_shape  = args[0].get_shape();
-    shape y_shape  = output_shape;
-    shape bn_shape = args[3].get_shape();
-
-    auto x_desc  = make_tensor(reshape_to_2d(x_shape));
-    auto y_desc  = make_tensor(reshape_to_2d(y_shape));
-    auto bn_desc = make_tensor(reshape_to_2d(bn_shape));
-
-    float alpha = 1.0;
-    float beta  = 0.0f;
-
-    miopenBatchNormalizationForwardInference(ctx.get_stream().get_miopen(),
-                                             miopenBatchNormMode_t(op.bn_mode),
-                                             &alpha,
-                                             &beta,
-                                             x_desc.get(),
-                                             args[0].implicit(),
-                                             y_desc.get(),
-                                             args[5].implicit(),
-                                             bn_desc.get(),
-                                             args[1].implicit(),
-                                             args[2].implicit(),
-                                             args[3].implicit(),
-                                             args[4].implicit(),
-                                             op.epsilon);
-
-    return args[5];
-}
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/targets/gpu/clip.cpp
+++ b/src/targets/gpu/clip.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/gpu/clip.hpp>
-#include <migraphx/gpu/context.hpp>
-#include <migraphx/gpu/device/clip.hpp>
-
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-shape hip_clip::compute_shape(std::vector<shape> inputs) const
-{
-    inputs.pop_back();
-    return op.compute_shape(inputs);
-}
-
-argument hip_clip::compute(context& ctx, const shape&, const std::vector<argument>& args) const
-{
-    device::clip(ctx.get_stream().get(), args.back(), args.front(), args.at(1), args.at(2));
-    return args.back();
-}
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/targets/gpu/compile_gen.cpp
+++ b/src/targets/gpu/compile_gen.cpp
@@ -22,6 +22,7 @@
 * THE SOFTWARE.
 */
 #include <migraphx/gpu/compile_gen.hpp>
+#include <migraphx/gpu/context.hpp>
 #include <migraphx/shape.hpp>
 #include <migraphx/permutation.hpp>
 #include <migraphx/stringutils.hpp>
@@ -48,12 +49,13 @@ static std::vector<std::size_t> vector_sizes(const std::vector<shape>& inputs)
    return {4, 2};
 }

-vectorize vectorize::elements(std::size_t axis, const std::vector<shape>& inputs)
+vectorize vectorize::elements(std::size_t axis,
+                              const std::vector<shape>& inputs,
+                              const std::vector<std::size_t>& sizes)
 {
    if(std::all_of(
           inputs.begin(), inputs.end(), [&](const auto& s) { return s.lens()[axis] == 1; }))
        return {1, axis};
-    auto sizes = vector_sizes(inputs);
    std::vector<std::size_t> max_vec_size;
    std::transform(inputs.begin(),
                   inputs.end(),
@@ -81,6 +83,33 @@ vectorize vectorize::elements(std::size_t axis, const std::vector<shape>& inputs
    return {*std::min_element(max_vec_size.begin(), max_vec_size.end()), axis};
 }

+vectorize vectorize::elements(context& ctx, std::size_t axis, const std::vector<shape>& inputs)
+{
+    if(inputs.empty())
+        return {1, axis};
+    std::size_t n = std::max_element(inputs.begin(),
+                                     inputs.end(),
+                                     by(std::less<>{}, [](const auto& s) { return s.elements(); }))
+                        ->elements();
+    std::size_t max_global = ctx.get_current_device().get_cu_count() *
+                             ctx.get_current_device().get_max_workitems_per_cu();
+    std::size_t over = n / max_global;
+    bool broadcasted =
+        std::any_of(inputs.begin(), inputs.end(), [](const auto& s) { return s.broadcasted(); });
+    std::vector<std::size_t> sizes;
+    if(broadcasted and over > 8)
+        sizes.push_back(8);
+    if(over > 4)
+        sizes.push_back(4);
+    sizes.push_back(2);
+    return elements(axis, inputs, sizes);
+}
+
+vectorize vectorize::elements(std::size_t axis, const std::vector<shape>& inputs)
+{
+    return elements(axis, inputs, vector_sizes(inputs));
+}
+
 std::string vectorize::str() const
 {
    return "vectorize<" + to_string(size) + ", " + to_string(axis) + ">()";
@@ -102,7 +131,7 @@ preload preload::broadcasts(std::size_t axis, const std::vector<shape>& inputs)
    std::size_t bytes = 0;
    for(auto i : preloaded)
    {
-        auto input = inputs[i];
+        const auto& input = inputs[i];
        bytes += input.bytes();
        if(bytes > max_lds_bytes)
            break;

--- a/src/targets/gpu/compile_hip_code_object.cpp
+++ b/src/targets/gpu/compile_hip_code_object.cpp
--- a/src/targets/gpu/concat.cpp
+++ b/src/targets/gpu/concat.cpp
--- a/src/targets/gpu/convert.cpp
+++ b/src/targets/gpu/convert.cpp