Merge branch 'dyn_squeeze' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into dyn_model_test

31065c7d · charlie · 6bec381f · 6acbd4e4 · 31065c7d · 31065c7d
Commit 31065c7d authored Oct 31, 2022 by charlie
20 changed files
--- a/src/pad_calc.cpp
+++ b/src/pad_calc.cpp
@@ -52,19 +52,21 @@ void calculate_padding(int64_t idx,
    }
 }

-std::vector<std::size_t> calc_dyn_auto_pad(std::vector<std::size_t> tensor_lens,
-                                           std::vector<std::size_t> k_lens,
-                                           std::vector<std::size_t> strides,
-                                           std::vector<std::size_t> dilations,
+std::vector<std::size_t> calc_dyn_auto_pad(const std::vector<std::size_t>& input_lens,
+                                           const std::vector<std::size_t>& wei_lens,
+                                           const std::vector<std::size_t>& strides,
+                                           const std::vector<std::size_t>& dilations,
                                           bool use_upper)
 {
    std::vector<std::size_t> padding;
-    padding.resize(2 * k_lens.size());
-    for(size_t i = 0; i < padding.size() / 2; i++)
+    assert(input_lens.size() >= 3);
+    std::size_t num_spatial_dims = input_lens.size() - 2;
+    padding.resize(2 * num_spatial_dims);
+    for(std::size_t i = 0; i < num_spatial_dims; i++)
    {
-        std::ptrdiff_t input_dim      = tensor_lens[i];
+        std::ptrdiff_t input_dim      = input_lens[i + 2];
        std::ptrdiff_t stride         = strides[i];
-        std::ptrdiff_t weight_dim     = k_lens[i];
+        std::ptrdiff_t weight_dim     = wei_lens[i + 2];
        std::ptrdiff_t dilation       = dilations[i];
        std::ptrdiff_t output_dim     = (input_dim + stride - 1) / stride; // round up result
        std::ptrdiff_t new_weight_dim = weight_dim + (weight_dim - 1) * (dilation - 1);
@@ -86,5 +88,37 @@ std::vector<std::size_t> calc_dyn_auto_pad(std::vector<std::size_t> tensor_lens,
    return padding;
 }

+// Computes the output shape of a convolution from explicit padding, stride and dilation.
+// The result keeps input.lens()[0], takes its second dimension from weights.lens()[0], and
+// appends one length per spatial dimension (every input dimension after the first two):
+//     max(1, (W - (1 + D * (K - 1)) + P) / S + 1)
+// where P = padding[i] + padding[i + num_spatial_dims] for spatial dimension i.
+shape compute_padded_shape(const shape& input,
+                           const shape& weights,
+                           const std::vector<std::size_t>& padding,
+                           const std::vector<std::size_t>& stride,
+                           const std::vector<std::size_t>& dilation)
+{
+    assert(input.lens().size() >= 2);
+    const std::size_t num_spatial_dims = input.lens().size() - 2;
+
+    std::vector<std::size_t> output_lens{input.lens()[0], weights.lens()[0]};
+    for(std::size_t i = 0; i < num_spatial_dims; ++i)
+    {
+        // Use signed arithmetic: with size_t operands a dilated kernel larger than the
+        // padded input wraps around, and for stride > 1 the wrapped quotient passes the
+        // clamp below as a huge bogus dimension instead of being clamped to 1.
+        const std::ptrdiff_t input_dim   = input.lens()[i + 2];
+        const std::ptrdiff_t weight_dim  = weights.lens()[i + 2];
+        const std::ptrdiff_t dil         = dilation[i];
+        const std::ptrdiff_t step        = stride[i];
+        const std::ptrdiff_t total_pad   = padding[i] + padding[i + num_spatial_dims];
+        const std::ptrdiff_t kernel_span = 1 + dil * (weight_dim - 1);
+        const std::ptrdiff_t output_dim  = (input_dim - kernel_span + total_pad) / step + 1;
+        output_lens.push_back(std::size_t(std::max<std::ptrdiff_t>(1, output_dim)));
+    }
+    return input.with_lens(output_lens);
+}
+
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
--- a/src/process.cpp
+++ b/src/process.cpp
@@ -50,7 +50,7 @@ int exec(const std::string& cmd, const std::function<void(const char*)>& std_out
    {
        // TODO: Use execve instead of popen
        std::unique_ptr<FILE, decltype(closer)> pipe(popen(cmd.c_str(), "r"), closer); // NOLINT
-        if(!pipe)
+        if(not pipe)
            MIGRAPHX_THROW("popen() failed: " + cmd);
        std::array<char, 128> buffer;
        while(fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr)

--- a/src/program.cpp
+++ b/src/program.cpp
@@ -37,6 +37,7 @@
 #include <migraphx/output_iterator.hpp>
 #include <migraphx/make_op.hpp>
 #include <migraphx/marker.hpp>
+#include <migraphx/supported_segments.hpp>
 #include <iostream>
 #include <sstream>
 #include <algorithm>
@@ -77,11 +78,11 @@ program& program::operator=(program p)

 void program::assign(const program& p)
 {
-    if(!impl)
+    if(not impl)
    {
        impl = std::make_unique<program_impl>();
    }
-    else if(!impl->modules.empty())
+    else if(not impl->modules.empty())
    {
        impl->modules.clear();
    }
@@ -167,13 +168,37 @@ target_assignments program::get_target_assignments(const std::vector<target>& ta
    target_assignments p;

    const auto* mod = get_main_module();
-    for(auto it : iterator_for(*mod))
+    std::vector<std::pair<target, supported_segments>> target_subgraphs;
+    target_subgraphs.reserve(targets.size());
+    std::transform(targets.begin(),
+                   targets.end(),
+                   std::back_inserter(target_subgraphs),
+                   [&](const auto& t) { return std::make_pair(t, t.find_supported(mod, m)); });
+
+    for(const auto ins : iterator_for(*mod))
    {
-        auto t = std::max_element(
-            targets.begin(), targets.end(), [it, m](const target& lhs, const target& rhs) {
-                return lhs.is_supported(it, m) < rhs.is_supported(it, m);
-            });
-        p.add_assignment(it, t->name());
+        if(contains(p, ins))
+        {
+            continue;
+        }
+
+        for(const auto& [target, subgraph] : target_subgraphs)
+        {
+            // structured bindings cannot be captured by a lambda in C++17, so alias the target first
+            const auto& t = target;
+            for(const auto& segment : subgraph)
+            {
+                const auto& instructions = segment.instructions;
+                if(not contains(instructions, ins))
+                {
+                    continue;
+                }
+                std::transform(instructions.begin(),
+                               instructions.end(),
+                               std::inserter(p, p.end()),
+                               [&](auto instr) { return std::make_pair(instr, t.name()); });
+            }
+        }
    }
    return p;
 }
@@ -373,7 +398,7 @@ std::vector<argument> generic_eval(const program& p,
    return generic_eval(mm, ctx, params, {}, make_trace);
 }

-std::vector<argument> program::eval(parameter_map params) const
+std::vector<argument> program::eval(parameter_map params, execution_environment exec_env) const
 {
    auto& ctx = this->impl->ctx;
 #ifndef NDEBUG
@@ -398,6 +423,12 @@ std::vector<argument> program::eval(parameter_map params) const
 #endif

    auto trace_level = value_of(MIGRAPHX_TRACE_EVAL{});
+    std::vector<argument> ret;
+
+    if(exec_env.async)
+    {
+        ctx.wait_for(exec_env.queue);
+    }

    if(trace_level > 0)
    {
@@ -409,49 +440,56 @@ std::vector<argument> program::eval(parameter_map params) const
            ins_out[x] = ss.str();
        });

-        return generic_eval(*this,
-                            ctx,
-                            std::move(params),
-                            with_check_context([&](auto& ins, auto f, auto&& check_context) {
-                                ctx.finish();
-                                std::cout << "Run instruction: " << ins_out.at(ins) << std::endl;
-                                timer t{};
-                                auto result = check_context(f);
-                                double t1   = t.record<milliseconds>();
-                                ctx.finish();
-                                double t2 = t.record<milliseconds>();
-                                std::cout << "Time: " << t1 << "ms, " << t2 << "ms" << std::endl;
-                                if(trace_level > 1 and ins->name().front() != '@' and
-                                   ins->name() != "load" and not result.empty())
-                                {
-                                    target tgt  = make_target(this->impl->target_name);
-                                    auto buffer = tgt.copy_from(result);
-                                    if(trace_level == 2)
-                                    {
-                                        std::cout << "Output has "
-                                                  << to_string_range(classify_argument(buffer))
-                                                  << std::endl;
-                                        std::cout << "Output: ";
-                                        preview_argument(std::cout, buffer);
-                                        std::cout << std::endl;
-                                    }
-                                    else
-                                    {
-                                        std::cout << "Output: " << buffer << std::endl;
-                                    }
-                                }
-                                return result;
-                            }));
+        ret = generic_eval(*this,
+                           ctx,
+                           std::move(params),
+                           with_check_context([&](auto& ins, auto f, auto&& check_context) {
+                               ctx.finish();
+                               std::cout << "Run instruction: " << ins_out.at(ins) << std::endl;
+                               timer t{};
+                               auto result = check_context(f);
+                               double t1   = t.record<milliseconds>();
+                               ctx.finish();
+                               double t2 = t.record<milliseconds>();
+                               std::cout << "Time: " << t1 << "ms, " << t2 << "ms" << std::endl;
+                               if(trace_level > 1 and ins->name().front() != '@' and
+                                  ins->name() != "load" and not result.empty())
+                               {
+                                   target tgt  = make_target(this->impl->target_name);
+                                   auto buffer = tgt.copy_from(result);
+                                   if(trace_level == 2)
+                                   {
+                                       std::cout << "Output has "
+                                                 << to_string_range(classify_argument(buffer))
+                                                 << std::endl;
+                                       std::cout << "Output: ";
+                                       preview_argument(std::cout, buffer);
+                                       std::cout << std::endl;
+                                   }
+                                   else
+                                   {
+                                       std::cout << "Output: " << buffer << std::endl;
+                                   }
+                               }
+                               return result;
+                           }));
    }
    else
    {
-        return generic_eval(*this,
-                            ctx,
-                            std::move(params),
-                            with_check_context([&](auto&, auto f, auto&& check_context) {
-                                return check_context(f);
-                            }));
+        ret = generic_eval(*this,
+                           ctx,
+                           std::move(params),
+                           with_check_context([&](auto&, auto f, auto&& check_context) {
+                               return check_context(f);
+                           }));
    }
+
+    if(exec_env.async)
+    {
+        ctx.finish_on(exec_env.queue);
+    }
+
+    return ret;
 }

 const int program_file_version = 5;

--- a/src/py/migraphx_py.cpp
+++ b/src/py/migraphx_py.cpp
@@ -40,6 +40,7 @@
 #include <migraphx/register_target.hpp>
 #include <migraphx/json.hpp>
 #include <migraphx/make_op.hpp>
+#include <migraphx/op/common.hpp>

 #ifdef HAVE_GPU
 #include <migraphx/gpu/hip.hpp>
@@ -82,7 +83,7 @@ void visit_py(T x, F f)
    {
        f(x.template cast<bool>());
    }
-    else if(py::isinstance<py::int_>(x))
+    else if(py::isinstance<py::int_>(x) or py::hasattr(x, "__index__"))
    {
        f(x.template cast<int>());
    }
@@ -263,12 +264,13 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)

    py::class_<migraphx::argument>(m, "argument", py::buffer_protocol())
        .def_buffer([](migraphx::argument& x) -> py::buffer_info { return to_buffer_info(x); })
-        .def("__init__",
-             [](migraphx::argument& x, py::buffer b) {
-                 py::buffer_info info = b.request();
-                 new(&x) migraphx::argument(to_shape(info), info.ptr);
-             })
+        .def(py::init([](py::buffer b) {
+            py::buffer_info info = b.request();
+            return migraphx::argument(to_shape(info), info.ptr);
+        }))
        .def("get_shape", &migraphx::argument::get_shape)
+        .def("data_ptr",
+             [](migraphx::argument& x) { return reinterpret_cast<std::uintptr_t>(x.data()); })
        .def("tolist",
             [](migraphx::argument& x) {
                 py::list l{x.get_shape().elements()};
@@ -324,6 +326,7 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
        .def("get_parameter_names", &migraphx::program::get_parameter_names)
        .def("get_parameter_shapes", &migraphx::program::get_parameter_shapes)
        .def("get_output_shapes", &migraphx::program::get_output_shapes)
+        .def("is_compiled", &migraphx::program::is_compiled)
        .def(
            "compile",
            [](migraphx::program& p, const migraphx::target& t, bool offload_copy, bool fast_math) {
@@ -352,24 +355,58 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
                 }
                 return p.eval(pm);
             })
+        .def("run_async",
+             [](migraphx::program& p,
+                py::dict params,
+                std::uintptr_t stream,
+                std::string stream_name) {
+                 migraphx::parameter_map pm;
+                 for(auto x : params)
+                 {
+                     std::string key      = x.first.cast<std::string>();
+                     py::buffer b         = x.second.cast<py::buffer>();
+                     py::buffer_info info = b.request();
+                     pm[key]              = migraphx::argument(to_shape(info), info.ptr);
+                 }
+                 migraphx::execution_environment exec_env{
+                     migraphx::any_ptr(reinterpret_cast<void*>(stream), stream_name), true};
+                 return p.eval(pm, exec_env);
+             })
        .def("sort", &migraphx::program::sort)
        .def("print", [](const migraphx::program& p) { std::cout << p << std::endl; })
        .def("__eq__", std::equal_to<migraphx::program>{})
        .def("__ne__", std::not_equal_to<migraphx::program>{})
        .def("__repr__", [](const migraphx::program& p) { return migraphx::to_string(p); });

-    py::class_<migraphx::operation>(m, "op")
-        .def(py::init([](const std::string& name, py::kwargs kwargs) {
-            migraphx::value v = migraphx::value::object{};
-            if(kwargs)
-            {
-                v = migraphx::to_value(kwargs);
-            }
-            return migraphx::make_op(name, v);
-        }))
-
+    py::class_<migraphx::operation> op(m, "op");
+    op.def(py::init([](const std::string& name, py::kwargs kwargs) {
+          migraphx::value v = migraphx::value::object{};
+          if(kwargs)
+          {
+              v = migraphx::to_value(kwargs);
+          }
+          return migraphx::make_op(name, v);
+      }))
        .def("name", &migraphx::operation::name);

+    py::enum_<migraphx::op::pooling_mode>(op, "pooling_mode")
+        .value("average", migraphx::op::pooling_mode::average)
+        .value("max", migraphx::op::pooling_mode::max)
+        .value("lpnorm", migraphx::op::pooling_mode::lpnorm);
+
+    py::enum_<migraphx::op::rnn_direction>(op, "rnn_direction")
+        .value("forward", migraphx::op::rnn_direction::forward)
+        .value("reverse", migraphx::op::rnn_direction::reverse)
+        .value("bidirectional", migraphx::op::rnn_direction::bidirectional);
+
+    m.def(
+        "argument_from_pointer",
+        [](const migraphx::shape shape, const int64_t address) {
+            return migraphx::argument(shape, reinterpret_cast<void*>(address));
+        },
+        py::arg("shape"),
+        py::arg("address"));
+
    m.def(
        "parse_tf",
        [](const std::string& filename,

--- a/src/quantization.cpp
+++ b/src/quantization.cpp
@@ -70,7 +70,7 @@ void quantize_int8(program& prog,
 {
    std::set<std::string> op_names = {"convolution", "dot"};
    std::set<std::string> input_ins_names(ins_names.begin(), ins_names.end());
-    if(!std::includes(
+    if(not std::includes(
           op_names.begin(), op_names.end(), input_ins_names.begin(), input_ins_names.end()))
    {
        MIGRAPHX_THROW("QUANTIZE_INT8: only support DOT and CONVOLUTION operation");

--- a/src/replace_allocate.cpp
+++ b/src/replace_allocate.cpp
@@ -73,7 +73,7 @@ void insert_submod_allocations(instruction_ref ins, module& mod, const allocatio
        name_shapes.insert(ps.begin(), ps.end());
    }

-    for(auto& pn : name_shapes)
+    for(const auto& pn : name_shapes)
    {
        const auto& s = pn.second;
        instruction_ref output{};

--- a/src/rewrite_batchnorm.cpp
+++ b/src/rewrite_batchnorm.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/rewrite_batchnorm.hpp>
-#include <migraphx/program.hpp>
-#include <migraphx/instruction.hpp>
-#include <migraphx/op/batch_norm_inference.hpp>
-#include <migraphx/op/broadcast.hpp>
-#include <migraphx/op/add.hpp>
-#include <migraphx/op/mul.hpp>
-#include <migraphx/iterator_for.hpp>
-#include <migraphx/ranges.hpp>
-#include <migraphx/make_op.hpp>
-
-#include <migraphx/dfor.hpp>
-
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-
-void rewrite_batchnorm::apply(module& m) const
-{
-    for(auto ins : iterator_for(m))
-    {
-        if(ins->name() != "batch_norm_inference")
-            continue;
-        // Get scale, bias, mean, variance from inputs
-        auto gamma    = ins->inputs()[1]->eval();
-        auto bias     = ins->inputs()[2]->eval();
-        auto mean     = ins->inputs()[3]->eval();
-        auto variance = ins->inputs()[4]->eval();
-        if(any_of({gamma, bias, mean, variance}, [](auto arg) { return arg.empty(); }))
-            continue;
-
-        std::vector<std::size_t> lens = ins->inputs()[1]->get_shape().lens();
-        shape s{ins->get_shape().type(), lens};
-        // Get epsilon
-        auto bn_op   = any_cast<op::batch_norm_inference>(ins->get_operator());
-        auto epsilon = bn_op.epsilon;
-
-        argument a{s};
-        argument b{s};
-        visit_all(gamma, bias, mean, variance, a, b)(
-            [&](auto gamma2, auto bias2, auto mean2, auto variance2, auto a2, auto b2) {
-                dfor(a.get_shape().elements())(
-                    [&](std::size_t c) { a2[c] = gamma2[c] / std::sqrt(variance2[c] + epsilon); });
-                dfor(b.get_shape().elements())([&](std::size_t c) {
-                    b2[c] = bias2[c] - (gamma2[c] * mean2[c] / std::sqrt(variance2[c] + epsilon));
-                });
-            });
-
-        auto broadcast   = op::broadcast{1, ins->get_shape().lens()};
-        auto a_ins       = m.add_literal({a.get_shape(), a.data()});
-        auto a_broadcast = m.insert_instruction(ins, broadcast, a_ins);
-        auto mul   = m.insert_instruction(ins, make_op("mul"), ins->inputs().front(), a_broadcast);
-        auto b_ins = m.add_literal({b.get_shape(), b.data()});
-        auto b_broadcast = m.insert_instruction(ins, broadcast, b_ins);
-        auto add         = m.insert_instruction(ins, make_op("add"), mul, b_broadcast);
-        m.replace_instruction(ins, add);
-    }
-}
-
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/include/migraphx/op/batch_norm_inference.hpp
+++ b/src/include/migraphx/op/batch_norm_inference.hpp
@@ -21,50 +21,39 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_OPERATORS_BATCH_NORM_HPP
-#define MIGRAPHX_GUARD_OPERATORS_BATCH_NORM_HPP

-#include <migraphx/check_shapes.hpp>
-#include <migraphx/config.hpp>
-#include <cmath>
+#include <migraphx/rewrite_gelu.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/matcher.hpp>
+#include <migraphx/match/gelu_erf.hpp>
+#include <migraphx/common.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-namespace op {

-struct batch_norm_inference
+struct find_gelu_erf
 {
-    float epsilon  = 1.0e-6f;
-    float momentum = 0.9f;
+    auto matcher() const { return match::gelu_erf(); }

-    std::string name() const { return "batch_norm_inference"; }
-
-    enum bn_infer_mode_t
+    void apply(module& m, const match::matcher_result& r) const
    {
-        per_activation,
-        spatial,
-    };
-
-    bn_infer_mode_t bn_mode = spatial;
-
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return pack(
-            f(self.epsilon, "epsilon"), f(self.momentum, "momentum"), f(self.bn_mode, "bn_mode"));
-    }
-
-    shape compute_shape(std::vector<shape> inputs) const
-    {
-        check_shapes{inputs, *this}.has(5);
-        check_shapes{inputs.data(), inputs.data() + 1, *this}.same_ndims();
-        check_shapes{inputs.data() + 1, inputs.data() + inputs.size(), *this}.same_shape();
-        return inputs.front();
+        auto ins = r.result;
+        auto x   = r.instructions["x"];
+        if(x->get_shape().type() != migraphx::shape::half_type)
+            return;
+
+        auto lit = m.add_literal(literal{shape{x->get_shape().type()}, {1.702f}});
+        auto mul = insert_common_op(m, ins, make_op("mul"), {x, lit});
+        auto sig = m.insert_instruction(ins, make_op("neg"), mul);
+        sig      = m.insert_instruction(ins, make_op("exp"), sig);
+        auto one = m.add_literal(literal{shape{x->get_shape().type()}, {1.0f}});
+        sig      = insert_common_op(m, ins, make_op("add"), {sig, one});
+        sig      = m.insert_instruction(ins, make_op("div"), x, sig);
+        m.replace_instruction(ins, sig);
    }
 };

-} // namespace op
+void rewrite_gelu::apply(module& m) const { match::find_matches(m, find_gelu_erf{}); }
+
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
-
-#endif
--- a/src/rewrite_pooling.cpp
+++ b/src/rewrite_pooling.cpp
@@ -47,12 +47,12 @@ void rewrite_pooling::apply(module& m) const
        if(not s.standard())
            continue;
        auto&& op = any_cast<op::pooling>(ins->get_operator());
-        if(!std::all_of(op.padding.begin(), op.padding.end(), [](auto i) { return i == 0; }))
+        if(not std::all_of(op.padding.begin(), op.padding.end(), [](auto i) { return i == 0; }))
            continue;
-        if(!std::all_of(op.stride.begin(), op.stride.end(), [](auto i) { return i == 1; }))
+        if(not std::all_of(op.stride.begin(), op.stride.end(), [](auto i) { return i == 1; }))
            continue;
        auto lens = s.lens();
-        if(!std::equal(lens.begin() + 2, lens.end(), op.lengths.begin(), op.lengths.end()))
+        if(not std::equal(lens.begin() + 2, lens.end(), op.lengths.begin(), op.lengths.end()))
            continue;
        std::int64_t n = s.lens()[0];
        std::int64_t c = s.lens()[1];

--- a/src/rewrite_rnn.cpp
+++ b/src/rewrite_rnn.cpp
@@ -46,9 +46,6 @@
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/dfor.hpp>
 #include <migraphx/ranges.hpp>
-#include <migraphx/op/common.hpp>
-#include <migraphx/op/rnn_var_sl_last_output.hpp>
-#include <migraphx/op/rnn_variable_seq_lens.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -214,7 +211,7 @@ void rewrite_rnn::apply_vanilla_rnn(module& m, instruction_ref ins) const
            ih = m.add_literal(migraphx::literal{ih_shape, data});
        }

-        if(!is_forward and variable_seq_len)
+        if(not is_forward and variable_seq_len)
        {
            args[0] =
                m.insert_instruction(ins, make_op("rnn_var_sl_shift_sequence"), args[0], seq_lens);
@@ -520,7 +517,7 @@ void rewrite_rnn::apply_gru(module& m, instruction_ref ins) const
            ih = m.add_literal(migraphx::literal{ih_shape, data});
        }

-        if(!is_forward and variable_seq_len)
+        if(not is_forward and variable_seq_len)
        {
            args[0] =
                m.insert_instruction(ins, make_op("rnn_var_sl_shift_sequence"), args[0], seq_lens);
@@ -977,7 +974,7 @@ void rewrite_rnn::apply_lstm(module& m, instruction_ref ins) const
            pph = args[7];
        }

-        if(!is_forward and variable_seq_len)
+        if(not is_forward and variable_seq_len)
        {
            args[0] =
                m.insert_instruction(ins, make_op("rnn_var_sl_shift_sequence"), args[0], seq_lens);
@@ -1294,11 +1291,11 @@ bool rewrite_rnn::is_variable_seq_lens(const module& m, instruction_ref seq_lens
            std::vector<int64_t> vec_lens;
            arg_lens.visit([&](auto l) { vec_lens.assign(l.begin(), l.end()); });
            int64_t l = 0;
-            if(!vec_lens.empty())
+            if(not vec_lens.empty())
            {
                l = vec_lens[0];
            }
-            if(!std::all_of(vec_lens.begin(), vec_lens.end(), [&](auto v) { return v == l; }))
+            if(not std::all_of(vec_lens.begin(), vec_lens.end(), [&](auto v) { return v == l; }))
            {
                is_var_lens = true;
            }
@@ -1318,7 +1315,7 @@ rewrite_rnn::get_seq_len(const module& m, instruction_ref input, instruction_ref
    bool is_var_lens = is_variable_seq_lens(m, seq_lens);
    auto input_shape = input->get_shape();
    auto length      = input_shape.lens()[0];
-    if(!is_var_lens and seq_lens != m.end())
+    if(not is_var_lens and seq_lens != m.end())
    {
        auto arg_len = seq_lens->eval();
        std::vector<std::size_t> vec_lens;
@@ -1387,7 +1384,7 @@ void rewrite_rnn::replace_last_cell_output(module& m,

    if(variable_seq_len)
    {
-        if(!ins_outputs.empty())
+        if(not ins_outputs.empty())
        {
            cell_outputs = m.insert_instruction(
                std::next(ins),

--- a/src/shape.cpp
+++ b/src/shape.cpp
@@ -71,6 +71,19 @@ struct shape_impl
    {
    }

+    shape_impl(shape::type_t t,
+               std::vector<std::size_t> mins,
+               std::vector<std::size_t> maxes,
+               std::vector<std::size_t> opts)
+        : m_type(t)
+    {
+        assert(mins.size() == maxes.size() and maxes.size() == opts.size());
+        for(size_t i = 0; i < mins.size(); ++i)
+        {
+            m_dyn_dims.push_back(shape::dynamic_dimension{mins[i], maxes[i], opts[i]});
+        }
+    }
+
    shape_impl(const std::vector<shape>& subs) : m_type(shape::tuple_type), m_shapes(subs) {}

    shape::type_t m_type;
@@ -224,6 +237,14 @@ shape::shape(type_t t, std::vector<shape::dynamic_dimension> dims)
 {
 }

+shape::shape(type_t t,
+             std::vector<std::size_t> mins,
+             std::vector<std::size_t> maxes,
+             std::vector<std::size_t> opts)
+    : impl(std::make_shared<shape_impl>(t, std::move(mins), std::move(maxes), std::move(opts)))
+{
+}
+
 shape::shape(const std::vector<shape>& subs) : impl(std::make_shared<shape_impl>(subs)) {}

 shape::shape(std::shared_ptr<shape_impl> pimpl) : impl(std::move(pimpl)) {}
@@ -244,6 +265,15 @@ const std::vector<std::size_t>& shape::lens() const { return impl->m_lens; }

 const std::vector<std::size_t>& shape::strides() const { return impl->m_strides; }

+std::size_t shape::ndim() const
+{
+    if(this->dynamic())
+    {
+        return dyn_dims().size();
+    }
+    return lens().size();
+}
+
 std::size_t shape::elements() const { return impl->elements(); }

 std::size_t shape::bytes() const
@@ -437,6 +467,16 @@ shape shape::with_type(type_t t) const
    return {c};
 }

+shape shape::to_dynamic() const
+{
+    if(this->dynamic())
+    {
+        return *this;
+    }
+    std::vector<std::size_t> zeroes(this->ndim(), 0);
+    return {type(), lens(), lens(), zeroes};
+}
+
 std::size_t shape::element_space() const { return impl->element_space(); }

 std::string shape::type_string() const { return name(this->type()); }
@@ -464,20 +504,16 @@ bool shape::dynamic_dimension::is_fixed() const { return this->min == this->max;

 bool shape::dynamic_dimension::has_optimal() const { return opt != 0; }

-template <class Self, class F>
-auto shape::dynamic_dimension::reflect(Self& self, F f)
-{
-    return pack(f(self.min, "min"), f(self.max, "max"), f(self.opt, "opt"));
-}
-
 bool operator==(const shape::dynamic_dimension& x, const shape::dynamic_dimension& y)
 {
-    return (x.min == y.min and x.max == y.max and x.opt == y.opt);
+    // don't check opt if both are fixed
+    return (x.min == y.min and x.max == y.max and
+            ((x.is_fixed() and y.is_fixed()) or (x.opt == y.opt)));
 }

 bool operator!=(const shape::dynamic_dimension& x, const shape::dynamic_dimension& y)
 {
-    return !(x == y);
+    return not(x == y);
 }
 std::ostream& operator<<(std::ostream& os, const shape::dynamic_dimension& x)
 {
@@ -497,7 +533,7 @@ bool operator==(const shape& x, const shape& y)
            x.strides() == y.strides() and x.sub_shapes() == y.sub_shapes());
 }

-bool operator!=(const shape& x, const shape& y) { return !(x == y); }
+bool operator!=(const shape& x, const shape& y) { return not(x == y); }

 std::ostream& operator<<(std::ostream& os, const shape& x)
 {

--- a/src/simplify_algebra.cpp
+++ b/src/simplify_algebra.cpp
@@ -57,12 +57,14 @@ auto conv_const_weights()

 auto reduction() { return match::name_contains("reduce"); }

+// conv(x, w) * a => conv(x, a * w)
 struct find_mul_conv
 {
    auto matcher() const
    {
-        return match::name("mul")(match::either_arg(0, 1)(conv_const_weights().bind("conv"),
-                                                          match::name("broadcast").bind("a")));
+        return match::name("mul")(
+            match::either_arg(0, 1)(conv_const_weights().bind("conv"),
+                                    match::name("broadcast", "multibroadcast").bind("a")));
    }

    void apply(module& m, const match::matcher_result& r) const
@@ -72,14 +74,35 @@ struct find_mul_conv
        auto a_ins    = r.instructions["a"];
        auto w_ins    = r.instructions["w"];

-        auto broadcast_op = any_cast<op::broadcast>(a_ins->get_operator());
-        if(broadcast_op.axis != 1)
+        const auto& a_input_lens = a_ins->inputs().front()->get_shape().lens();
+
+        std::size_t num_not_one_dims = std::count_if(
+            a_input_lens.cbegin(), a_input_lens.cend(), [](auto dim) { return dim != 1; });
+        if(num_not_one_dims > 1)
+            return;
+
+        // check broadcasted along channels
+        const auto& a_lens    = a_ins->get_shape().lens();
+        const auto& a_strides = a_ins->get_shape().strides();
+
+        auto is_broadcasted_axis = [](auto len, auto stride) { return len == 1 or stride == 0; };
+
+        if(a_strides.at(1) != 1)
            return;

+        if(not is_broadcasted_axis(a_lens.front(), a_strides.front()))
+            return;
+
+        if(not std::equal(a_lens.begin() + 2,
+                          a_lens.end(),
+                          a_strides.begin() + 2,
+                          a_strides.end(),
+                          is_broadcasted_axis))
+            return;
+
+        auto sq    = m.insert_instruction(ins, make_op("squeeze"), a_ins->inputs().front());
        auto new_a = m.insert_instruction(
-            ins,
-            make_op("broadcast", {{"axis", 0}, {"out_lens", w_ins->get_shape().lens()}}),
-            a_ins->inputs().front());
+            ins, make_op("broadcast", {{"axis", 0}, {"out_lens", w_ins->get_shape().lens()}}), sq);
        auto new_mul  = m.insert_instruction(ins, make_op("mul"), new_a, w_ins);
        auto new_conv = m.insert_instruction(
            ins, conv_ins->get_operator(), conv_ins->inputs().front(), new_mul);
@@ -208,6 +231,42 @@ struct find_mul_add
    }
 };

+struct find_dot_add // dot(a, x + b) => dot(a, x) + dot(a, b) for constant a, b (either dot order)
+{
+    auto matcher() const
+    {
+        return match::name("dot")(match::either_arg(0, 1)(
+            match::name("add")(
+                match::either_arg(0, 1)(match::any().bind("x"),
+                                        match::any_of(match::is_constant()).bind("b")),
+                match::none_of(match::args(match::is_constant(), match::is_constant())), // skip const + const
+                match::used_once()), // presumably: the add has no other user — confirm
+            match::is_constant().bind("a")));
+    }
+
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins   = r.result;
+        auto a_ins = r.instructions["a"];
+        auto b_ins = r.instructions["b"];
+        auto x_ins = r.instructions["x"];
+        assert(x_ins != b_ins);
+
+        const bool flipped = a_ins == ins->inputs().back(); // a is the last input of the dot
+
+        auto insert_dot = [&](auto x, auto y) { // keeps the first argument on the side a occupied
+            if(flipped)
+                return m.insert_instruction(ins, make_op("dot"), y, x);
+            else
+                return m.insert_instruction(ins, make_op("dot"), x, y);
+        };
+
+        auto ax_ins = insert_dot(a_ins, x_ins);
+        auto ab_ins = insert_dot(a_ins, b_ins);
+        m.replace_instruction(ins, make_op("add"), ax_ins, ab_ins); // ins becomes the sum of both dots
+    }
+};
+
 struct find_add_lit_broadcast
 {
    auto matcher() const
@@ -267,28 +326,26 @@ struct find_double_add_lit_broadcast

 struct find_inner_broadcast
 {
-    auto matcher() const
-    {
-        return pointwise(
-            match::nargs(2),
-            match::args(match::name("broadcast").bind("x"), match::name("broadcast").bind("y")));
-    }
+    auto matcher() const { return pointwise(match::all_of[match::inputs()](match::broadcast())); }

    void apply(module& m, const match::matcher_result& r) const
    {
-        auto ins   = r.result;
-        auto x_ins = r.instructions["x"];
-        auto y_ins = r.instructions["y"];
-
-        auto xbroadcast = any_cast<op::broadcast>(x_ins->get_operator());
-        auto ybroadcast = any_cast<op::broadcast>(y_ins->get_operator());
-
-        if(xbroadcast.axis != ybroadcast.axis)
+        auto ins        = r.result;
+        auto broadcasts = ins->inputs();
+        if(broadcasts.empty())
+            return;
+        std::vector<instruction_ref> inputs;
+        std::transform(broadcasts.begin(),
+                       broadcasts.end(),
+                       std::back_inserter(inputs),
+                       [](auto i) { return i->inputs().front(); });
+        if(std::any_of(inputs.begin(), inputs.end(), [&](auto i) {
+               return i->get_shape() != inputs.front()->get_shape();
+           }))
            return;

-        auto op = m.insert_instruction(
-            ins, ins->get_operator(), x_ins->inputs().front(), y_ins->inputs().front());
-        m.replace_instruction(ins, xbroadcast, op);
+        auto op = m.insert_instruction(ins, ins->get_operator(), inputs);
+        m.replace_instruction(ins, broadcasts.front()->get_operator(), op);
    }
 };

@@ -378,6 +435,24 @@ struct find_concat_op
    }
 };

+void move_instructions_back(module& m, instruction_ref pos, std::vector<instruction_ref> inss)
+{ // Moves each instruction in inss (its inputs first, recursively) to pos; inss is a local copy.
+    auto start = range(m.begin(), pos);
+    for(auto ins : iterator_for(start))
+    {
+        auto it = std::find(inss.begin(), inss.end(), ins);
+        if(it != inss.end())
+            inss.erase(it); // already located in [m.begin(), pos): nothing to move
+    }
+    for(auto ins : inss)
+    {
+        if(not m.has_instruction(ins))
+            continue; // not an instruction of this module
+        move_instructions_back(m, pos, ins->inputs()); // inputs are handled before ins itself
+        m.move_instruction(ins, pos); // presumably re-links ins just before pos — confirm
+    }
+}
+
 std::vector<instruction_ref> get_splits(instruction_ref ins)
 {
    std::vector<instruction_ref> result;
@@ -416,8 +491,9 @@ struct find_splits
 {
    auto matcher() const
    {
-        return match::any(match::any_of[match::outputs()](match::name("slice")(
-            match::any_of[match::outputs()](match::pointwise(), reduction()))));
+        return match::any(
+            match::any_of[match::outputs()](match::name("slice")(match::any_of[match::outputs()](
+                match::pointwise(match::any_of(match::nargs(1), match::nargs(2))), reduction()))));
    }

    static bool is_dependent(const module& m, instruction_ref ins1, instruction_ref ins2)
@@ -552,8 +628,7 @@ struct find_splits
                   }))
                    return;

-                for(auto data : data_args)
-                    m.move_instructions(data, ins);
+                move_instructions_back(m, ins, data_args);

                auto slice_op = any_cast<op::slice>(splits.front()->get_operator());
                assert(not slice_op.axes.empty());
@@ -580,10 +655,9 @@ struct find_splits
                    auto outputs = i->outputs();
                    for(auto output : outputs)
                    {
-                        if(not contains({"reshape", "squeeze", "unsqueeze"}, output->name()))
+                        if(output->name() != "reshape")
                            continue;
-                        auto x =
-                            m.insert_instruction(output, make_op("contiguous"), output->inputs());
+                        auto x = m.insert_instruction(output, make_op("contiguous"), i);
                        m.replace_instruction(output, output->get_operator(), x);
                    }

@@ -753,7 +827,7 @@ MIGRAPHX_PRED_MATCHER(horiz_conv_dot, instruction_ref ins)
    };
    auto dots  = std::count_if(ins->outputs().begin(), ins->outputs().end(), pred("dot"));
    auto convs = std::count_if(ins->outputs().begin(), ins->outputs().end(), pred("convolution"));
-    return !(dots < 2 and convs < 2);
+    return (dots >= 2 or convs >= 2);
 }

 struct find_conv_dot_horiz_fusion
@@ -773,7 +847,7 @@ struct find_conv_dot_horiz_fusion
            auto y = j->inputs()[1]->get_shape().lens();
            if(x.size() != y.size())
                return false;
-            // Check that non-axises match
+            // Check that non-axes match
            int axis = 1;
            if(i->name() == "dot")
            {
@@ -807,15 +881,23 @@ struct find_conv_dot_horiz_fusion
                concat_axis = axis;
            }

-            for(auto arg : args)
-                m.move_instructions(arg, input);
-            // TODO: Check if axises match
+            move_instructions_back(m, input, args);
+            // TODO: Check if axes match
            auto concat =
                m.insert_instruction(input, make_op("concat", {{"axis", concat_axis}}), args);
            auto fused     = m.insert_instruction(std::next(input), op, input, concat);
            int64_t offset = 0;
            for(auto arg : range(start, last))
            {
+                auto outputs = arg->outputs();
+                for(auto output : outputs)
+                {
+                    if(output->name() != "reshape")
+                        continue;
+                    auto x = m.insert_instruction(output, make_op("contiguous"), arg);
+                    m.replace_instruction(output, output->get_operator(), x);
+                }
+
                int64_t len = arg->get_shape().lens()[axis];
                m.replace_instruction(
                    arg,
@@ -851,6 +933,73 @@ struct find_div_const
    }
 };

+struct find_unit_ops // x * 1, 1 * x, x / 1, x + 0, 0 + x and x - 0 => x
+{
+    auto matcher() const
+    {
+        auto mul_1 = match::name("mul")(
+            match::either_arg(0, 1)(match::has_value(1.0f), match::any().bind("x")));
+        auto div_1 =
+            match::name("div")(match::args(match::any().bind("x"), match::has_value(1.0f)));
+        auto add_0 = match::name("add")( // NOTE(review): only pattern passing 1e-12 to has_value
+            match::either_arg(0, 1)(match::has_value(0.0f, 1e-12), match::any().bind("x")));
+        auto sub_0 =
+            match::name("sub")(match::args(match::any().bind("x"), match::has_value(0.0f)));
+        return match::any_of(mul_1, div_1, add_0, sub_0);
+    }
+
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins  = r.result;
+        auto c_in = r.instructions["x"]; // the operand that is not the unit value
+
+        m.replace_instruction(ins, c_in); // the whole op collapses to that operand
+    }
+};
+
+struct find_neg_unit_ops // x * -1, -1 * x, x / -1 and 0 - x => neg(x)
+{
+    auto matcher() const
+    {
+        auto mul_neg_1 = match::name("mul")(
+            match::either_arg(0, 1)(match::has_value(-1.0f), match::any().bind("x")));
+        auto div_neg_1 =
+            match::name("div")(match::args(match::any().bind("x"), match::has_value(-1.0f)));
+        auto sub_0 =
+            match::name("sub")(match::args(match::has_value(0.0f), match::any().bind("x")));
+        return match::any_of(mul_neg_1, div_neg_1, sub_0);
+    }
+
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins  = r.result;
+        auto c_in = r.instructions["x"]; // the operand being negated
+        // Insert at ins instead of appending, so neg is placed ahead of the users of ins.
+        auto neg = m.insert_instruction(ins, make_op("neg"), c_in);
+        m.replace_instruction(ins, neg);
+    }
+};
+
+struct find_zero_ops // 0 * y, y * 0 and 0 / y => the zero operand
+{
+    auto matcher() const
+    {
+        auto mul_zero = match::name("mul")(
+            match::either_arg(0, 1)(match::has_value(0.0f).bind("x"), match::any()));
+        auto div_zero = // NOTE(review): the divisor is unconstrained, so 0 / 0 is rewritten too
+            match::name("div")(match::args(match::has_value(0.0f).bind("x"), match::any()));
+        return match::any_of(mul_zero, div_zero);
+    }
+
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins      = r.result;
+        auto zero_ins = r.instructions["x"]; // here "x" is bound to the zero-valued operand
+
+        m.replace_instruction(ins, zero_ins);
+    }
+};
+
 struct find_sub_const
 {
    auto matcher() const
@@ -926,7 +1075,7 @@ struct find_split_reshape

        // all outputs are reshape and of the same shape
        auto dims = any_cast<op::reshape>(rsp->get_operator()).dims;
-        if(!same_ops(vec_rsp))
+        if(not same_ops(vec_rsp))
        {
            return;
        }
@@ -942,23 +1091,42 @@ struct find_split_reshape
        auto rsp_lens    = rsp->get_shape().lens();
        auto rsp_strides = rsp->get_shape().strides();
        rsp_strides.insert(rsp_strides.begin(), rsp_strides[0] * rsp_lens[0]);
-        auto ait = std::find(rsp_strides.begin(), rsp_strides.end(), slc_dim_size);
+
+        auto ait     = std::find(rsp_strides.begin(), rsp_strides.end(), slc_dim_size);
+        int rsp_axis = -1;
        if(ait == rsp_strides.end())
        {
            return;
        }
-        int rsp_axis = std::distance(rsp_strides.begin(), ait);
-
+        else if(ait == rsp_strides.end() - 1)
+        {
+            // edge case
+            // slice_dim == 1, in that case it could match with last stride of 1.
+            // it should accumulate lengths from last dim in that case. discount 1 to avoid going
+            // out of bounds.
+            assert(slc_dim_size == 1);
+            rsp_axis = std::distance(rsp_strides.begin(), ait) - 1;
+        }
+        else
+        {
+            rsp_axis = std::distance(rsp_strides.begin(), ait);
+        }
        // calculate reshape output shape
        std::vector<int64_t> vec_dims(vec_rsp.size());
+
        std::transform(vec_rsp.begin(), vec_rsp.end(), vec_dims.begin(), [&](auto is) {
            return is->get_shape().lens()[rsp_axis];
        });

        std::vector<int64_t> rsp_out_lens(rsp_lens.begin(), rsp_lens.end());
+
        rsp_out_lens[rsp_axis] = std::accumulate(vec_dims.begin(), vec_dims.end(), std::int64_t{0});

-        // insert the reshape instruction
+        // insert the reshape instruction and add contiguous if needed
+        if(not input->get_shape().standard())
+        {
+            input = m.insert_instruction(std::next(input), make_op("contiguous"), input);
+        }
        auto rsp_ins = m.insert_instruction(
            std::next(input), make_op("reshape", {{"dims", rsp_out_lens}}), input);

@@ -1005,7 +1173,7 @@ struct find_split_transpose

        // all transpose are the same
        auto perm = any_cast<op::transpose>(trans->get_operator()).dims;
-        if(!same_ops(vec_trans))
+        if(not same_ops(vec_trans))
        {
            return;
        }
@@ -1048,6 +1216,10 @@ void simplify_algebra::apply(module& m) const
                            find_mul_conv{},
                            find_mul_slice_conv{},
                            find_mul_add{},
+                            find_unit_ops{},
+                            find_neg_unit_ops{},
+                            find_zero_ops{},
+                            find_dot_add{},
                            find_div_const{},
                            find_sub_const{},
                            find_rsqrt{},

--- a/src/simplify_reshapes.cpp
+++ b/src/simplify_reshapes.cpp
@@ -99,7 +99,7 @@ struct find_reshaper
        std::vector<instruction_ref> reshapes{ins};
        while(is_reshaper(reshapes.back()))
        {
-            assert(!reshapes.back()->inputs().empty());
+            assert(not reshapes.back()->inputs().empty());
            assert(m.has_instruction(reshapes.back()->inputs().front()));
            auto input = reshapes.back()->inputs().front();
            reshapes.push_back(input);
@@ -151,8 +151,11 @@ struct find_transpose
 {
    auto matcher() const
    {
-        return match::name("transpose")(match::none_of(
-            match::skip_output(match::name("contiguous"))(match::name("transpose"))));
+        auto output_not_transpose =
+            match::none_of(match::skip_output(match::name("contiguous"))(match::name("transpose")));
+        auto input_has_transpose =
+            match::args(match::skip(match::name("contiguous"))(match::name("transpose")));
+        return match::name("transpose")(output_not_transpose, input_has_transpose);
    }

    void apply(module& m, const match::matcher_result& mr) const
@@ -268,6 +271,44 @@ struct find_nested_slice
    }
 };

+struct find_concat_multibroadcasts // concat(multibroadcast(x)...) => multibroadcast(concat(x...))
+{
+    auto matcher() const
+    {
+        return match::name("concat")(match::all_of[match::inputs()](match::name("multibroadcast")));
+    }
+
+    void apply(module& m, const match::matcher_result& mr) const
+    {
+        auto ins        = mr.result;
+        auto op         = any_cast<op::concat>(ins->get_operator());
+        auto out_lens   = ins->get_shape().lens();
+        auto inputs     = ins->inputs();
+        auto in_strides = inputs.front()->get_shape().strides(); // taken before inputs is rewritten
+
+        // Only apply when concat axis is not a broadcasted dimension
+        if(std::any_of(inputs.begin(), inputs.end(), [&](auto i) {
+               return i->get_shape().strides()[op.axis] == 0;
+           }))
+        {
+            return;
+        }
+
+        // Use inputs of multibroadcast ops as inputs to new concat op
+        std::transform(inputs.begin(), inputs.end(), inputs.begin(), [](auto i) {
+            return i->inputs().front();
+        });
+
+        // Reduce axis by number of leading broadcasted dimensions
+        if(inputs.front()->get_shape().lens().size() < out_lens.size()) // pre-broadcast rank is lower
+            op.axis -= std::count(in_strides.begin(), in_strides.begin() + op.axis, 0); // first input only
+
+        auto concat = m.insert_instruction(ins, op, inputs);
+        m.replace_instruction( // broadcast the concatenated result back to the original output lens
+            ins, migraphx::make_op("multibroadcast", {{"out_lens", out_lens}}), concat);
+    }
+};
+
 struct find_concat_transpose
 {
    auto matcher() const
@@ -285,7 +326,7 @@ struct find_concat_transpose
        auto permutation = find_permutation(s);

        // permutation should be the same for all inputs
-        if(!std::all_of(trans_inputs.begin(), trans_inputs.end(), [&](auto in) {
+        if(not std::all_of(trans_inputs.begin(), trans_inputs.end(), [&](auto in) {
               return (find_permutation(in->get_shape()) == permutation);
           }))
        {
@@ -664,9 +705,94 @@ struct find_slice_transpose
    }
 };

+struct find_transpose_slice // transpose used only by slices => unsqueeze, transpose, slice, squeeze
+{
+    auto matcher() const
+    {
+        return match::name("transpose")(match::all_of[match::outputs()](match::name("slice")));
+    }
+
+    static std::vector<int64_t> slice_distance(const op::slice& op) // ends[i] - starts[i] per axis
+    {
+        assert(op.starts.size() == op.ends.size());
+        std::vector<int64_t> result(op.starts.size());
+        std::transform(
+            op.ends.begin(), op.ends.end(), op.starts.begin(), result.begin(), std::minus<>{});
+        return result;
+    }
+
+    void apply(module& m, const match::matcher_result& r) const
+    {
+        auto ins    = r.result;
+        auto slices = ins->outputs();
+        if(slices.empty())
+            return;
+        auto slice     = any_cast<op::slice>(slices.front()->get_operator());
+        auto sdistance = slice_distance(slice);
+        // Check all distances and axes are the same
+        if(std::any_of(slices.begin(), slices.end(), [&](auto sins) {
+               auto s = any_cast<op::slice>(sins->get_operator());
+               return s.axes != slice.axes or slice_distance(s) != sdistance;
+           }))
+            return;
+        // Check axis lens, starts and ends are all divisible by the slice distance
+        auto mod_by_distance = [&](const auto& v, auto f) { // sums f(v[i]) % sdistance[i]
+            return std::inner_product(v.begin(),
+                                      v.end(),
+                                      sdistance.begin(),
+                                      0,
+                                      std::plus<>{},
+                                      [&](auto x, auto d) -> uint64_t {
+                                          if(d == 0)
+                                              return 1; // zero distance => reject
+                                          return f(x) % d;
+                                      });
+        };
+        if(mod_by_distance(slice.axes, [&](auto x) { return ins->get_shape().lens()[x]; }) != 0 or
+           mod_by_distance(slice.starts, id{}) != 0 or mod_by_distance(slice.ends, id{}) != 0)
+            return;
+        // TODO: Handle multiple axes
+        if(sdistance.size() != 1)
+            return;
+        auto axis = slice.axes.front();
+        // Skip if axis would be packed
+        if(std::all_of(ins->get_shape().lens().begin(),
+                       ins->get_shape().lens().begin() + axis,
+                       [](auto x) { return x == 1; })) // every dimension before axis has length 1
+            return;
+        // Compute axis before transpose to use for unsqueeze
+        auto perm    = ins->get_operator().to_value()["permutation"].to_vector<int64_t>();
+        auto preaxis = std::find(perm.begin(), perm.end(), axis) - perm.begin();
+        // Make unsqueeze
+        auto unsqueeze = m.insert_instruction(
+            ins, make_op("unsqueeze", {{"axes", {preaxis}}, {"steps", sdistance}}), ins->inputs());
+        // Make transpose
+        std::transform(perm.begin(), perm.end(), perm.begin(), [&](auto i) {
+            if(i > preaxis)
+                return i + 1; // entries greater than preaxis are bumped by one
+            return i;
+        });
+        perm.insert(perm.begin(), preaxis + 1); // this axis becomes axis 0, which the slices use below
+        auto transpose =
+            m.insert_instruction(ins, make_op("transpose", {{"permutation", perm}}), unsqueeze);
+        // Slice and squeeze
+        for(auto s : slices)
+        {
+            auto op        = any_cast<op::slice>(s->get_operator());
+            op.axes        = {0};
+            op.starts      = {op.starts.front() / sdistance.front()}; // rescale to units of distance
+            op.ends        = {op.ends.front() / sdistance.front()};
+            auto slice_ins = m.insert_instruction(ins, op, transpose);
+            auto squeeze =
+                m.insert_instruction(ins, make_op("squeeze", {{"axes", {0}}}), slice_ins);
+            m.replace_instruction(s, squeeze);
+        }
+    }
+};
+
 void simplify_reshapes::apply(module& m) const
 {
-    for(int i = 0; i < 2; i++)
+    for(int i = 0; i < 4; i++)
    {
        match::find_matches(m,
                            find_where_op{},
@@ -676,9 +802,11 @@ void simplify_reshapes::apply(module& m) const
                            find_reshaper{},
                            find_transpose{},
                            find_concat_transpose{},
+                            find_concat_multibroadcasts{},
                            find_nested_convert{},
                            find_nested_slice{},
                            find_nested_concat{},
+                            find_transpose_slice{},
                            find_slice_transpose{},
                            find_transpose_contiguous_reshaper_unary{});
        dead_code_elimination{}.apply(m);

--- a/src/targets/cpu/CMakeLists.txt
+++ b/src/targets/cpu/CMakeLists.txt
@@ -35,6 +35,7 @@ add_library(migraphx_cpu
    dnnl.cpp
    eltwise.cpp
    erf.cpp
+    fmod.cpp
    fuse_ops.cpp
    gather.cpp
    gemm.cpp
@@ -42,6 +43,7 @@ add_library(migraphx_cpu
    logsoftmax.cpp
    lowering.cpp
    lrn.cpp
+    mod.cpp
    preallocate.cpp
    pooling.cpp
    reduction.cpp

--- a/src/targets/cpu/binary.cpp
+++ b/src/targets/cpu/binary.cpp
@@ -49,7 +49,7 @@ struct dnnl_binary : dnnl_op<dnnl_binary, dnnl::binary>
        auto s0 = inputs.at(0);
        auto s1 = inputs.at(1);
        auto r  = s0;
-        if(s0 != s1 or !s0.packed())
+        if(s0 != s1 or not s0.packed())
        {
            r = shape{s0.type(), s0.lens()};
        }

--- a/src/target_assignments.cpp
+++ b/src/target_assignments.cpp
@@ -21,16 +21,16 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-
-#include <migraphx/target_assignments.hpp>
+#include <migraphx/config.hpp>
+#include <migraphx/cpu/pointwise.hpp>
+#include <migraphx/op/fmod.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
+namespace cpu {

-void target_assignments::add_assignment(instruction_ref ins, const std::string& target)
-{
-    assignments.emplace(ins, target);
-}
+template struct cpu_binary<op::fmod>;

+} // namespace cpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
--- a/src/targets/cpu/lowering.cpp
+++ b/src/targets/cpu/lowering.cpp
@@ -26,7 +26,6 @@
 #include <migraphx/instruction.hpp>
 #include <migraphx/dfor.hpp>
 #include <migraphx/op/identity.hpp>
-#include <migraphx/op/batch_norm_inference.hpp>
 #include <migraphx/op/convolution.hpp>
 #include <migraphx/op/deconvolution.hpp>
 #include <migraphx/op/quant_convolution.hpp>
@@ -43,6 +42,8 @@
 #include <migraphx/op/argmax.hpp>
 #include <migraphx/op/argmin.hpp>
 #include <migraphx/op/rnn_var_sl_last_output.hpp>
+#include <migraphx/op/mod.hpp>
+#include <migraphx/op/fmod.hpp>
 #include <migraphx/shape_for_each.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/par_dfor.hpp>
@@ -214,55 +215,6 @@ struct cpu_pad
 };
 MIGRAPHX_REGISTER_OP(cpu_pad)

-struct leaky_relu_op
-{
-    op::leaky_relu op;
-    std::string name() const { return "cpu::leaky_relu"; }
-    auto fcn() const
-    {
-        auto a = op.alpha;
-        return [a](auto x) { return x > 0 ? x : x * a; };
-    }
-};
-
-template <typename Op>
-struct cpu_unary2 : auto_register_op<cpu_unary2<Op>>
-{
-    cpu_unary2() = default;
-
-    template <class T>
-    cpu_unary2(T pop) : op(Op{std::move(pop)})
-    {
-    }
-
-    Op op;
-
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op.op, f);
-    }
-    std::string name() const { return op.name(); }
-    shape compute_shape(const std::vector<shape>& inputs) const
-    {
-        check_shapes{inputs, *this}.has(1);
-        const auto& s = inputs.at(0);
-        return {s.type(), s.lens()};
-    }
-
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
-    {
-        argument result{output_shape};
-        visit_all(result, args[0])([&](auto output, auto input) {
-            assert(input.get_shape().standard());
-            std::transform(input.begin(), input.end(), output.begin(), op.fcn());
-        });
-
-        return result;
-    }
-};
-template struct cpu_unary2<leaky_relu_op>;
-
 struct cpu_rnn_var_sl_last_output
 {
    op::rnn_var_sl_last_output op;

--- a/src/targets/gpu/include/migraphx/gpu/cos.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/cos.hpp
@@ -21,22 +21,16 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_COS_HPP
-#define MIGRAPHX_GUARD_RTGLIB_COS_HPP
-
-#include <migraphx/gpu/oper.hpp>
-#include <migraphx/gpu/device/cos.hpp>
+#include <migraphx/config.hpp>
+#include <migraphx/cpu/pointwise.hpp>
+#include <migraphx/op/mod.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
+namespace cpu {

-struct hip_cos : unary_device<hip_cos, device::cos>
-{
-};
+template struct cpu_binary<op::mod>;

-} // namespace gpu
+} // namespace cpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
-
-#endif
--- a/src/targets/cpu/target.cpp
+++ b/src/targets/cpu/target.cpp
@@ -37,12 +37,10 @@
 #include <migraphx/propagate_constant.hpp>
 #include <migraphx/register_target.hpp>
 #include <migraphx/replace_allocate.hpp>
-#include <migraphx/rewrite_batchnorm.hpp>
 #include <migraphx/rewrite_pooling.hpp>
 #include <migraphx/rewrite_quantization.hpp>
 #include <migraphx/rewrite_rnn.hpp>
 #include <migraphx/schedule.hpp>
-#include <migraphx/memory_coloring.hpp>
 #include <migraphx/simplify_algebra.hpp>
 #include <migraphx/simplify_qdq.hpp>
 #include <migraphx/simplify_reshapes.hpp>
@@ -78,8 +76,6 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
            eliminate_identity{},
            eliminate_pad{},
            dead_code_elimination{},
-            rewrite_batchnorm{},
-            dead_code_elimination{},
            rewrite_rnn{},
            dead_code_elimination{},
            eliminate_common_subexpression{},

--- a/src/targets/fpga/include/migraphx/fpga/target.hpp
+++ b/src/targets/fpga/include/migraphx/fpga/target.hpp
@@ -30,6 +30,7 @@
 #include <migraphx/compile_options.hpp>
 #include <migraphx/fpga/context.hpp>
 #include <migraphx/config.hpp>
+#include <migraphx/supported_segments.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -41,7 +42,7 @@ struct target
    std::string name() const;
    std::vector<pass> get_passes(migraphx::context& ctx, const compile_options&) const;
    migraphx::context get_context() const { return context{}; }
-    float is_supported(instruction_ref ins, support_metric m);
+    supported_segments find_supported(const_module_ref mod, support_metric m) const;

    argument copy_to(const argument& arg) const { return arg; }
    argument copy_from(const argument& arg) const { return arg; }