Commit eb0d8fee authored by Paul

Merge branch 'develop' into driver

parents 65ef35cd 0d796941
......@@ -2,10 +2,16 @@
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <migraphx/program.hpp>
#include <migraphx/quantization.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/cpu/target.hpp>
#include <migraphx/onnx.hpp>
#include <migraphx/stringutils.hpp>
#ifdef ENABLE_TF
#include <migraphx/tf.hpp>
#else
#include <migraphx/onnx.hpp>
#endif
#ifdef HAVE_GPU
#include <migraphx/gpu/target.hpp>
#include <migraphx/gpu/hip.hpp>
......@@ -28,6 +34,11 @@ struct throw_half
{
throw std::runtime_error("Half not supported in python yet.");
}
void operator()(migraphx::tensor_view<migraphx::half>) const
{
throw std::runtime_error("Half not supported in python yet.");
}
};
template <class F>
......@@ -42,6 +53,8 @@ struct skip_half
}
void operator()(migraphx::shape::as<migraphx::half>) const {}
void operator()(migraphx::tensor_view<migraphx::half>) const {}
};
template <class F>
......@@ -50,6 +63,12 @@ void visit_type(const migraphx::shape& s, F f)
s.visit_type(throw_half<F>{f});
}
template <class T, class F>
void visit(const migraphx::raw_data<T>& x, F f)
{
x.visit(throw_half<F>{f});
}
template <class F>
void visit_types(F f)
{
......@@ -60,6 +79,9 @@ template <class T>
py::buffer_info to_buffer_info(T& x)
{
migraphx::shape s = x.get_shape();
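// pybind11's buffer protocol reports strides in bytes, while migraphx::shape
// stores strides in elements, so scale each stride by the element size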
auto strides = s.strides();
std::transform(
strides.begin(), strides.end(), strides.begin(), [&](auto i) { return i * s.type_size(); });
py::buffer_info b;
visit_type(s, [&](auto as) {
b = py::buffer_info(x.data(),
......@@ -67,7 +89,7 @@ py::buffer_info to_buffer_info(T& x)
py::format_descriptor<decltype(as())>::format(),
s.lens().size(),
s.lens(),
s.strides());
strides);
});
return b;
}
......@@ -75,11 +97,20 @@ py::buffer_info to_buffer_info(T& x)
migraphx::shape to_shape(const py::buffer_info& info)
{
migraphx::shape::type_t t;
std::size_t n = 0;
visit_types([&](auto as) {
if(info.format == py::format_descriptor<decltype(as())>::format())
{
t = as.type_enum();
n = sizeof(as());
}
});
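// convert the byte strides reported by the buffer protocol back to element
// strides; n is the element size deduced from the buffer's format string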
auto strides = info.strides;
std::transform(strides.begin(), strides.end(), strides.begin(), [&](auto i) -> std::size_t {
return n > 0 ? i / n : 0;
});
return migraphx::shape{t, info.shape, info.strides};
return migraphx::shape{t, info.shape, strides};
}
PYBIND11_MODULE(migraphx, m)
......@@ -108,6 +139,13 @@ PYBIND11_MODULE(migraphx, m)
py::buffer_info info = b.request();
new(&x) migraphx::argument(to_shape(info), info.ptr);
})
.def("get_shape", &migraphx::argument::get_shape)
.def("tolist",
[](migraphx::argument& x) {
py::list l{x.get_shape().elements()};
visit(x, [&](auto data) { l = py::cast(data.to_vector()); });
return l;
})
.def("__eq__", std::equal_to<migraphx::argument>{})
.def("__ne__", std::not_equal_to<migraphx::argument>{})
.def("__repr__", [](const migraphx::argument& x) { return migraphx::to_string(x); });
......@@ -123,8 +161,16 @@ PYBIND11_MODULE(migraphx, m)
.def("__ne__", std::not_equal_to<migraphx::program>{})
.def("__repr__", [](const migraphx::program& p) { return migraphx::to_string(p); });
#ifdef ENABLE_TF
m.def("parse_tf",
&migraphx::parse_tf,
"Parse tf protobuf (default format is nhwc)",
py::arg("filename"),
py::arg("is_nhwc") = true);
#else
m.def("parse_onnx", &migraphx::parse_onnx);
#endif
m.def("get_target", [](const std::string& name) -> migraphx::target {
if(name == "cpu")
return migraphx::cpu::target{};
......@@ -136,6 +182,10 @@ PYBIND11_MODULE(migraphx, m)
});
m.def("generate_argument", &migraphx::generate_argument, py::arg("s"), py::arg("seed") = 0);
m.def("quantize", [](migraphx::program& p, std::vector<std::string>& ins_names) {
migraphx::quantize(p, ins_names);
});
m.def("quantize", [](migraphx::program& p) { migraphx::quantize(p, {"all"}); });
#ifdef HAVE_GPU
m.def("allocate_gpu", &migraphx::gpu::allocate_gpu, py::arg("s"), py::arg("host") = false);
......
#include <migraphx/quantization.hpp>
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/op/convert.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/ranges.hpp>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
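// Insert a convert instruction to `type` right after `ins`, memoizing the
// result in map_fp16 so that each instruction is converted at most once.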
instruction_ref insert_fp16(program& prog,
instruction_ref& ins,
shape::type_t type,
std::unordered_map<instruction_ref, instruction_ref>& map_fp16)
{
if(map_fp16.count(ins) > 0)
{
return map_fp16[ins];
}
assert(ins->get_shape().type() == shape::float_type ||
ins->get_shape().type() == shape::double_type);
instruction_ref ins_fp16{};
ins_fp16 = prog.insert_instruction(std::next(ins), op::convert{type}, ins);
map_fp16[ins] = ins_fp16;
return ins_fp16;
}
void quantize(program& prog, const std::vector<std::string>& ins_names)
{
std::unordered_map<instruction_ref, instruction_ref> map_fp16;
for(auto ins : iterator_for(prog))
{
// "all" indicates that every instruction is converted
if((not contains(ins_names, "all")) and (not contains(ins_names, ins->name())))
{
continue;
}
shape::type_t orig_type = ins->get_shape().type();
// process all inputs; if an input is fp32 or fp64, convert it
// to fp16 by adding a convert operator.
auto inputs = ins->inputs();
std::vector<instruction_ref> converted_inputs;
for(auto input : inputs)
{
auto s = input->get_shape();
if(s.type() == shape::float_type || s.type() == shape::double_type)
{
// if the input is a convert operator, use its input
// as the current input
instruction_ref input_fp16{};
if(input->name() == "convert")
{
input_fp16 = input->inputs().front();
}
else
{
input_fp16 = insert_fp16(prog, input, shape::half_type, map_fp16);
}
converted_inputs.push_back(input_fp16);
}
else
{
converted_inputs.push_back(input);
}
}
// no input was changed, so go to the next instruction
if(inputs == converted_inputs)
{
continue;
}
auto op = ins->get_operator();
auto ins_shape = compute_shape(op, converted_inputs);
if(ins_shape.type() != orig_type)
{
// insert another convert instruction to convert it back
if(ins == std::prev(prog.end()))
{
prog.add_instruction(op::convert{orig_type}, ins);
}
else
{
// check for the dead-code case to avoid an assert
bool output_empty = ins->outputs().empty();
auto ins_orig_type =
prog.insert_instruction(std::next(ins), op::convert{orig_type}, ins);
if(!output_empty)
{
prog.replace_instruction(ins, ins_orig_type);
}
}
}
prog.replace_instruction(ins, op, converted_inputs);
}
}
void quantize(program& prog) { quantize(prog, {"all"}); }
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -4,6 +4,7 @@
#include <migraphx/operators.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/op/common.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -16,11 +17,14 @@ void rewrite_rnn::apply(program& prog) const
{
apply_vanilla_rnn(prog, ins);
}
if(ins->name() == "gru")
else if(ins->name() == "gru")
{
apply_gru(prog, ins);
}
else if(ins->name() == "lstm")
{
apply_lstm(prog, ins);
}
}
}
......@@ -210,7 +214,7 @@ std::vector<instruction_ref> rewrite_rnn::vanilla_rnn_cell(bool is_forward,
auto wb = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sbias);
auto rb = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sbias);
auto b = prog.insert_instruction(ins, op::add{}, wb, rb);
bias = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, b);
bias = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape().lens()}, b);
}
instruction_ref hidden_out = prog.end();
......@@ -517,25 +521,26 @@ std::vector<instruction_ref> rewrite_rnn::gru_cell(bool is_forward,
instruction_ref brcst_bh{};
if(bias != prog.end())
{
auto sbias = prog.insert_instruction(ins, op::squeeze{{0}}, bias);
auto wbz = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sbias);
auto wbr = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sbias);
auto wbh = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sbias);
brcst_wbh = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, wbh);
auto broadcast_lens = sih->get_shape().lens();
auto sbias = prog.insert_instruction(ins, op::squeeze{{0}}, bias);
auto wbz = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sbias);
auto wbr = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sbias);
auto wbh = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sbias);
brcst_wbh = prog.insert_instruction(ins, op::broadcast{1, broadcast_lens}, wbh);
auto rbz = prog.insert_instruction(ins, op::slice{{0}, {3 * hs}, {4 * hs}}, sbias);
auto rbr = prog.insert_instruction(ins, op::slice{{0}, {4 * hs}, {5 * hs}}, sbias);
auto rbh = prog.insert_instruction(ins, op::slice{{0}, {5 * hs}, {6 * hs}}, sbias);
brcst_rbh = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, rbh);
brcst_rbh = prog.insert_instruction(ins, op::broadcast{1, broadcast_lens}, rbh);
auto bz = prog.insert_instruction(ins, op::add{}, wbz, rbz);
brcst_bz = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, bz);
brcst_bz = prog.insert_instruction(ins, op::broadcast{1, broadcast_lens}, bz);
auto br = prog.insert_instruction(ins, op::add{}, wbr, rbr);
brcst_br = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, br);
brcst_br = prog.insert_instruction(ins, op::broadcast{1, broadcast_lens}, br);
auto bh = prog.insert_instruction(ins, op::add{}, wbh, rbh);
brcst_bh = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, bh);
brcst_bh = prog.insert_instruction(ins, op::broadcast{1, broadcast_lens}, bh);
}
for(long i = 0; i < seq_len; i++)
......@@ -664,5 +669,514 @@ std::vector<operation> rewrite_rnn::gru_actv_funcs(instruction_ref ins) const
}
}
// for lstm operators
void rewrite_rnn::apply_lstm(program& prog, instruction_ref ins) const
{
assert(ins->name() == "lstm");
auto args = ins->inputs();
shape seq_shape = args[0]->get_shape();
std::size_t hidden_size = args[2]->get_shape().lens()[2];
std::size_t batch_size = seq_shape.lens()[1];
shape::type_t type = seq_shape.type();
migraphx::shape ihc_shape{type, {1, batch_size, hidden_size}};
std::vector<float> ihc_data(ihc_shape.elements(), 0.0);
migraphx::shape pph_shape{type, {1, 3 * hidden_size}};
std::vector<float> pph_data(pph_shape.elements(), 0.0);
auto actv_funcs = lstm_actv_funcs(ins);
auto lstm_op = any_cast<op::lstm>(ins->get_operator());
op::rnn_direction dirct = lstm_op.direction;
instruction_ref last_output{};
instruction_ref last_cell_output{};
if(dirct == op::rnn_direction::bidirectional)
{
// input weight matrix
auto w_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[1]);
auto w_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[1]);
// hidden state weight matrix
auto r_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[2]);
auto r_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[2]);
// process bias
instruction_ref bias_forward = prog.end();
instruction_ref bias_reverse = prog.end();
if(args.size() >= 4 && args[3]->name() != "undefined")
{
bias_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[3]);
bias_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[3]);
}
// process initial hidden state (the 6th argument)
instruction_ref ih_forward{};
instruction_ref ih_reverse{};
if(args.size() >= 6 && args[5]->name() != "undefined")
{
ih_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[5]);
ih_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[5]);
}
else
{
ih_forward = prog.add_literal(migraphx::literal{ihc_shape, ihc_data});
ih_reverse = prog.add_literal(migraphx::literal{ihc_shape, ihc_data});
}
// process initial cell value
instruction_ref ic_forward{};
instruction_ref ic_reverse{};
if(args.size() >= 7 && args[6]->name() != "undefined")
{
ic_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[6]);
ic_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[6]);
}
else
{
ic_forward = prog.add_literal(migraphx::literal{ihc_shape, ihc_data});
ic_reverse = prog.add_literal(migraphx::literal{ihc_shape, ihc_data});
}
// process weight of the peephole
instruction_ref pph_forward = prog.end();
instruction_ref pph_reverse = prog.end();
if(args.size() == 8 && args[7]->name() != "undefined")
{
pph_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[7]);
pph_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[7]);
}
auto ret_forward = lstm_cell(
true,
prog,
ins,
{args[0], w_forward, r_forward, bias_forward, ih_forward, ic_forward, pph_forward},
actv_funcs.at(0),
actv_funcs.at(1),
actv_funcs.at(2));
auto ret_reverse = lstm_cell(
false,
prog,
ins,
{args[0], w_reverse, r_reverse, bias_reverse, ih_reverse, ic_reverse, pph_reverse},
actv_funcs.at(3),
actv_funcs.at(4),
actv_funcs.at(5));
auto concat_output =
prog.insert_instruction(ins, op::concat{1}, ret_forward[1], ret_reverse[1]);
last_output = prog.insert_instruction(ins, op::squeeze{{0}}, concat_output);
// last cell output
last_cell_output =
prog.insert_instruction(ins, op::concat{0}, ret_forward[2], ret_reverse[2]);
// the following logic ensures the lstm is replaced by a concat instruction
if(ret_forward[0] == prog.end())
{
prog.replace_instruction(ins, op::concat{1}, ret_forward[1], ret_reverse[1]);
}
else
{
ret_forward[0] =
prog.insert_instruction(ins, op::concat{0}, ret_forward[0], ret_forward[1]);
ret_reverse[0] =
prog.insert_instruction(ins, op::concat{0}, ret_reverse[1], ret_reverse[0]);
prog.replace_instruction(ins, op::concat{1}, {ret_forward[0], ret_reverse[0]});
}
}
else
{
bool is_forward = (dirct == op::rnn_direction::forward);
// weight matrices
auto w = args[1];
auto r = args[2];
// bias
instruction_ref bias = prog.end();
if(args.size() >= 4 && args[3]->name() != "undefined")
{
bias = args[3];
}
// initial hidden state
instruction_ref ih{};
if(args.size() >= 6 && args[5]->name() != "undefined")
{
ih = args[5];
}
else
{
ih = prog.add_literal(migraphx::literal{ihc_shape, ihc_data});
}
// initial cell value
instruction_ref ic{};
if(args.size() >= 7 && args[6]->name() != "undefined")
{
ic = args[6];
}
else
{
ic = prog.add_literal(migraphx::literal{ihc_shape, ihc_data});
}
// process weight of the peephole
instruction_ref pph = prog.end();
if(args.size() == 8 && args[7]->name() != "undefined")
{
pph = args[7];
}
auto ret = lstm_cell(is_forward,
prog,
ins,
{args[0], w, r, bias, ih, ic, pph},
actv_funcs.at(0),
actv_funcs.at(1),
actv_funcs.at(2));
last_output = prog.insert_instruction(ins, op::squeeze{{0}}, ret[1]);
last_cell_output = ret[2];
if(ret[0] == prog.end())
{
prog.replace_instruction(ins, op::concat{0}, ret[1]);
}
else
{
auto concat_arg0 = is_forward ? ret[0] : ret[1];
auto concat_arg1 = is_forward ? ret[1] : ret[0];
prog.replace_instruction(ins, op::concat{0}, concat_arg0, concat_arg1);
}
}
// replace each corresponding rnn_last_output instruction with
// last_output, and each lstm_last_cell_output with last_cell_output.
// The while loops handle the case of multiple rnn_last_output and
// lstm_last_cell_output operators
auto last_output_it = ins->outputs().begin();
while(last_output_it != ins->outputs().end())
{
last_output_it = std::find_if(last_output_it, ins->outputs().end(), [](auto i) {
return i->name() == "rnn_last_output";
});
if(last_output_it != ins->outputs().end())
{
prog.replace_instruction(*last_output_it, last_output);
last_output_it++;
}
}
auto last_cell_output_it = ins->outputs().begin();
while(last_cell_output_it != ins->outputs().end())
{
last_cell_output_it = std::find_if(last_cell_output_it, ins->outputs().end(), [](auto i) {
return i->name() == "lstm_last_cell_output";
});
if(last_cell_output_it != ins->outputs().end())
{
prog.replace_instruction(*last_cell_output_it, last_cell_output);
last_cell_output_it++;
}
}
}
std::vector<instruction_ref> rewrite_rnn::lstm_cell(bool is_forward,
program& prog,
instruction_ref ins,
std::vector<instruction_ref> inputs,
const operation& actv_func1,
const operation& actv_func2,
const operation& actv_func3) const
{
// must have 7 args in the input vector
assert(inputs.size() == 7);
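// inputs are {seq, w, r, bias, ih, ic, pph}: the input sequence, input
// weights, recurrence weights, bias, initial hidden state, initial cell
// state, and peephole weights (bias and pph may be prog.end() when absent)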
auto seq = inputs.at(0);
auto w = inputs.at(1);
auto r = inputs.at(2);
auto bias = inputs.at(3);
auto ih = inputs.at(4);
auto ic = inputs.at(5);
auto pph = inputs.at(6);
instruction_ref hidden_states = prog.end();
instruction_ref last_output{};
instruction_ref last_cell_output{};
migraphx::shape seq_shape = seq->get_shape();
migraphx::shape r_shape = r->get_shape();
long seq_len = static_cast<long>(seq_shape.lens()[0]);
long hs = static_cast<long>(r_shape.lens()[2]);
std::vector<int64_t> perm{1, 0};
// w matrix
auto sw = prog.insert_instruction(ins, op::squeeze{{0}}, w);
auto wi = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sw);
auto tran_wi = prog.insert_instruction(ins, op::transpose{perm}, wi);
auto wo = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sw);
auto tran_wo = prog.insert_instruction(ins, op::transpose{perm}, wo);
auto wf = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sw);
auto tran_wf = prog.insert_instruction(ins, op::transpose{perm}, wf);
auto wc = prog.insert_instruction(ins, op::slice{{0}, {3 * hs}, {4 * hs}}, sw);
auto tran_wc = prog.insert_instruction(ins, op::transpose{perm}, wc);
// r matrix
auto sr = prog.insert_instruction(ins, op::squeeze{{0}}, r);
auto ri = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sr);
auto tran_ri = prog.insert_instruction(ins, op::transpose{perm}, ri);
auto ro = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sr);
auto tran_ro = prog.insert_instruction(ins, op::transpose{perm}, ro);
auto rf = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sr);
auto tran_rf = prog.insert_instruction(ins, op::transpose{perm}, rf);
auto rc = prog.insert_instruction(ins, op::slice{{0}, {3 * hs}, {4 * hs}}, sr);
auto tran_rc = prog.insert_instruction(ins, op::transpose{perm}, rc);
// initial hidden state
auto sih = prog.insert_instruction(ins, op::squeeze{{0}}, ih);
// initial cell state
auto sic = prog.insert_instruction(ins, op::squeeze{{0}}, ic);
auto ic_lens = sic->get_shape().lens();
// bias
instruction_ref bi_brcst{};
instruction_ref bo_brcst{};
instruction_ref bf_brcst{};
instruction_ref bc_brcst{};
if(bias != prog.end())
{
auto sbias = prog.insert_instruction(ins, op::squeeze{{0}}, bias);
auto bxi = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sbias);
auto bhi = prog.insert_instruction(ins, op::slice{{0}, {4 * hs}, {5 * hs}}, sbias);
auto bi = prog.insert_instruction(ins, op::add{}, bxi, bhi);
bi_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, bi);
auto bxo = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sbias);
auto bho = prog.insert_instruction(ins, op::slice{{0}, {5 * hs}, {6 * hs}}, sbias);
auto bo = prog.insert_instruction(ins, op::add{}, bxo, bho);
bo_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, bo);
auto bxf = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sbias);
auto bhf = prog.insert_instruction(ins, op::slice{{0}, {6 * hs}, {7 * hs}}, sbias);
auto bf = prog.insert_instruction(ins, op::add{}, bxf, bhf);
bf_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, bf);
auto bxc = prog.insert_instruction(ins, op::slice{{0}, {3 * hs}, {4 * hs}}, sbias);
auto bhc = prog.insert_instruction(ins, op::slice{{0}, {7 * hs}, {8 * hs}}, sbias);
auto bc = prog.insert_instruction(ins, op::add{}, bxc, bhc);
bc_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, bc);
}
// peephole weights
instruction_ref pphi_brcst{};
instruction_ref ppho_brcst{};
instruction_ref pphf_brcst{};
if(pph != prog.end())
{
auto spph = prog.insert_instruction(ins, op::squeeze{{0}}, pph);
auto pphi = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, spph);
pphi_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, pphi);
auto ppho = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, spph);
ppho_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, ppho);
auto pphf = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, spph);
pphf_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, pphf);
}
for(long i = 0; i < seq_len; ++i)
{
long seq_index = is_forward ? i : (seq_len - 1 - i);
auto xt = prog.insert_instruction(ins, op::slice{{0}, {seq_index}, {seq_index + 1}}, seq);
xt = prog.insert_instruction(ins, op::squeeze{{0}}, xt);
// equation it = f(Xt*(Wi^T) + Ht-1*(Ri^T) + Pi (.) Ct-1 + Wbi + Rbi)
auto xt_wi = prog.insert_instruction(ins, op::dot{}, xt, tran_wi);
auto ht_ri = prog.insert_instruction(ins, op::dot{}, sih, tran_ri);
auto it_before_actv = prog.insert_instruction(ins, op::add{}, xt_wi, ht_ri);
if(pph != prog.end())
{
auto pphi_ct = prog.insert_instruction(ins, op::mul{}, pphi_brcst, sic);
it_before_actv = prog.insert_instruction(ins, op::add{}, it_before_actv, pphi_ct);
}
if(bias != prog.end())
{
it_before_actv = prog.insert_instruction(ins, op::add{}, it_before_actv, bi_brcst);
}
auto it = prog.insert_instruction(ins, actv_func1, it_before_actv);
// equation ft = f(Xt*(Wf^T) + Ht-1*(Rf^T) + Pf (.) Ct-1 + Wbf + Rbf)
auto xt_wf = prog.insert_instruction(ins, op::dot{}, xt, tran_wf);
auto ht_rf = prog.insert_instruction(ins, op::dot{}, sih, tran_rf);
auto ft_before_actv = prog.insert_instruction(ins, op::add{}, xt_wf, ht_rf);
if(pph != prog.end())
{
auto pphf_ct = prog.insert_instruction(ins, op::mul{}, pphf_brcst, sic);
ft_before_actv = prog.insert_instruction(ins, op::add{}, ft_before_actv, pphf_ct);
}
if(bias != prog.end())
{
ft_before_actv = prog.insert_instruction(ins, op::add{}, ft_before_actv, bf_brcst);
}
auto ft = prog.insert_instruction(ins, actv_func1, ft_before_actv);
// equation ct = g(Xt*(Wc^T) + Ht-1*(Rc^T) + Wbc + Rbc)
auto xt_wc = prog.insert_instruction(ins, op::dot{}, xt, tran_wc);
auto ht_rc = prog.insert_instruction(ins, op::dot{}, sih, tran_rc);
auto ct_before_actv = prog.insert_instruction(ins, op::add{}, xt_wc, ht_rc);
if(bias != prog.end())
{
ct_before_actv = prog.insert_instruction(ins, op::add{}, ct_before_actv, bc_brcst);
}
auto ct = prog.insert_instruction(ins, actv_func2, ct_before_actv);
// equation Ct = ft (.) Ct-1 + it (.) ct
auto ft_cell = prog.insert_instruction(ins, op::mul{}, ft, sic);
auto it_ct = prog.insert_instruction(ins, op::mul{}, it, ct);
auto cellt = prog.insert_instruction(ins, op::add{}, ft_cell, it_ct);
last_cell_output = cellt;
// ot = f(Xt*(Wo^T) + Ht-1*(Ro^T) + Po (.) Ct + Wbo + Rbo)
auto xt_wo = prog.insert_instruction(ins, op::dot{}, xt, tran_wo);
auto ht_ro = prog.insert_instruction(ins, op::dot{}, sih, tran_ro);
auto ot_before_actv = prog.insert_instruction(ins, op::add{}, xt_wo, ht_ro);
if(pph != prog.end())
{
auto ppho_cellt = prog.insert_instruction(ins, op::mul{}, ppho_brcst, cellt);
ot_before_actv = prog.insert_instruction(ins, op::add{}, ot_before_actv, ppho_cellt);
}
if(bias != prog.end())
{
ot_before_actv = prog.insert_instruction(ins, op::add{}, ot_before_actv, bo_brcst);
}
auto ot = prog.insert_instruction(ins, actv_func1, ot_before_actv);
// Ht = ot (.) h(Ct)
auto h_cellt = prog.insert_instruction(ins, actv_func3, cellt);
auto ht = prog.insert_instruction(ins, op::mul{}, ot, h_cellt);
sic = cellt;
sih = ht;
last_output = prog.insert_instruction(ins, op::unsqueeze{{0, 1}}, ht);
if(i < seq_len - 1)
{
if(i == 0)
{
hidden_states = last_output;
}
else
{
auto concat_arg0 = is_forward ? hidden_states : last_output;
auto concat_arg1 = is_forward ? last_output : hidden_states;
hidden_states =
prog.insert_instruction(ins, op::concat{0}, concat_arg0, concat_arg1);
}
}
}
last_cell_output = prog.insert_instruction(ins, op::unsqueeze{{0}}, last_cell_output);
return {hidden_states, last_output, last_cell_output};
}
std::vector<operation> rewrite_rnn::lstm_actv_funcs(instruction_ref ins) const
{
auto lstm_op = any_cast<op::lstm>(ins->get_operator());
// before rewriting the lstm operator, we need to ensure
// there are 6 actv funcs, even if the user does not
// specify any. If fewer than 6 are given, use the
// algorithm in parse_lstm to fill in 6 actv functions
const auto& actv_funcs = lstm_op.actv_funcs;
std::size_t num_actv_funcs = actv_funcs.size();
if(lstm_op.direction == op::rnn_direction::bidirectional)
{
switch(num_actv_funcs)
{
case 0:
return {op::sigmoid{}, op::tanh{}, op::tanh{}, op::sigmoid{}, op::tanh{}, op::tanh{}};
case 1:
return {actv_funcs.at(0),
actv_funcs.at(0),
actv_funcs.at(0),
actv_funcs.at(0),
actv_funcs.at(0),
actv_funcs.at(0)};
case 2:
return {actv_funcs.at(0),
actv_funcs.at(1),
actv_funcs.at(1),
actv_funcs.at(0),
actv_funcs.at(1),
actv_funcs.at(1)};
case 3:
return {actv_funcs.at(0),
actv_funcs.at(1),
actv_funcs.at(2),
actv_funcs.at(0),
actv_funcs.at(1),
actv_funcs.at(2)};
case 4:
return {actv_funcs.at(0),
actv_funcs.at(1),
actv_funcs.at(2),
actv_funcs.at(3),
actv_funcs.at(3),
actv_funcs.at(3)};
case 5:
return {actv_funcs.at(0),
actv_funcs.at(1),
actv_funcs.at(2),
actv_funcs.at(3),
actv_funcs.at(4),
actv_funcs.at(4)};
default: return actv_funcs;
}
}
else
{
switch(num_actv_funcs)
{
case 0: return {op::sigmoid{}, op::tanh{}, op::tanh{}};
case 1: return {actv_funcs.at(0), actv_funcs.at(0), actv_funcs.at(0)};
case 2: return {actv_funcs.at(0), actv_funcs.at(1), actv_funcs.at(1)};
default: return actv_funcs;
}
}
}
namespace op {
std::ostream& operator<<(std::ostream& os, rnn_direction v)
{
std::vector<std::string> rnn_direction_str = {"forward", "reverse", "bidirectional"};
os << rnn_direction_str[static_cast<std::underlying_type<rnn_direction>::type>(v)];
return os;
}
} // namespace op
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/schedule.hpp>
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/op/identity.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/functional.hpp>
#include <migraphx/ranges.hpp>
#include <unordered_map>
#include <unordered_set>
#include <set>
#include <deque>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
auto get_inputs()
{
return [](auto i) { return i->inputs(); };
}
auto get_outputs()
{
return [](auto i) { return i->outputs(); };
}
struct stream_info
{
std::unordered_map<instruction_ref, std::size_t> ins2stream;
std::unordered_map<instruction_ref, std::size_t> weights;
std::unordered_map<instruction_ref, std::size_t> iweights;
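// Accumulate, for every instruction, its own weight (iweights) plus the
// weights of all of its transitive inputs; the fix combinator memoizes
// results in `weights` so each instruction is computed only once.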
void accumulate_weights(instruction_ref last, const schedule_model& model)
{
fix<std::size_t>([&](auto self, auto ins) -> std::size_t {
if(not contains(weights, ins))
{
std::size_t weight = 0;
auto&& op = ins->get_operator();
if(not is_context_free(op) and op.name()[0] != '@')
weight = model.weight(op);
iweights[ins] = weight;
weights[ins] =
std::accumulate(ins->inputs().begin(),
ins->inputs().end(),
weight,
[&](std::size_t w, instruction_ref i) { return w + self(i); });
}
return weights[ins];
})(last);
}
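// Sort the arguments by descending weight (ties broken by number of inputs)
// and return an iterator to the first argument whose accumulated weight is at
// or below the partition threshold; the heavier arguments before it are
// candidates for their own partitions.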
std::vector<instruction_ref>::iterator sort_args(std::vector<instruction_ref>& args)
{
if(args.size() < 2)
{
return args.end();
}
const std::size_t min_partition_threshold = 2;
auto compare = by(std::greater<>{}, [&](auto x) {
return std::make_tuple(this->weights[x], x->inputs().size());
});
std::sort(args.begin(), args.end(), compare);
auto it = std::lower_bound(std::next(args.begin()),
args.end(),
min_partition_threshold,
[&](auto i, std::size_t w) { return this->weights[i] > w; });
assert(it == args.end() or this->weights[*it] <= min_partition_threshold);
assert(it == args.end() or std::prev(it) == args.begin() or
this->weights[*std::prev(it)] > min_partition_threshold);
return it;
}
struct partition
{
std::size_t weight = 0;
std::vector<instruction_ref> instructions{};
void add(instruction_ref ins, std::size_t w)
{
weight += w;
instructions.push_back(ins);
}
};
void assign_streams(program& p, std::size_t n)
{
partition critical;
std::unordered_map<instruction_ref, std::deque<partition>> partitions;
partitions.reserve(weights.size());
fix([&](auto self, auto ins, auto& part) {
assert(ins != p.end());
if(contains(partitions, ins))
return;
assert(p.has_instruction(ins));
// Add an entry so we know the instruction was visited
partitions[ins];
part.add(ins, this->iweights[ins]);
auto args = ins->inputs();
auto threshold_it = this->sort_args(args);
if(not args.empty())
{
assert(threshold_it != args.begin());
self(args.front(), part);
for(auto i : range(std::next(args.begin()), threshold_it))
{
partitions[ins].emplace_back();
self(i, partitions[ins].back());
}
for(auto i : range(threshold_it, args.end()))
{
self(i, part);
}
}
// Sort instructions
p.move_instruction(ins, p.end());
})(std::prev(p.end()), critical);
// Set the critical partition to stream 0
set_stream(critical, 0);
std::vector<std::size_t> streams(n - 1);
// Assign streams for the other partitions
for(auto&& ins_part : partitions)
{
std::sort(
ins_part.second.begin(), ins_part.second.end(), by(std::greater<>{}, [](auto&& x) {
return std::make_tuple(x.weight, x.instructions.size());
}));
for(auto&& part : ins_part.second)
{
auto stream = std::min_element(streams.begin(), streams.end()) - streams.begin();
set_stream(part, stream + 1);
streams[stream] += part.weight;
}
}
}
void set_stream(const partition& p, std::size_t n)
{
for(auto ins : p.instructions)
if(iweights[ins] > 0)
set_stream(ins, n);
}
void set_stream(instruction_ref ins, std::size_t n)
{
assert(iweights[ins] > 0);
ins2stream[ins] = n;
}
std::size_t get_stream(instruction_ref ins) const { return ins2stream.at(ins); }
bool has_stream(instruction_ref ins) const { return contains(ins2stream, ins); }
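// different(f) returns true if the stream ids produced by f are not all the
// same, i.e. more than one stream is involved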
template <class F>
bool different(F f, std::size_t stream) const
{
bool result = false;
f([&](auto s) {
if(s != stream)
{
result = true;
return false;
}
// cppcheck-suppress uselessAssignmentArg
stream = s;
return true;
});
return result;
}
template <class F>
bool different(F f) const
{
bool result = false;
f([&](auto s) {
result = this->different(f, s);
return false;
});
return result;
}
template <class Selector>
auto get_streams_from(instruction_ref start, Selector select) const
{
return [=](auto f) {
return fix<bool>([&](auto self, auto ins) {
for(auto i : select(ins))
{
if(iweights.at(i) == 0)
{
if(not self(i))
return false;
}
else
{
if(not f(this->get_stream(i)))
return false;
}
}
return true;
})(start);
};
}
std::unordered_set<std::size_t> get_streams(instruction_ref ins) const
{
if(has_stream(ins))
return {get_stream(ins)};
std::unordered_set<std::size_t> result;
get_streams_from(ins, get_inputs())([&](auto s) {
result.insert(s);
return true;
});
return result;
}
template <class... Ts>
bool is_merge_point(instruction_ref ins, Ts... xs) const
{
return different(get_streams_from(ins, get_inputs()), xs...);
}
template <class... Ts>
bool is_split_point(instruction_ref ins, Ts... xs) const
{
return different(get_streams_from(ins, get_outputs()), xs...);
}
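// For each input stream, keep the instruction on that stream that executes
// last (smallest distance to `start`); these are the instructions whose
// events must be recorded before `start` can run.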
std::vector<instruction_ref> get_recorded_instructions(instruction_ref start)
{
std::vector<instruction_ref> result;
std::unordered_map<std::size_t, instruction_ref> m;
fix([&](auto self, auto ins) {
for(auto i : ins->inputs())
{
if(iweights.at(i) == 0)
{
self(i);
continue;
}
auto stream = this->get_stream(i);
if(not contains(m, stream))
m[stream] = i;
else
m[stream] = std::min(m[stream], i, by(std::less<>{}, [&](auto x) {
return std::distance(x, start);
}));
}
})(start);
std::transform(
m.begin(), m.end(), std::back_inserter(result), [](auto&& p) { return p.second; });
return result;
}
std::unordered_map<instruction_ref, std::vector<std::vector<instruction_ref>>>
find_concurrent_instructions(program& p)
{
std::unordered_map<instruction_ref, std::vector<std::vector<instruction_ref>>> result;
std::unordered_map<instruction_ref, std::unordered_set<instruction_ref>> merge_from;
result.reserve(p.size());
merge_from.reserve(p.size());
for(auto ins : reverse_iterator_for(p))
{
for(auto&& arg : ins->outputs())
{
if(is_merge_point(arg))
merge_from[ins].insert(arg);
merge_from[ins].insert(merge_from[arg].begin(), merge_from[arg].end());
}
auto streams = this->get_streams(ins);
// Collect concurrent instructions for each merge point.
for(auto& merge : merge_from[ins])
{
for(auto stream : streams)
{
if(result[merge].size() <= stream)
result[merge].resize(stream + 1);
auto&& r = result[merge][stream];
r.push_back(ins);
// Copy inputs if they don't have a stream (and are not a builtin and not
// context-free). Inputs without a stream can have an implicit dependency
std::copy_if(ins->inputs().begin(),
ins->inputs().end(),
std::back_inserter(r),
[&](auto x) {
return not this->has_stream(x) and
not is_context_free(x->get_operator()) and
x->name().front() != '@';
});
}
}
}
return result;
}
std::unordered_map<instruction_ref, std::unordered_set<instruction_ref>>
get_conflicts(program& p)
{
std::unordered_map<instruction_ref, std::unordered_set<instruction_ref>> conflict_table;
auto concur_ins = this->find_concurrent_instructions(p);
for(auto&& merge : concur_ins)
{
dfor(merge.second.size(), merge.second.size())([&](auto i, auto j) {
if(i == j)
return;
for(auto ins1 : merge.second[i])
{
auto p1 = std::distance(ins1, merge.first);
for(auto ins2 : merge.second[j])
{
if(ins1 == ins2)
continue;
auto p2 = std::distance(ins2, merge.first);
// The smaller distance means the instruction occurs later
if(p1 > p2)
conflict_table[ins2].insert(ins1);
else
conflict_table[ins1].insert(ins2);
}
}
});
}
// Remove duplicates
for(auto&& ip : conflict_table)
{
auto ins1 = ip.first;
for(auto ins2 : ip.second)
if(contains(conflict_table[ins2], ins1))
conflict_table[ins2].erase(ins1);
}
return conflict_table;
}
};
void schedule::apply(program& p) const
{
if(not enable)
return;
stream_info si;
auto last = std::prev(p.end());
si.accumulate_weights(last, model);
si.assign_streams(p, model.concurrency());
if(enabled(MIGRAPHX_TRACE_COMPILE{}))
{
p.annotate(std::cout, [&](auto ins) {
std::cout << ":";
std::cout << " weight=" << si.weights.at(ins);
std::cout << " input={";
si.get_streams_from(ins, get_inputs())([&](auto s) {
std::cout << s << ",";
return true;
});
std::cout << "}";
if(si.has_stream(ins))
std::cout << " stream=" << si.get_stream(ins);
});
std::cout << std::endl;
}
// Schedule instructions
std::size_t wait_id = 0;
std::unordered_map<instruction_ref, std::size_t> ins2wait;
std::unordered_map<std::size_t, std::unordered_set<std::size_t>> waited_for;
std::unordered_map<instruction_ref, std::unordered_set<std::size_t>> ins2waited;
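// ins2wait maps a recorded instruction to its event id; waited_for tracks,
// per stream, which events have already been waited on; ins2waited stores
// the events waited on before a split point so later merge points can skip
// redundant waits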
ins2wait.reserve(p.size());
ins2waited.reserve(p.size());
for(auto ins : iterator_for(p))
{
// Only schedule instructions that have a stream
if(not si.has_stream(ins))
continue;
assert(si.weights[ins] > 0);
// Schedule instruction on the stream
auto stream = si.get_stream(ins);
assert(stream < model.concurrency());
model.sched(p, ins, stream);
// Insert wait instructions
if(si.is_merge_point(ins, stream))
{
for(auto i : si.get_recorded_instructions(ins))
{
if(not si.has_stream(i))
continue;
auto istream = si.get_stream(i);
if(stream == istream)
continue;
// Create a new event if it hasn't been recorded
if(not contains(ins2wait, i))
{
ins2wait[i] = wait_id;
model.record(p, i, wait_id);
wait_id++;
}
auto w = ins2wait.at(i);
// If we have already waited for the event on this stream, then don't
// insert another wait event
if(not contains(waited_for[stream], w))
model.wait(p, ins, w);
// Store the event as waited
waited_for[stream].insert(w);
// Store all wait events that have been waited on prior to the recorded instruction
waited_for[stream].insert(ins2waited[i].begin(), ins2waited[i].end());
}
}
// Store wait events that have already been waited on
if(si.is_split_point(ins, stream))
{
ins2waited[ins] = waited_for[stream];
}
}
// Add memory conflicts
auto conflict_table = si.get_conflicts(p);
for(auto&& ip : conflict_table)
{
if(ip.second.empty())
continue;
std::vector<instruction_ref> args;
args.push_back(ip.first);
args.insert(args.end(), ip.second.begin(), ip.second.end());
p.insert_instruction(std::next(ip.first), op::identity{}, args);
}
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -19,7 +19,7 @@ struct shape_impl
shape_impl() : m_type(shape::float_type), m_standard(false) {}
shape_impl(shape::type_t t) : m_type(t), m_lens({1}), m_strides({1}), m_standard(true) {}
shape_impl(shape::type_t t) : m_type(t), m_lens({1}), m_strides({0}), m_standard(true) {}
shape_impl(shape::type_t t, std::vector<std::size_t> l)
: m_type(t), m_lens(std::move(l)), m_standard(true)
{
......
#include <migraphx/simplify_algebra.hpp>
#include <migraphx/program.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/op/add.hpp>
#include <migraphx/matcher.hpp>
#include <migraphx/literal.hpp>
......
#include <migraphx/simplify_reshapes.hpp>
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/op/as_shape.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/ranges.hpp>
#include <unordered_set>
......@@ -14,7 +14,9 @@ bool is_reshaper(instruction_ref ins)
// clang-format off
static const std::unordered_set<std::string> names = {
"reshape",
"contiguous"
"contiguous",
"squeeze",
"unsqueeze"
};
// clang-format on
return contains(names, ins->name());
......@@ -45,6 +47,9 @@ void simplify_reshapes::apply(program& p) const
auto end = std::prev(p.end());
for(auto ins : iterator_for(p))
{
if(ins == end and ins->name() == "contiguous")
continue;
// Skip possible dead instructions
if(ins->outputs().empty() and ins != end)
continue;
if(is_reshaper(ins))
......@@ -94,13 +99,6 @@ void simplify_reshapes::apply(program& p) const
p.replace_instruction(ins, t->inputs().front());
}
}
// Replace all reshapes with as_shape
for(auto ins : iterator_for(p))
{
if(ins->name() != "reshape")
continue;
p.replace_instruction(ins, op::as_shape{ins->get_shape()}, ins->inputs());
}
}
} // namespace MIGRAPHX_INLINE_NS
......
#include <migraphx/cpu/gemm.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/requires.hpp>
#include <migraphx/shape_for_each.hpp>
#include <blaze/math/CustomMatrix.h>
namespace migraphx {
......@@ -14,10 +15,13 @@ template <class T>
static auto make_mat(tensor_view<T> x)
{
const auto& s = x.get_shape();
assert(s.lens().size() == 2);
// assert(s.lens().size() == 2);
std::size_t n_dims = s.lens().size();
std::size_t dim_0 = n_dims - 2;
std::size_t dim_1 = n_dims - 1;
if(s.transposed())
return matrix<T>{x.data(), s.lens()[1], s.lens()[0], s.strides()[1]};
return matrix<T>{x.data(), s.lens()[0], s.lens()[1], s.strides()[0]};
return matrix<T>{x.data(), s.lens()[dim_1], s.lens()[dim_0], s.strides()[dim_1]};
return matrix<T>{x.data(), s.lens()[dim_0], s.lens()[dim_1], s.strides()[dim_0]};
}
template <class T, class F>
......@@ -51,7 +55,13 @@ void migemm_impl(tensor_view<T> cmat,
visit_mat(amat, [&](const auto& a) {
visit_mat(bmat, [&](const auto& b) {
auto c = make_mat(cmat);
c = (a * b) * alpha + beta * c;
c = beta * c;
// This is a simple optimization to avoid
// computing A * B if alpha is 0.0
if(alpha != 0.0)
{
c = c + alpha * a * b;
}
});
});
}
......@@ -64,18 +74,24 @@ void migemm_impl(tensor_view<T> cmat,
float beta,
std::false_type)
{
auto m = cmat.get_shape().lens()[0];
auto n = cmat.get_shape().lens()[1];
auto k = amat.get_shape().lens()[1];
std::size_t n_dims = cmat.get_shape().lens().size();
std::size_t dim_0 = n_dims - 2;
std::size_t dim_1 = n_dims - 1;
auto k = amat.get_shape().lens()[dim_1];
assert(amat.get_shape().lens()[1] == bmat.get_shape().lens()[0]);
assert(m == amat.get_shape().lens()[0]);
assert(n == bmat.get_shape().lens()[1]);
assert(amat.get_shape().lens()[dim_1] == bmat.get_shape().lens()[dim_0]);
assert(cmat.get_shape().lens()[dim_0] == amat.get_shape().lens()[dim_0]);
assert(cmat.get_shape().lens()[dim_1] == bmat.get_shape().lens()[dim_1]);
dfor(m, n)([&](auto ii, auto jj) {
double s = cmat(ii, jj) * beta;
dfor(k)([&](auto kk) { s += amat(ii, kk) * bmat(kk, jj); });
cmat(ii, jj) = alpha * s;
shape_for_each(cmat.get_shape(), [&](const auto& c_idx) {
auto a_idx = c_idx;
auto b_idx = c_idx;
double s = 0.0;
dfor(k)([&](auto kk) {
a_idx[dim_1] = b_idx[dim_0] = kk;
s += amat(a_idx.begin(), a_idx.end()) * bmat(b_idx.begin(), b_idx.end());
});
cmat(c_idx.begin(), c_idx.end()) = alpha * s + cmat(c_idx.begin(), c_idx.end()) * beta;
});
}
......@@ -83,7 +99,18 @@ template <class T>
void migemm_impl(
tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, float alpha, float beta)
{
migemm_impl(cmat, amat, bmat, alpha, beta, is_fast_gemm_type<T>{});
auto lens = amat.get_shape().lens();
bool batch_mul =
std::accumulate(
lens.rbegin() + 2, lens.rend(), std::size_t{1}, std::multiplies<std::size_t>()) == 1;
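// when all leading (batch) dimensions are 1 this is a plain 2-D gemm, so the
// fast path can be used; otherwise fall back to the generic implementation
// that iterates over the batched shape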
if(batch_mul)
{
migemm_impl(cmat, amat, bmat, alpha, beta, is_fast_gemm_type<T>{});
}
else
{
migemm_impl(cmat, amat, bmat, alpha, beta, std::false_type{});
}
}
void migemm(
......
......@@ -7,6 +7,7 @@
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct pass;
namespace cpu {
struct target
......
......@@ -48,6 +48,12 @@ struct cpu_batch_norm_inference
{
op::batch_norm_inference op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::batch_norm_inference"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
......@@ -75,10 +81,10 @@ struct cpu_batch_norm_inference
par_dfor(num_batch, num_channels, image_height, image_width)(
[&](std::size_t n, std::size_t c, std::size_t h, std::size_t w) {
assert((variance(c) + epsilon) > 0);
result(n, c, h, w) = gamma(c) * (buffer(n, c, h, w) - mean(c)) /
std::sqrt(variance(c) + epsilon) +
bias(c);
assert((variance[c] + epsilon) > 0);
result(n, c, h, w) = gamma[c] * (buffer(n, c, h, w) - mean[c]) /
std::sqrt(variance[c] + epsilon) +
bias[c];
});
});
}
......@@ -107,6 +113,12 @@ struct cpu_lrn
{
op::lrn op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::lrn"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
......@@ -117,7 +129,7 @@ struct cpu_lrn
int channels = output_shape.lens()[1];
int height = output_shape.lens()[2];
int width = output_shape.lens()[3];
float alphaoverarea = op.alpha / op.size;
float alphaoverarea = op.alpha / float(op.size);
int radius = (op.size - 1) / 2;
par_dfor(n_batch, height, width)([&](int b, int h, int w) {
......@@ -144,6 +156,12 @@ struct cpu_convolution
{
op::convolution op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::convolution"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
......@@ -165,15 +183,15 @@ struct cpu_convolution
output_shape.lens()[2],
output_shape.lens()[3])(
[&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
const int start_x = i * op.stride[0] - op.padding[0];
const int start_y = j * op.stride[1] - op.padding[1];
const int group_id = w / (wei_n / op.group);
const auto start_x = i * op.stride[0] - op.padding[0];
const auto start_y = j * op.stride[1] - op.padding[1];
const auto group_id = w / (wei_n / op.group);
double acc = 0;
dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
const int in_x = start_x + x;
const int in_y = start_y + y;
const int in_ch = group_id * wei_c + k;
const auto in_x = start_x + x;
const auto in_y = start_y + y;
const auto in_ch = group_id * wei_c + k;
if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
{
acc += input(o, in_ch, in_x, in_y) * weights(w, k, x, y);
......@@ -190,6 +208,12 @@ struct cpu_im2col
{
op::im2col op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
static std::string name() { return "cpu::im2col"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
......@@ -209,10 +233,8 @@ struct cpu_im2col
const std::size_t& stride_h = op.stride[0];
const std::size_t& stride_w = op.stride[1];
int kdiv2_h;
int kdiv2_w;
kdiv2_h = kernel_h / 2;
kdiv2_w = kernel_w / 2;
auto kdiv2_h = kernel_h / 2;
auto kdiv2_w = kernel_w / 2;
// calculate output sizes
const std::size_t col_height = (height - kernel_h + 2 * pad_h) / stride_h + 1;
const std::size_t col_width = (width - kernel_w + 2 * pad_w) / stride_w + 1;
......@@ -230,8 +252,8 @@ struct cpu_im2col
dfor(channels,
kernel_h,
kernel_w)([&](std::size_t c, std::size_t koffset, std::size_t loffset) {
int idx = iinput + koffset - kdiv2_h;
int jdx = jinput + loffset - kdiv2_w;
auto idx = iinput + koffset - kdiv2_h;
auto jdx = jinput + loffset - kdiv2_w;
col(ldx, p) = ((idx >= 0) && (idx < height) && (jdx >= 0) && (jdx < width))
? input(0, c, idx, jdx)
: 0;
......@@ -273,6 +295,12 @@ struct cpu_pooling
{
op::pooling op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::pooling_" + Op::name(); }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
......@@ -317,20 +345,35 @@ struct cpu_pooling
}
};
struct cpu_contiguous
{
    op::contiguous op;
    std::string name() const { return "cpu::contiguous"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        return op.compute(output_shape, std::move(args));
    }
};

struct cpu_op
{
    operation op;
    std::string name() const { return "cpu::" + op.name(); }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, const shape& output_shape, const std::vector<argument>& args) const
    {
        return op.compute(output_shape, args);
    }
    friend bool operator==(const cpu_op& x, const cpu_op& y) { return x.op == y.op; }
    friend bool operator==(const cpu_op& x, const operation& y)
    {
        if(x.name() != y.name())
            return false;
        return x == any_cast<cpu_op>(y);
    }
    friend bool operator==(const operation& x, const cpu_op& y) { return y == x; }
};
struct cpu_pad
{
op::pad op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::contiguous"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
......@@ -354,184 +397,54 @@ struct cpu_pad
}
};
struct cpu_concat
{
op::concat op;
std::string name() const { return "cpu::concat"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
return op.compute(output_shape, std::move(args));
}
};
struct cpu_gemm
{
    op::dot op;
    std::string name() const { return "cpu::dot"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        migemm(result, args[0], args[1], op.alpha, op.beta);
        return result;
    }
};

struct cpu_gather
{
    op::gather op;
    std::string name() const { return "cpu::gather"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        return op.compute(output_shape, std::move(args));
    }
};

struct identity_op
{
    std::string name() const { return "cpu::identity"; }
    auto fcn() const
    {
        return [](auto x) { return x; };
    }
};

struct abs_op
{
    std::string name() const { return "cpu::abs"; }
    auto fcn() const
    {
        return [](auto x) { return std::abs(make_signed(x)); };
    }
};

struct exp_op
{
    std::string name() const { return "cpu::exp"; }
    auto fcn() const
    {
        return [](auto x) { return std::exp(x); };
    }
};

struct log_op
{
    std::string name() const { return "cpu::log"; }
    auto fcn() const
    {
        return [](auto x) { return std::log(x); };
    }
};

struct sin_op
{
    std::string name() const { return "cpu::sin"; }
    auto fcn() const
    {
        return [](auto x) { return std::sin(x); };
    }
};

struct cos_op
{
    std::string name() const { return "cpu::cos"; }
    auto fcn() const
    {
        return [](auto x) { return std::cos(x); };
    }
};

struct tan_op
{
    std::string name() const { return "cpu::tan"; }
    auto fcn() const
    {
        return [](auto x) { return std::tan(x); };
    }
};

struct asin_op
{
    std::string name() const { return "cpu::asin"; }
    auto fcn() const
    {
        return [](auto x) { return std::asin(x); };
    }
};

struct acos_op
{
    std::string name() const { return "cpu::acos"; }
    auto fcn() const
    {
        return [](auto x) { return std::acos(x); };
    }
};

struct atan_op
{
    std::string name() const { return "cpu::atan"; }
    auto fcn() const
    {
        return [](auto x) { return std::atan(x); };
    }
};

struct sinh_op
{
    std::string name() const { return "cpu::sinh"; }
    auto fcn() const
    {
        return [](auto x) { return std::sinh(x); };
    }
};

struct cosh_op
{
    std::string name() const { return "cpu::cosh"; }
    auto fcn() const
    {
        return [](auto x) { return std::cosh(x); };
    }
};

struct tanh_op
{
    std::string name() const { return "cpu::tanh"; }
    auto fcn() const
    {
        return [](auto x) { return std::tanh(x); };
    }
};

struct sigmoid_op
{
    std::string name() const { return "cpu::sigmoid"; }
    auto fcn() const
    {
        return [](auto x) { return 1.f / (1.f + std::exp(-x)); };
    }
};

struct neg_op
{
    std::string name() const { return "cpu::neg"; }
    auto fcn() const
    {
        return [](auto x) { return -x; };
    }
};

struct relu_op
{
    std::string name() const { return "cpu::relu"; }
    auto fcn() const
    {
        return [](auto x) { return std::max(decltype(x){0}, x); };
    }
};

struct cpu_gemm
{
    op::dot op;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

    std::string name() const { return "cpu::dot"; }
    shape compute_shape(const std::vector<shape>& inputs) const
    {
        if(inputs.size() == 3)
        {
            auto c_shape = inputs.at(2);
            check_shapes{{c_shape}}.not_broadcasted();
        }
        return op.compute_shape(inputs);
    }

    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        // with 3 inputs the operation is alpha * A * B + beta * C, where
        // A and B are matrices, and C is broadcastable to A * B
        if(args.size() == 3)
        {
            // when beta is 0, the value of args[2] does not matter
            if(op.beta == 0.0f)
            {
                result.visit([&](auto output) { std::fill(output.begin(), output.end(), 0); });
            }
            else
            {
                visit_all(result, args[2])([&](auto output, auto input) {
                    std::copy(input.begin(), input.end(), output.begin());
                });
            }

            migemm(result, args[0], args[1], op.alpha, op.beta);
            return result;
        }

        // 2 input arguments
        migemm(result, args[0], args[1], op.alpha, 0.0f);
        return result;
    }
};
......@@ -561,16 +474,45 @@ template <typename Op>
struct cpu_unary
{
Op op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op.op, f);
}
std::string name() const { return op.name(); }
shape compute_shape(const std::vector<shape>& inputs) const { return inputs.front(); }
shape compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs}.has(1);
auto s = inputs.at(0);
if(s.packed())
{
return s;
}
else
{
return {s.type(), s.lens()};
}
}
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
result.visit([&](auto output) {
args[0].visit([&](auto input) {
std::transform(input.begin(), input.end(), output.begin(), op.fcn());
if(input.get_shape().standard())
{
std::transform(input.begin(), input.end(), output.begin(), op.fcn());
}
else
{
shape_for_each(output.get_shape(), [&](const auto& idx) {
output(idx.begin(), idx.end()) = op.fcn()(input(idx.begin(), idx.end()));
});
}
});
});
return result;
}
};
......@@ -590,20 +532,20 @@ struct softmax2d
auto nw = input.get_shape().lens()[3];
dfor(nb, nh, nw)([&](std::size_t b, std::size_t i, std::size_t j) {
value_type cmax = std::numeric_limits<value_type>::lowest();
for(int c = 0; c < nc; c++)
for(std::size_t c = 0; c < nc; c++)
{
cmax = std::max(cmax, input(b, c, i, j));
}
for(int c = 0; c < nc; c++)
for(std::size_t c = 0; c < nc; c++)
{
output(b, c, i, j) = std::exp(input(b, c, i, j) - cmax);
}
value_type sum = value_type(0);
for(int c = 0; c < nc; c++)
for(std::size_t c = 0; c < nc; c++)
{
sum += output(b, c, i, j);
}
for(int c = 0; c < nc; c++)
for(std::size_t c = 0; c < nc; c++)
{
output(b, c, i, j) = output(b, c, i, j) / sum;
}
......@@ -613,83 +555,78 @@ struct softmax2d
}
};
struct add_op
{
std::string name() const { return "add"; }
auto fcn() const
{
return [](auto x, auto y) { return x + y; };
}
};
struct sub_op
{
std::string name() const { return "sub"; }
auto fcn() const
{
return [](auto x, auto y) { return x - y; };
}
};
struct mul_op
{
    std::string name() const { return "mul"; }
    auto fcn() const
    {
        return [](auto x, auto y) { return x * y; };
    }
};

struct div_op
{
    std::string name() const { return "div"; }
    auto fcn() const
    {
        return [](auto x, auto y) { return x / y; };
    }
};

struct max_op
{
    std::string name() const { return "max"; }
    auto fcn() const
    {
        return [](auto x, auto y) { return std::max(x, y); };
    }
};

struct min_op
{
    std::string name() const { return "min"; }
    auto fcn() const
    {
        return [](auto x, auto y) { return std::min(x, y); };
    }
};

template <typename Op>
struct cpu_binary
{
    Op op;
    std::string name() const { return op.name(); }
    shape compute_shape(const std::vector<shape>& inputs) const { return inputs.front(); }
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        visit_all(result, args[0], args[1])([&](auto output, auto input1, auto input2) {
            if(input1.get_shape().packed() and input2.get_shape().packed())
            {
                std::transform(
                    input1.begin(), input1.end(), input2.begin(), output.begin(), op.fcn());
            }
            else
            {
                shape_for_each(output.get_shape(), [&](const auto& idx) {
                    output(idx.begin(), idx.end()) =
                        op.fcn()(input1(idx.begin(), idx.end()), input2(idx.begin(), idx.end()));
                });
            }
        });
        return result;
    }
};

struct cpu_logsoftmax
{
    op::logsoftmax op;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }

    std::string name() const { return "cpu::logsoftmax"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }

    template <typename T>
    std::size_t compute_batch_index(const T& idx, shape& batch_shape, int axis) const
    {
        if(axis == 0)
        {
            return 0;
        }
        else
        {
            std::vector<std::size_t> batch_idx(idx.begin(), idx.begin() + axis);
            return batch_shape.index(batch_idx.begin(), batch_idx.end());
        }
    }

    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        auto lens = output_shape.lens();
        std::vector<std::size_t> batch_lens{};
        if(op.axis == 0)
        {
            batch_lens.push_back(1);
        }
        else
        {
            batch_lens.insert(batch_lens.begin(), lens.begin(), lens.begin() + op.axis);
        }
        shape batch_shape{migraphx::shape::uint32_type, batch_lens};
        visit_all(result, args[0])([&](auto output, auto input) {
            using value_type = typename decltype(input)::value_type;
            // subtract the per-batch max before exponentiating to keep the
            // exp/log computation numerically stable
            std::vector<value_type> batch_max(batch_shape.elements(),
                                              std::numeric_limits<value_type>::lowest());
            shape_for_each(output_shape, [&](auto idx) {
                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
                batch_max[index] = std::max(batch_max[index], input(idx.begin(), idx.end()));
            });

            shape_for_each(output_shape, [&](auto idx) {
                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
                output(idx.begin(), idx.end()) = input(idx.begin(), idx.end()) - batch_max[index];
            });

            std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
            shape_for_each(output_shape, [&](auto idx) {
                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
                batch_sum[index] += std::exp(output(idx.begin(), idx.end()));
            });

            for(std::size_t i = 0; i < batch_sum.size(); ++i)
            {
                batch_sum[i] = std::log(batch_sum[i]);
            }

            shape_for_each(output_shape, [&](auto idx) {
                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
                output(idx.begin(), idx.end()) -= batch_sum[index];
            });
        });
        return result;
    }
};
......@@ -713,42 +650,17 @@ struct cpu_apply
void init()
{
apply_map["im2col"] = extend_op<cpu_im2col, op::im2col>();
apply_map["convolution"] = extend_op<cpu_convolution, op::convolution>();
apply_map["dot"] = extend_op<cpu_gemm, op::dot>();
apply_map["batch_norm_inference"] =
extend_op<cpu_batch_norm_inference, op::batch_norm_inference>();
apply_map["lrn"] = extend_op<cpu_lrn, op::lrn>();
apply_map["contiguous"] = extend_op<cpu_contiguous, op::contiguous>();
apply_map["pad"] = extend_op<cpu_pad, op::pad>();
apply_map["concat"] = extend_op<cpu_concat, op::concat>();
apply_map["gather"] = extend_op<cpu_gather, op::gather>();
apply_map["leaky_relu"] = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
apply_map["elu"] = extend_op<cpu_unary<elu_op>, op::elu>();
apply_map["identity"] = simple_op<cpu_unary<identity_op>>();
apply_map["abs"] = simple_op<cpu_unary<abs_op>>();
apply_map["sinh"] = simple_op<cpu_unary<sinh_op>>();
apply_map["cosh"] = simple_op<cpu_unary<cosh_op>>();
apply_map["tanh"] = simple_op<cpu_unary<tanh_op>>();
apply_map["sigmoid"] = simple_op<cpu_unary<sigmoid_op>>();
apply_map["exp"] = simple_op<cpu_unary<exp_op>>();
apply_map["log"] = simple_op<cpu_unary<log_op>>();
apply_map["neg"] = simple_op<cpu_unary<neg_op>>();
apply_map["sin"] = simple_op<cpu_unary<sin_op>>();
apply_map["cos"] = simple_op<cpu_unary<cos_op>>();
apply_map["tan"] = simple_op<cpu_unary<tan_op>>();
apply_map["asin"] = simple_op<cpu_unary<asin_op>>();
apply_map["acos"] = simple_op<cpu_unary<acos_op>>();
apply_map["atan"] = simple_op<cpu_unary<atan_op>>();
apply_map["relu"] = simple_op<cpu_unary<relu_op>>();
apply_map["add"] = simple_op<cpu_binary<add_op>>();
apply_map["sub"] = simple_op<cpu_binary<sub_op>>();
apply_map["mul"] = simple_op<cpu_binary<mul_op>>();
apply_map["div"] = simple_op<cpu_binary<div_op>>();
apply_map["max"] = simple_op<cpu_binary<max_op>>();
apply_map["min"] = simple_op<cpu_binary<min_op>>();
apply_map["softmax"] = simple_op<softmax2d>();
apply_map["convolution"] = extend_op<cpu_convolution, op::convolution>();
apply_map["dot"] = extend_op<cpu_gemm, op::dot>();
apply_map["elu"] = extend_op<cpu_unary<elu_op>, op::elu>();
apply_map["im2col"] = extend_op<cpu_im2col, op::im2col>();
apply_map["leaky_relu"] = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
apply_map["logsoftmax"] = extend_op<cpu_logsoftmax, op::logsoftmax>();
apply_map["lrn"] = extend_op<cpu_lrn, op::lrn>();
apply_map["pad"] = extend_op<cpu_pad, op::pad>();
apply_map["softmax"] = simple_op<softmax2d>();
}
void apply()
......@@ -764,9 +676,18 @@ struct cpu_apply
{
apply_map.at(it->name())(it);
}
else if(is_context_free(it->get_operator()))
{
apply_cpu_op(it);
}
}
}
void apply_cpu_op(instruction_ref ins)
{
prog->replace_instruction(ins, cpu_op{ins->get_operator()}, ins->inputs());
}
template <class T>
void apply_simple_op(instruction_ref ins)
{
......
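// Illustrative sketch, not part of this commit: apply_map above is a plain
// name -> rewriter dispatch table, looked up per instruction via
// apply_map.at(it->name())(it). A minimal standalone analogue of the pattern
// (the op names and handlers here are hypothetical placeholders):
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

int main()
{
    std::unordered_map<std::string, std::function<void()>> apply_map;
    apply_map["relu"] = [] { std::cout << "lower relu\n"; };
    apply_map["add"]  = [] { std::cout << "lower add\n"; };
    for(const char* name : {"relu", "add"})
        apply_map.at(name)(); // .at() throws if an op has no registered handler
    return 0;
}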
#include <migraphx/cpu/target.hpp>
#include <migraphx/cpu/lowering.hpp>
#include <migraphx/pass.hpp>
#include <migraphx/auto_contiguous.hpp>
#include <migraphx/rewrite_rnn.hpp>
#include <migraphx/dead_code_elimination.hpp>
......@@ -13,8 +14,9 @@ std::string target::name() const { return "cpu"; }
std::vector<pass> target::get_passes(migraphx::context&) const
{
return {auto_contiguous{},
rewrite_rnn{},
return {rewrite_rnn{},
dead_code_elimination{},
auto_contiguous{},
dead_code_elimination{},
lowering{},
dead_code_elimination{}};
......
......@@ -26,11 +26,14 @@ add_library(migraphx_device
device/atan.cpp
device/add_relu.cpp
device/contiguous.cpp
device/logsoftmax.cpp
device/convert.cpp
device/mul.cpp
device/concat.cpp
device/pad.cpp
device/gather.cpp
device/sub.cpp
device/clip.cpp
)
set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
rocm_clang_tidy_check(migraphx_device)
......@@ -48,6 +51,7 @@ add_library(migraphx_gpu
pooling.cpp
convolution.cpp
softmax.cpp
logsoftmax.cpp
contiguous.cpp
concat.cpp
relu.cpp
......@@ -62,6 +66,9 @@ add_library(migraphx_gpu
pad.cpp
gather.cpp
lrn.cpp
schedule_model.cpp
adjust_allocation.cpp
clip.cpp
)
set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
rocm_clang_tidy_check(migraphx_gpu)
......
#include <migraphx/gpu/abs.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <utility>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -10,8 +7,8 @@ namespace gpu {
shape miopen_abs::compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(2).not_broadcasted();
return inputs.at(1);
check_shapes{inputs, *this}.has(2).packed();
return inputs.at(0);
}
argument miopen_abs::compute(context& ctx,
......
#include <migraphx/gpu/adjust_allocation.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/program.hpp>
#include <migraphx/iterator_for.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
void adjust_allocation::apply(program& p) const
{
for(auto ins : iterator_for(p))
{
        // skip instructions that have no inputs
if(ins->inputs().empty())
continue;
if(ins->name() == "load")
continue;
auto alias_ins = instruction::get_output_alias(ins, true);
if(alias_ins->name() == "hip::allocate")
{
            // the allocated shape differs from the instruction's actual
            // shape; reallocate and replace the previous allocation
if(alias_ins->get_shape() != ins->get_shape())
{
auto alloc_ins = p.insert_instruction(ins, hip_allocate{ins->get_shape()});
p.replace_instruction(alias_ins, alloc_ins);
}
}
}
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
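// Illustrative sketch, not part of this commit: adjust_allocation walks the
// program and, when an instruction's shape no longer matches the buffer its
// output aliases, inserts an allocation of the right shape and swaps it in.
// A toy standalone analogue (the node type and its fields are hypothetical):
#include <cassert>
#include <cstddef>
#include <vector>

struct node
{
    std::size_t needed_bytes; // what the instruction's shape now requires
    std::size_t alloc_bytes;  // what its allocation was created with
};

void adjust(std::vector<node>& nodes)
{
    for(auto& n : nodes)
        if(n.alloc_bytes != n.needed_bytes)
            n.alloc_bytes = n.needed_bytes; // stands in for insert + replace_instruction
}

int main()
{
    std::vector<node> ns = {{64, 64}, {128, 64}};
    adjust(ns);
    assert(ns[1].alloc_bytes == 128);
}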
#include <migraphx/gpu/batchnorm.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <utility>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......
#include <migraphx/gpu/clip.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/clip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_clip::compute_shape(std::vector<shape> inputs) const
{
inputs.pop_back();
return op.compute_shape(inputs);
}
argument hip_clip::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
device::clip(ctx.get_stream().get(), args.back(), args.front(), op.max_val, op.min_val);
return args.back();
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
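// Illustrative sketch, not part of this commit: hip_clip clamps every element
// to [min_val, max_val] and returns the output buffer passed as the last
// argument. The same elementwise computation on the host, for reference:
#include <algorithm>
#include <cassert>
#include <vector>

inline std::vector<float> clip_host(std::vector<float> x, float max_val, float min_val)
{
    std::transform(x.begin(), x.end(), x.begin(), [=](float v) {
        return std::min(std::max(min_val, v), max_val); // mirrors device::clip's lambda
    });
    return x;
}

int main()
{
    auto y = clip_host({-2.0f, 0.5f, 3.0f}, 1.0f, -1.0f);
    assert((y == std::vector<float>{-1.0f, 0.5f, 1.0f}));
}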
#include <migraphx/gpu/concat.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/concat.hpp>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......
#include <migraphx/gpu/contiguous.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <utility>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/contiguous.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......
#include <migraphx/gpu/convolution.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <utility>
#include <migraphx/gpu/context.hpp>
#include <migraphx/generate.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -23,19 +21,21 @@ argument miopen_convolution::compute(context& ctx,
float alpha = 1;
float beta = 0;
miopenConvolutionForward(ctx.get_stream().get_miopen(),
&alpha,
x_desc.get(),
args[0].implicit(),
w_desc.get(),
args[1].implicit(),
cd.get(),
algo,
&beta,
y_desc.get(),
args[3].implicit(),
args[2].implicit(),
args[2].get_shape().bytes());
auto status = miopenConvolutionForward(ctx.get_stream().get_miopen(),
&alpha,
x_desc.get(),
args[0].implicit(),
w_desc.get(),
args[1].implicit(),
cd.get(),
algo,
&beta,
y_desc.get(),
args[3].implicit(),
args[2].implicit(),
args[2].get_shape().bytes());
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("Running convolution failed");
return args[3];
}
......@@ -91,8 +91,11 @@ void miopen_convolution::finalize(context& ctx,
{
if(handle == ctx.get_stream().get_miopen())
return;
// TODO: Check that workspace hasn't changed
compile(ctx, output_shape, std::move(inputs));
// Check that workspace hasn't changed
auto size = inputs.at(2).bytes();
auto ws = compile(ctx, output_shape, std::move(inputs));
if(ws.bytes() > size)
MIGRAPHX_THROW("Workspace has changed during finalization.");
}
} // namespace gpu
......
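// Illustrative pattern, not part of this commit: the status check added around
// miopenConvolutionForward above can be factored into a helper so every MIOpen
// call is verified the same way. CHECK_MIOPEN is a hypothetical name:
#define CHECK_MIOPEN(call)                                  \
    do                                                      \
    {                                                       \
        const miopenStatus_t check_status_ = (call);        \
        if(check_status_ != miopenStatusSuccess)            \
            MIGRAPHX_THROW("MIOpen call failed: " #call);   \
    } while(false)

// Usage: CHECK_MIOPEN(miopenConvolutionForward(handle, ...));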
#include <migraphx/gpu/device/clip.hpp>
#include <migraphx/gpu/device/nary.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void clip(hipStream_t stream,
const argument& result,
const argument& arg1,
const float max,
const float min)
{
nary(stream, result, arg1)(
[max, min](auto x) { return std::min<decltype(x)>(std::max<decltype(x)>(min, x), max); });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx