gaoqiong / MIGraphX · Commits

Commit 8a3d1d09
authored Feb 09, 2019 by Paul

Merge branch 'develop' into py

Parents: fc8c2664, 6972ad26

Changes: 55
Showing 20 changed files with 1352 additions and 71 deletions (+1352 -71)
src/CMakeLists.txt                                     +1   -0
src/auto_contiguous.cpp                                +1   -1
src/dead_code_elimination.cpp                          +3   -2
src/include/migraphx/auto_any_cast.hpp                 +4   -0
src/include/migraphx/operation.hpp                     +2   -2
src/include/migraphx/operators.hpp                     +149 -9
src/include/migraphx/program.hpp                       +2   -0
src/include/migraphx/rewrite_rnn.hpp                   +53  -0
src/instruction.cpp                                    +1   -1
src/onnx/onnx.cpp                                      +252 -3
src/program.cpp                                        +29  -6
src/rewrite_rnn.cpp                                    +668 -0
src/simplify_reshapes.cpp                              +62  -38
src/targets/cpu/lowering.cpp                           +39  -8
src/targets/cpu/target.cpp                             +7   -1
src/targets/gpu/CMakeLists.txt                         +2   -0
src/targets/gpu/device/sub.cpp                         +17  -0
src/targets/gpu/gemm.cpp                               +1   -0
src/targets/gpu/include/migraphx/gpu/device/sub.hpp    +20  -0
src/targets/gpu/include/migraphx/gpu/lrn.hpp           +39  -0
src/CMakeLists.txt

@@ -11,6 +11,7 @@ add_library(migraphx
     eliminate_contiguous.cpp
     eliminate_concat.cpp
     fwd_conv_batchnorm_rewrite.cpp
+    rewrite_rnn.cpp
     env.cpp
     generate.cpp
     instruction.cpp
src/auto_contiguous.cpp

@@ -12,7 +12,7 @@ void auto_contiguous::apply(program& p) const
     for(auto ins : iterator_for(p))
     {
         shape s = ins->get_shape();
-        if(not s.standard())
+        if(not s.standard() and s.elements() != 0)
         {
             auto c = p.insert_instruction(std::next(ins), op::contiguous{}, ins);
             p.replace_instruction(ins, c);
src/dead_code_elimination.cpp

@@ -41,8 +41,9 @@ void dead_code_elimination::apply(program& p) const
             // Skip the last instruction
             if(i == last)
                 break;
-            // Skip instruction with empty shape as output unless its a builtin
-            if(i->get_shape().elements() == 0 and not(i->name().front() == '@'))
+            // Skip an instruction with an empty output shape unless it's a builtin or undefined
+            if(i->get_shape().elements() == 0 and not(i->name().front() == '@') and
+               not(i->name() == "undefined"))
                 continue;
             assert(bidistance(p, i, last) > 0);
             fix([&](auto self, auto leaf) {
src/include/migraphx/auto_any_cast.hpp

@@ -5,6 +5,10 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

+// Forward declare any_cast
+template <class T>
+const T& any_cast(const T&);
+
 namespace detail {

 template <class U>
src/include/migraphx/operation.hpp

@@ -7,17 +7,17 @@
 #include <memory>
 #include <type_traits>
 #include <utility>
 #include <migraphx/shape.hpp>
 #include <migraphx/reflect.hpp>
 #include <migraphx/streamutils.hpp>
 #include <migraphx/argument.hpp>
-#include <migraphx/context.hpp>
+#include <migraphx/auto_any_cast.hpp>
 #include <migraphx/config.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

+struct context;
+
 #ifdef DOXYGEN

 /// The operation interface represents an action an instruction will perform. All
src/include/migraphx/operators.hpp

@@ -60,6 +60,30 @@ struct batch_norm_inference
     }
 };

+struct lrn
+{
+    float alpha = 0.0001;
+    float beta  = 0.75;
+    float bias  = 1.0;
+    int size    = 1;
+
+    std::string name() const { return "lrn"; }
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.alpha, "alpha"),
+                    f(self.beta, "beta"),
+                    f(self.bias, "bias"),
+                    f(self.size, "size"));
+    }
+
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        check_shapes{inputs, *this}.has(1);
+        return inputs.front();
+    }
+};
+
 struct convolution
 {
     std::array<std::size_t, 2> padding = {{0, 0}};
@@ -358,6 +382,17 @@ struct contiguous
         auto t = inputs.at(0).type();
         return {t, lens};
     }
+
+    argument compute(const shape& output_shape, std::vector<argument> args) const
+    {
+        assert(output_shape.standard());
+        argument result{output_shape};
+        visit_all(result, args[0])([&](auto output, auto input) {
+            shape_for_each(output.get_shape(), [&](const auto& idx) {
+                output(idx.begin(), idx.end()) = input(idx.begin(), idx.end());
+            });
+        });
+        return result;
+    }
 };

 struct concat
@@ -430,7 +465,6 @@ struct concat
         }
         return result;
     }
-    int output_alias(const std::vector<shape>&) const { return 0; }
 };

 struct slice
@@ -616,11 +650,16 @@ struct reshape
         {
             if(dims[i] == 0)
                 rdims[i] = idims[i];
+
+            // since rdims uses the size_t type, -1 would wrap to the maximum
+            // value of size_t, which makes the later computation incorrect
+            if(dims[i] == -1)
+                rdims[i] = 1;
         }
+
         if(n_neg_dims > 0)
         {
             size_t missing_dim =
-                -inputs.front().elements() /
+                inputs.front().elements() /
                 std::accumulate(rdims.begin(), rdims.end(), 1, std::multiplies<int64_t>());
             for(std::size_t i = 0; i < rdims.size(); i++)
             {
@@ -628,11 +667,7 @@ struct reshape
                 rdims[i] = missing_dim;
             }
         }
-        if(dims.back() == -1)
-        {
-            rdims.pop_back();
-            std::copy(idims.begin() + rdims.size(), idims.end(), std::back_inserter(rdims));
-        }
         shape s{inputs.front().type(), rdims};
+        if(s.elements() != inputs.front().elements())
+            MIGRAPHX_THROW("Wrong number of elements for reshape");
@@ -764,8 +799,6 @@ struct gather
         return result;
     }
-
-    int output_alias(const std::vector<shape>&) const { return 0; }
 };

 struct dot
@@ -1131,6 +1164,113 @@ struct outline
     argument compute(const shape&, const std::vector<argument>&) const
     {
         return {s, nullptr};
     }
 };

+// indicate rnn computation direction
+enum class rnn_direction
+{
+    forward,
+    reverse,
+    bidirectional,
+};
+
+struct rnn
+{
+    std::size_t hidden_size = 1;
+    std::vector<operation> actv_funcs{tanh{}, tanh{}};
+    rnn_direction direction = rnn_direction::forward;
+    float clip              = 0.0f;
+
+    std::string name() const { return "rnn"; }
+
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        auto in_dims     = inputs[0].lens();
+        auto hidden_dims = inputs[2].lens();
+        if(hidden_size != hidden_dims[2])
+        {
+            MIGRAPHX_THROW("RNN: hidden size mismatch in attribute and input");
+        }
+
+        std::size_t num_directions = 1;
+        if(direction == rnn_direction::bidirectional)
+        {
+            num_directions = 2;
+        }
+
+        if(num_directions != hidden_dims[0])
+        {
+            MIGRAPHX_THROW("RNN: num_direction mismatch in attribute and input");
+        }
+
+        std::vector<std::size_t> out_dims(in_dims);
+        out_dims.insert(out_dims.begin() + 1, num_directions);
+        out_dims.back() = hidden_size;
+
+        return {inputs[0].type(), out_dims};
+    }
+};
+
+struct rnn_last_output
+{
+    std::string name() const { return "rnn_last_output"; }
+
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        check_shapes{inputs, *this}.has(1);
+        auto dims = inputs[0].lens();
+
+        // remove the first dimension; the remaining dimensions are the output shape
+        dims.erase(dims.begin());
+
+        return {inputs[0].type(), dims};
+    }
+};
+
+struct gru
+{
+    std::size_t hidden_size = 1;
+    std::vector<operation> actv_funcs{sigmoid{}, tanh{}};
+    rnn_direction direction = rnn_direction::forward;
+    float clip              = 0.0f;
+    int linear_before_reset = 0;
+
+    std::string name() const { return "gru"; }
+
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        auto in_dims     = inputs[0].lens();
+        auto hidden_dims = inputs[2].lens();
+        if(hidden_size != hidden_dims[2])
+        {
+            MIGRAPHX_THROW("GRU: hidden size mismatch in attribute and input");
+        }
+
+        std::size_t num_directions = 1;
+        if(direction == rnn_direction::bidirectional)
+        {
+            num_directions = 2;
+        }
+
+        if(num_directions != hidden_dims[0])
+        {
+            MIGRAPHX_THROW("GRU: num_direction does not match the direction attribute");
+        }
+
+        std::vector<std::size_t> out_dims(in_dims);
+        out_dims.insert(out_dims.begin() + 1, num_directions);
+        out_dims.back() = hidden_size;
+
+        return {inputs[0].type(), out_dims};
+    }
+};
+
+struct undefined
+{
+    std::string name() const { return "undefined"; }
+
+    shape compute_shape(const std::vector<shape>& inputs) const
+    {
+        check_shapes{inputs, *this}.has(0);
+        return {};
+    }
+
+    argument compute(const shape&, const std::vector<argument>&) const
+    {
+        return {{}, nullptr};
+    }
+};
+
 } // namespace op
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
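A small sanity check of the rnn shape rule above (an illustrative sketch, not part of this commit; the aggregate-initializer order follows the struct definition):

#include <migraphx/operators.hpp>

// Input [seq_len, batch, input_size] plus weights W [num_dirs, hidden, input]
// and R [num_dirs, hidden, hidden] should give [seq_len, num_dirs, batch, hidden].
migraphx::shape rnn_out_shape()
{
    using migraphx::shape;
    shape seq{shape::float_type, {7, 4, 10}}; // seq_len=7, batch=4, input=10
    shape w{shape::float_type, {1, 20, 10}};  // forward only, hidden_size=20
    shape r{shape::float_type, {1, 20, 20}};
    migraphx::op::rnn op{20,
                         {migraphx::op::tanh{}, migraphx::op::tanh{}},
                         migraphx::op::rnn_direction::forward,
                         0.0f};
    return op.compute_shape({seq, w, r}); // -> {float_type, {7, 1, 4, 20}}
}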
src/include/migraphx/program.hpp

@@ -105,6 +105,8 @@ struct program
     void debug_print(instruction_ref ins) const;
     void debug_print(const std::vector<instruction_ref>& inss) const;

+    void dry_run(parameter_map params) const;
+
     friend std::ostream& operator<<(std::ostream& os, const program& p);
     friend bool operator==(const program& x, const program& y);
     friend bool operator!=(const program& x, const program& y) { return !(x == y); }
src/include/migraphx/rewrite_rnn.hpp (new file)

#ifndef MIGRAPHX_GUARD_RTGLIB_REWRITE_RNN_HPP
#define MIGRAPHX_GUARD_RTGLIB_REWRITE_RNN_HPP

#include <string>
#include <vector>
#include <migraphx/instruction_ref.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/config.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {

struct program;

/**
 * Rewrite rnn to gemm and add.
 */
struct rewrite_rnn
{
    std::string name() const { return "rewrite_rnn"; }
    void apply(program& prog) const;

    private:
    // for vanilla rnn operators
    void apply_vanilla_rnn(program& prog, instruction_ref ins) const;
    std::vector<instruction_ref> vanilla_rnn_cell(bool is_forward,
                                                  program& prog,
                                                  instruction_ref ins,
                                                  instruction_ref input,
                                                  instruction_ref w,
                                                  instruction_ref r,
                                                  instruction_ref bias,
                                                  instruction_ref ih,
                                                  operation& actv_func) const;
    std::vector<operation> vanilla_rnn_actv_funcs(instruction_ref ins) const;

    // for gru operators
    void apply_gru(program& prog, instruction_ref ins) const;
    std::vector<instruction_ref> gru_cell(bool is_forward,
                                          program& prog,
                                          instruction_ref ins,
                                          std::vector<instruction_ref> inputs,
                                          int linear_before_reset,
                                          const operation& actv_func1,
                                          const operation& actv_func2) const;
    std::vector<operation> gru_actv_funcs(instruction_ref ins) const;
};

} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
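As a quick orientation for readers of this header, here is a sketch of how the pass would be driven by hand (illustrative only; targets normally schedule it through get_passes, as the cpu target change later in this commit shows):

#include <migraphx/rewrite_rnn.hpp>
#include <migraphx/program.hpp>

void run_rewrite(migraphx::program& p)
{
    // replaces every rnn/gru instruction in p with slice/dot/add/activation
    // chains, then rewires any rnn_last_output consumers
    migraphx::rewrite_rnn{}.apply(p);
}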
src/instruction.cpp

@@ -97,7 +97,7 @@ const std::vector<instruction_ref>& instruction::outputs() const { return output
 bool operator==(const instruction& x, const instruction& y)
 {
-    if(not(x.result == y.result and x.op == y.op and x.arguments == y.arguments))
+    if(std::tie(x.result, x.op, x.arguments) != std::tie(y.result, y.op, y.arguments))
         return false;
     if(x.name() == "@literal")
         return x.lit == y.lit;
src/onnx/onnx.cpp

@@ -32,6 +32,7 @@ struct onnx_parser
     bool is_pytorch = false;
     std::unordered_map<std::string, op_func> ops;
+    std::unordered_map<std::string, operation> map_actv_funcs;

     onnx_parser()
     {
@@ -63,6 +64,7 @@ struct onnx_parser
         add_variadic_op("Max", op::max{});
         add_variadic_op("Min", op::min{});
+        add_mem_op("LRN", &onnx_parser::parse_lrn);
         add_mem_op("ImageScaler", &onnx_parser::parse_imagescaler);
         add_mem_op("LeakyRelu", &onnx_parser::parse_leaky_relu);
         add_mem_op("Elu", &onnx_parser::parse_elu);
@@ -85,7 +87,21 @@ struct onnx_parser
         add_mem_op("Shape", &onnx_parser::parse_shape);
         add_mem_op("ConstantFill", &onnx_parser::parse_constant_fill);
         add_mem_op("Transpose", &onnx_parser::parse_transpose);
+        add_mem_op("RNN", &onnx_parser::parse_rnn);
+        add_mem_op("GRU", &onnx_parser::parse_gru);
         add_mem_op("Pad", &onnx_parser::parse_pad);
+
+        // init the activation function map
+        init_actv_func();
+    }
+
+    void init_actv_func()
+    {
+        map_actv_funcs.insert(std::make_pair("tanh", op::tanh{}));
+        map_actv_funcs.insert(std::make_pair("relu", op::relu{}));
+        map_actv_funcs.insert(std::make_pair("sigmoid", op::sigmoid{}));
+        map_actv_funcs.insert(std::make_pair("leakyrelu", op::leaky_relu{}));
+        map_actv_funcs.insert(std::make_pair("elu", op::elu{}));
     }

     template <class F>
@@ -522,6 +538,25 @@ struct onnx_parser
         return prog.add_instruction(op, args.front());
     }

+    instruction_ref
+    parse_lrn(const std::string&, attribute_map attributes, std::vector<instruction_ref> args)
+    {
+        float alpha = 0.0001;
+        float beta  = 0.75;
+        float bias  = 1.0;
+        int size    = 1;
+        if(contains(attributes, "alpha"))
+            alpha = parse_value(attributes.at("alpha")).at<float>();
+        if(contains(attributes, "beta"))
+            beta = parse_value(attributes.at("beta")).at<float>();
+        if(contains(attributes, "bias"))
+            bias = parse_value(attributes.at("bias")).at<float>();
+        if(contains(attributes, "size"))
+            size = parse_value(attributes.at("size")).at<int>();
+        op::lrn op{alpha, beta, bias, size};
+        return prog.add_instruction(op, args.front());
+    }
+
     instruction_ref
     parse_imagescaler(const std::string&, attribute_map attributes, std::vector<instruction_ref> args)
@@ -677,6 +712,214 @@ struct onnx_parser
         }
     }

+    std::vector<instruction_ref>
+    parse_rnn(const std::string&, attribute_map attributes, std::vector<instruction_ref> args)
+    {
+        migraphx::shape input_shape = args[0]->get_shape();
+        std::size_t hidden_size     = args[1]->get_shape().lens()[1];
+
+        if(contains(attributes, "hidden_size"))
+        {
+            std::size_t hidden_size_att = parse_value(attributes.at("hidden_size")).at<int>();
+            if(hidden_size != hidden_size_att)
+            {
+                MIGRAPHX_THROW("RNN: hidden size mismatch in input and attribute");
+            }
+        }
+
+        // Handling of direction to be added later
+        std::string direction{"forward"};
+        if(contains(attributes, "direction"))
+        {
+            direction = attributes.at("direction").s();
+        }
+
+        op::rnn_direction dirct = op::rnn_direction::forward;
+        if(direction == "bidirectional")
+        {
+            dirct = op::rnn_direction::bidirectional;
+        }
+        else if(direction == "reverse")
+        {
+            dirct = op::rnn_direction::reverse;
+        }
+
+        std::vector<std::string> vec_names{"tanh"};
+        if(contains(attributes, "activations"))
+        {
+            auto names = attributes.at("activations").strings();
+            vec_names.clear();
+            for_each(names.begin(), names.end(), [&](auto& fn) { vec_names.push_back(fn); });
+        }
+
+        for_each(vec_names.begin(), vec_names.end(), [&](auto& fn) {
+            if(map_actv_funcs.count(fn) == 0)
+            {
+                MIGRAPHX_THROW("RNN: activation function " + std::string(fn) + " not supported");
+            }
+        });
+
+        // the bidirectional case should have two activation functions:
+        // one for forward and the other for reverse.
+        // if only one actv function is provided, we use it in both the
+        // forward and reverse directions
+        if(dirct == op::rnn_direction::bidirectional)
+        {
+            if(vec_names.size() == 1)
+            {
+                vec_names.push_back(vec_names.at(0));
+            }
+        }
+
+        std::vector<operation> vec_actv_funcs(vec_names.size());
+        std::transform(vec_names.begin(), vec_names.end(), vec_actv_funcs.begin(), [&](auto& fn) {
+            return map_actv_funcs[fn];
+        });
+
+        // To be added later
+        float clip = 0.0;
+        if(contains(attributes, "clip"))
+        {
+            clip = parse_value(attributes.at("clip")).at<float>();
+        }
+
+        // if the number of arguments is less than 6, append
+        // undefined operators to get 6 arguments
+        if(args.size() < 6)
+        {
+            auto ins = prog.add_instruction(op::undefined{});
+            args.insert(args.end(), (6 - args.size()), ins);
+        }
+
+        // first output is the concatenation of hidden states
+        auto hidden_states =
+            prog.add_instruction(op::rnn{hidden_size, vec_actv_funcs, dirct, clip}, std::move(args));
+
+        // second output is the last hidden state
+        auto last_output = prog.add_instruction(op::rnn_last_output{}, hidden_states);
+
+        return {hidden_states, last_output};
+    }
+
+    std::vector<instruction_ref>
+    parse_gru(const std::string&, attribute_map attributes, std::vector<instruction_ref> args)
+    {
+        migraphx::shape input_shape = args[0]->get_shape();
+        std::size_t hidden_size     = args[2]->get_shape().lens()[2];
+
+        if(contains(attributes, "hidden_size"))
+        {
+            std::size_t hidden_size_att = parse_value(attributes.at("hidden_size")).at<int>();
+            if(hidden_size != hidden_size_att)
+            {
+                MIGRAPHX_THROW("GRU: hidden size mismatch in input and attribute");
+            }
+        }
+
+        // Handling of direction to be added later
+        std::string direction{"forward"};
+        if(contains(attributes, "direction"))
+        {
+            direction = attributes.at("direction").s();
+        }
+
+        op::rnn_direction dirct = op::rnn_direction::forward;
+        if(direction == "bidirectional")
+        {
+            dirct = op::rnn_direction::bidirectional;
+        }
+        else if(direction == "reverse")
+        {
+            dirct = op::rnn_direction::reverse;
+        }
+
+        std::vector<std::string> vec_names = {"sigmoid", "tanh"};
+        if(contains(attributes, "activations"))
+        {
+            auto names = attributes.at("activations").strings();
+            vec_names.clear();
+            vec_names.resize(names.size());
+            std::transform(names.begin(), names.end(), vec_names.begin(), [](auto& str) {
+                return str;
+            });
+        }
+
+        // need 4 activation functions
+        if(dirct == op::rnn_direction::bidirectional)
+        {
+            // 4 activation functions are used in the bidirectional
+            // scenario. No spec is provided in onnx::operator, so we
+            // use the following algorithm: if 1 actv function is provided,
+            // repeat it four times. If 2 actv functions are provided,
+            // assume forward and reverse use the same pair of actv
+            // functions. For the case of 3 actv functions provided,
+            // assume the 3rd one is repeated once and used by the
+            // reverse direction.
+            // This may need to change later
+            if(vec_names.size() == 1)
+            {
+                vec_names.insert(vec_names.end(), 3, vec_names.at(0));
+            }
+            else if(vec_names.size() == 2)
+            {
+                // repeat the activation functions
+                vec_names.push_back(vec_names.at(0));
+                vec_names.push_back(vec_names.at(1));
+            }
+            else if(vec_names.size() == 3)
+            {
+                vec_names.push_back(vec_names.at(2));
+            }
+        }
+        else
+        {
+            if(vec_names.size() == 1)
+            {
+                vec_names.push_back(vec_names.at(0));
+            }
+        }
+
+        for_each(vec_names.begin(), vec_names.end(), [&](auto& name) {
+            if(map_actv_funcs.count(name) == 0)
+            {
+                MIGRAPHX_THROW("GRU: activation function " + std::string(name) + " not supported");
+            }
+        });
+
+        std::vector<operation> vec_actv_funcs(vec_names.size());
+        std::transform(vec_names.begin(), vec_names.end(), vec_actv_funcs.begin(), [&](auto& name) {
+            return map_actv_funcs[name];
+        });
+
+        float clip = 0.0;
+        if(contains(attributes, "clip"))
+        {
+            clip = parse_value(attributes.at("clip")).at<float>();
+        }
+
+        int linear_before_reset = 0;
+        if(contains(attributes, "linear_before_reset"))
+        {
+            linear_before_reset = parse_value(attributes.at("linear_before_reset")).at<int>();
+        }
+
+        // append undefined operators to make 6 arguments
+        if(args.size() < 6)
+        {
+            auto ins = prog.add_instruction(op::undefined{});
+            args.insert(args.end(), 6 - args.size(), ins);
+        }
+
+        // first output is the concatenation of hidden states
+        auto hidden_states = prog.add_instruction(
+            op::gru{hidden_size, vec_actv_funcs, dirct, clip, linear_before_reset},
+            std::move(args));
+
+        // second output is the last gru output
+        auto last_output = prog.add_instruction(op::rnn_last_output{}, hidden_states);
+
+        return {hidden_states, last_output};
+    }
+
     void parse_from(std::istream& is)
     {
         onnx::ModelProto model;
@@ -723,6 +966,12 @@ struct onnx_parser
         }
     }

+    void parse_undefined(const std::string& name)
+    {
+        auto ins           = prog.add_instruction(op::undefined{});
+        instructions[name] = ins;
+    }
+
     void parse_node(const std::string& name)
     {
         if(name.empty())
@@ -737,12 +986,12 @@ struct onnx_parser
                 {
                     assert(name != input);
                     this->parse_node(input);
-                    args.push_back(instructions.at(input));
                 }
-                else
+                else if(input.empty())
                 {
-                    args.push_back(instructions.at(input));
+                    this->parse_undefined(input);
                 }
+                args.push_back(instructions.at(input));
             }
             std::vector<instruction_ref> result;
             if(ops.count(node.op_type()) == 0)
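The bidirectional defaulting rule in parse_gru is easy to miss inside the parser body; here is a standalone sketch of just that rule (hypothetical helper name, illustrative only):

#include <string>
#include <vector>

// Expands 1-3 activation names to the 4 a bidirectional GRU needs:
// 1 name is repeated four times, 2 names are reused for the reverse
// direction, and with 3 names the third is repeated once.
std::vector<std::string> expand_gru_activations(std::vector<std::string> names)
{
    if(names.size() == 1)
        names.insert(names.end(), 3, names.at(0));
    else if(names.size() == 2)
    {
        names.push_back(names.at(0));
        names.push_back(names.at(1));
    }
    else if(names.size() == 3)
        names.push_back(names.at(2));
    return names; // now 4 entries: {f_fw, g_fw, f_rv, g_rv}
}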
src/program.cpp

 #include <migraphx/program.hpp>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/instruction.hpp>
 #include <migraphx/operators.hpp>
 #include <migraphx/env.hpp>
 #include <migraphx/ranges.hpp>
 #include <migraphx/time.hpp>
@@ -134,6 +135,12 @@ instruction_ref program::replace_instruction(instruction_ref ins, instruction_re
     assert(has_instruction(ins));
     assert(has_instruction(rep));
     assert(ins != rep);

+    if(ins == std::prev(this->end()))
+    {
+        return replace_instruction(ins, op::identity{}, rep);
+    }
+
     // TODO: Should it be an error if the output is empty?
     if(ins->outputs().empty())
     {
@@ -372,20 +379,31 @@ argument generic_eval(const program& p,

 argument program::eval(std::unordered_map<std::string, argument> params) const
 {
+    auto& ctx = this->impl->ctx;
+#ifndef NDEBUG
+    auto sctx          = ctx;
+    auto check_context = [&](auto f) {
+        assert(is_shared(ctx, sctx));
+        auto x = f();
+        sctx   = ctx;
+        return x;
+    };
+#else
+    auto check_context = [](auto f) { return f(); };
+#endif
     if(enabled(MIGRAPHX_TRACE_EVAL{}))
     {
-        auto& ctx = this->impl->ctx;
-        return generic_eval(*this, this->impl->ctx, std::move(params), [&](auto& ins, auto f) {
+        return generic_eval(*this, ctx, std::move(params), [&](auto& ins, auto f) {
             ctx.finish();
             std::cout << "Run instruction: ";
             this->debug_print(ins);
-            return f();
+            return check_context(f);
         });
     }
     else
     {
-        return generic_eval(
-            *this, this->impl->ctx, std::move(params), [](auto&, auto f) { return f(); });
+        return generic_eval(
+            *this, ctx, std::move(params), [&](auto&, auto f) { return check_context(f); });
     }
 }
@@ -439,8 +457,7 @@ void program::perf_report(std::ostream& os, std::size_t n, parameter_map params)
     overhead_vec.reserve(n);
     for(std::size_t i = 0; i < n; i++)
     {
-        overhead_vec.push_back(time<milliseconds>(
-            [&] { generic_eval(*this, ctx, params, [](auto...) { return argument{}; }); }));
+        overhead_vec.push_back(time<milliseconds>([&] { dry_run(params); }));
     }

     double total_time = common_average(total_vec);
@@ -504,6 +521,12 @@ void program::debug_print(const std::vector<instruction_ref>& inss) const
     std::cout << std::endl;
 }

+void program::dry_run(std::unordered_map<std::string, argument> params) const
+{
+    auto& ctx = this->impl->ctx;
+    generic_eval(*this, ctx, std::move(params), [](auto&&...) { return argument{}; });
+}
+
 bool operator==(const program& x, const program& y) { return to_string(x) == to_string(y); }

 std::ostream& operator<<(std::ostream& os, const program& p)
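A short usage sketch of the two hooks added here (illustrative; assumes an already-compiled migraphx::program, and that MIGRAPHX_TRACE_EVAL is the environment variable checked above, e.g. set to 1):

#include <migraphx/program.hpp>
#include <unordered_map>

void run(migraphx::program& p)
{
    std::unordered_map<std::string, migraphx::argument> params;
    // dry_run walks the whole program but evaluates every instruction to an
    // empty argument; this is what perf_report now times as framework overhead
    p.dry_run(params);
    // with MIGRAPHX_TRACE_EVAL set, eval prints "Run instruction: ..." per step,
    // and in debug builds check_context asserts the context stays shared
    auto result = p.eval(std::move(params));
}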
src/rewrite_rnn.cpp (new file)

#include <migraphx/rewrite_rnn.hpp>
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/dfor.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {

void rewrite_rnn::apply(program& prog) const
{
    for(auto ins : iterator_for(prog))
    {
        if(ins->name() == "rnn")
        {
            apply_vanilla_rnn(prog, ins);
        }

        if(ins->name() == "gru")
        {
            apply_gru(prog, ins);
        }
    }
}
void rewrite_rnn::apply_vanilla_rnn(program& prog, instruction_ref ins) const
{
    assert(ins->name() == "rnn");
    // there could be 3 to 6 inputs, but the parse_rnn function will
    // append undefined operators to make 6 arguments when parsing
    // an onnx file. Another case is that a user can pass any number
    // of arguments when writing their own program.
    auto args = ins->inputs();

    shape seq_shape         = args[0]->get_shape();
    std::size_t hidden_size = args[1]->get_shape().lens()[1];
    std::size_t batch_size  = seq_shape.lens()[1];
    shape::type_t type      = seq_shape.type();
    migraphx::shape ih_shape{type, {1, batch_size, hidden_size}};
    std::vector<float> data(ih_shape.elements(), 0);

    auto actv_funcs         = vanilla_rnn_actv_funcs(ins);
    auto rnn_op             = any_cast<op::rnn>(ins->get_operator());
    op::rnn_direction dicrt = rnn_op.direction;

    instruction_ref last_output{};
    if(dicrt == op::rnn_direction::bidirectional)
    {
        // input weight matrix
        auto w_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[1]);
        auto w_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[1]);

        // hidden state weight matrix
        auto r_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[2]);
        auto r_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[2]);

        // process bias
        instruction_ref bias_forward = prog.end();
        instruction_ref bias_reverse = prog.end();
        if(args.size() >= 4 && args[3]->name() != "undefined")
        {
            bias_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[3]);
            bias_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[3]);
        }

        // process the initial hidden state; it could be the 6th argument
        // or the 5th one (if the sequence-length argument is ignored)
        instruction_ref ih_forward{};
        instruction_ref ih_reverse{};
        if(args.size() == 6 && args[5]->name() != "undefined")
        {
            ih_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[5]);
            ih_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[5]);
        }
        else
        {
            ih_forward = prog.add_literal(migraphx::literal{ih_shape, data});
            ih_reverse = prog.add_literal(migraphx::literal{ih_shape, data});
        }

        auto ret_forward = vanilla_rnn_cell(
            true, prog, ins, args[0], w_forward, r_forward, bias_forward, ih_forward, actv_funcs.at(0));
        auto ret_reverse = vanilla_rnn_cell(
            false, prog, ins, args[0], w_reverse, r_reverse, bias_reverse, ih_reverse, actv_funcs.at(1));

        auto concat_output =
            prog.insert_instruction(ins, op::concat{1}, ret_forward[1], ret_reverse[1]);
        last_output = prog.insert_instruction(ins, op::squeeze{{0}}, concat_output);

        // The following logic is to ensure that the last instruction rewritten
        // from the rnn operator is a concat instruction
        // sequence length is 1
        if(ret_forward[0] == prog.end())
        {
            prog.replace_instruction(ins, op::concat{1}, ret_forward[1], ret_reverse[1]);
        }
        else
        {
            ret_forward[0] =
                prog.insert_instruction(ins, op::concat{0}, ret_forward[0], ret_forward[1]);
            ret_reverse[0] =
                prog.insert_instruction(ins, op::concat{0}, ret_reverse[1], ret_reverse[0]);
            prog.replace_instruction(ins, op::concat{1}, {ret_forward[0], ret_reverse[0]});
        }
    }
    else
    {
        bool is_forward = (dicrt == op::rnn_direction::forward);
        // input weight matrix
        auto w = args[1];
        // hidden state weight matrix
        auto r = args[2];

        // process bias and initial hidden state
        instruction_ref bias = prog.end();
        if(args.size() >= 4 && args[3]->name() != "undefined")
        {
            bias = args[3];
        }

        // process the initial hidden state
        instruction_ref ih;
        if(args.size() == 6 && args[5]->name() != "undefined")
        {
            ih = args[5];
        }
        else
        {
            ih = prog.add_literal(migraphx::literal{ih_shape, data});
        }

        auto ret    = vanilla_rnn_cell(is_forward, prog, ins, args[0], w, r, bias, ih, actv_funcs.at(0));
        last_output = prog.insert_instruction(ins, op::squeeze{{0}}, ret[1]);

        // the following logic is to ensure that the last instruction is a
        // concat instruction
        // sequence length is 1
        if(ret[0] == prog.end())
        {
            prog.replace_instruction(ins, op::concat{0}, ret[1]);
        }
        else
        {
            auto concat_arg0 = is_forward ? ret[0] : ret[1];
            auto concat_arg1 = is_forward ? ret[1] : ret[0];
            prog.replace_instruction(ins, op::concat{0}, concat_arg0, concat_arg1);
        }
    }

    // search the outputs to find any rnn_last_output operators;
    // a while loop handles the case of multiple rnn_last_output operators
    auto last_output_it = ins->outputs().begin();
    while(last_output_it != ins->outputs().end())
    {
        last_output_it = std::find_if(last_output_it, ins->outputs().end(), [](auto i) {
            return i->name() == "rnn_last_output";
        });

        if(last_output_it != ins->outputs().end())
        {
            prog.replace_instruction(*last_output_it, last_output);
            last_output_it++;
        }
    }
}
std::vector<instruction_ref> rewrite_rnn::vanilla_rnn_cell(bool is_forward,
                                                           program& prog,
                                                           instruction_ref ins,
                                                           instruction_ref input,
                                                           instruction_ref w,
                                                           instruction_ref r,
                                                           instruction_ref bias,
                                                           instruction_ref ih,
                                                           operation& actv_func) const
{
    // squeeze and transpose w
    std::vector<int64_t> perm{1, 0};
    auto sw      = prog.insert_instruction(ins, op::squeeze{{0}}, w);
    auto tran_sw = prog.insert_instruction(ins, op::transpose{perm}, sw);

    // squeeze and transpose r
    auto sr      = prog.insert_instruction(ins, op::squeeze{{0}}, r);
    auto tran_sr = prog.insert_instruction(ins, op::transpose{perm}, sr);

    // initial hidden state
    auto sih = prog.insert_instruction(ins, op::squeeze{{0}}, ih);

    // bias
    if(bias != prog.end())
    {
        long hs    = r->get_shape().lens()[2];
        auto sbias = prog.insert_instruction(ins, op::squeeze{{0}}, bias);
        auto wb    = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sbias);
        auto rb    = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sbias);
        auto b     = prog.insert_instruction(ins, op::add{}, wb, rb);
        bias       = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, b);
    }

    instruction_ref hidden_out = prog.end();
    instruction_ref last_out{};
    last_out = prog.insert_instruction(ins, op::unsqueeze{{0, 1}}, sih);

    std::size_t seq_len = input->get_shape().lens()[0];
    for(std::size_t i = 0; i < seq_len; i++)
    {
        long seq_index = is_forward ? i : (seq_len - 1 - i);
        auto xt = prog.insert_instruction(ins, op::slice{{0}, {seq_index}, {seq_index + 1}}, input);
        xt      = prog.insert_instruction(ins, op::squeeze{{0}}, xt);

        auto xt_wi = prog.insert_instruction(ins, op::dot{}, xt, tran_sw);
        auto ht_ri = prog.insert_instruction(ins, op::dot{}, sih, tran_sr);
        auto xt_ht = prog.insert_instruction(ins, op::add{}, xt_wi, ht_ri);
        instruction_ref ht;
        if(bias != prog.end())
        {
            ht = prog.insert_instruction(ins, op::add{}, xt_ht, bias);
        }
        else
        {
            ht = xt_ht;
        }

        // apply the activation function
        ht  = prog.insert_instruction(ins, actv_func, ht);
        sih = ht;

        // add the dimensions for sequence length (axis 0) and
        // num_directions (axis 1)
        last_out = prog.insert_instruction(ins, op::unsqueeze{{0, 1}}, ht);

        // concatenation for the final last_out is performed in the apply()
        // function to ensure the last instruction is a concat, after which
        // the output is inserted
        if(i < seq_len - 1)
        {
            if(is_forward)
            {
                hidden_out = (seq_index == 0)
                                 ? last_out
                                 : prog.insert_instruction(ins, op::concat{0}, hidden_out, last_out);
            }
            else
            {
                hidden_out = (seq_index == seq_len - 1)
                                 ? last_out
                                 : prog.insert_instruction(ins, op::concat{0}, last_out, hidden_out);
            }
        }
    }

    return {hidden_out, last_out};
}
std::vector<operation> rewrite_rnn::vanilla_rnn_actv_funcs(instruction_ref ins) const
{
    auto rnn_op = any_cast<op::rnn>(ins->get_operator());
    // there could be 3 to 6 inputs, but the parse function will append
    // undefined operators to make 6 arguments when parsing an onnx file.
    // Another case is that a user can pass any number of arguments
    // when writing their own program.
    if(rnn_op.direction == op::rnn_direction::bidirectional)
    {
        if(rnn_op.actv_funcs.empty())
        {
            // default is tanh
            return {op::tanh{}, op::tanh{}};
        }
        else if(rnn_op.actv_funcs.size() == 1)
        {
            return {rnn_op.actv_funcs.at(0), rnn_op.actv_funcs.at(0)};
        }
        else
        {
            return rnn_op.actv_funcs;
        }
    }
    else
    {
        if(rnn_op.actv_funcs.empty())
        {
            // default is tanh
            return {op::tanh{}};
        }
        else
        {
            return rnn_op.actv_funcs;
        }
    }
}
void rewrite_rnn::apply_gru(program& prog, instruction_ref ins) const
{
    assert(ins->name() == "gru");
    const auto actv_funcs = gru_actv_funcs(ins);
    // there could be 3 to 6 inputs, but the parse_gru function will
    // append undefined operators to make 6 arguments when parsing
    // an onnx file. Another case is that a user can pass any number
    // of arguments when writing their own program.
    auto args = ins->inputs();

    shape seq_shape         = args[0]->get_shape();
    std::size_t hidden_size = args[2]->get_shape().lens()[2];
    std::size_t batch_size  = seq_shape.lens()[1];
    shape::type_t type      = seq_shape.type();
    migraphx::shape ih_shape{type, {1, batch_size, hidden_size}};
    std::vector<float> data(ih_shape.elements(), 0.0);

    auto gru_op             = any_cast<op::gru>(ins->get_operator());
    op::rnn_direction dicrt = gru_op.direction;

    instruction_ref last_output{};
    if(dicrt == op::rnn_direction::bidirectional)
    {
        // w weight matrix
        auto w_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[1]);
        auto w_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[1]);

        // r weight matrix
        auto r_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[2]);
        auto r_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[2]);

        // bias
        instruction_ref bias_forward = prog.end();
        instruction_ref bias_reverse = prog.end();
        if(args.size() >= 4 && args[3]->name() != "undefined")
        {
            bias_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[3]);
            bias_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[3]);
        }

        // initial hidden state
        instruction_ref ih_forward{};
        instruction_ref ih_reverse{};
        if(args.size() == 6 && args[5]->name() != "undefined")
        {
            ih_forward = prog.insert_instruction(ins, op::slice{{0}, {0}, {1}}, args[5]);
            ih_reverse = prog.insert_instruction(ins, op::slice{{0}, {1}, {2}}, args[5]);
        }
        else
        {
            ih_forward = prog.add_literal(migraphx::literal{ih_shape, data});
            ih_reverse = prog.add_literal(migraphx::literal{ih_shape, data});
        }

        auto ret_forward = gru_cell(true,
                                    prog,
                                    ins,
                                    {args[0], w_forward, r_forward, bias_forward, ih_forward},
                                    gru_op.linear_before_reset,
                                    actv_funcs.at(0),
                                    actv_funcs.at(1));

        auto ret_reverse = gru_cell(false,
                                    prog,
                                    ins,
                                    {args[0], w_reverse, r_reverse, bias_reverse, ih_reverse},
                                    gru_op.linear_before_reset,
                                    actv_funcs.at(2),
                                    actv_funcs.at(3));

        auto concat_output =
            prog.insert_instruction(ins, op::concat{1}, ret_forward[1], ret_reverse[1]);
        last_output = prog.insert_instruction(ins, op::squeeze{{0}}, concat_output);

        // The following logic is to ensure that the last instruction rewritten
        // from the gru operator is a concat
        if(ret_forward[0] == prog.end())
        {
            prog.replace_instruction(ins, op::concat{1}, ret_forward[1], ret_reverse[1]);
        }
        else
        {
            ret_forward[0] =
                prog.insert_instruction(ins, op::concat{0}, ret_forward[0], ret_forward[1]);
            ret_reverse[0] =
                prog.insert_instruction(ins, op::concat{0}, ret_reverse[1], ret_reverse[0]);
            prog.replace_instruction(ins, op::concat{1}, {ret_forward[0], ret_reverse[0]});
        }
    }
    else
    {
        bool is_forward = (dicrt == op::rnn_direction::forward);
        // weight matrices
        auto w = args[1];
        auto r = args[2];

        // bias
        instruction_ref bias = prog.end();
        if(args.size() >= 4 && args[3]->name() != "undefined")
        {
            bias = args[3];
        }

        // initial hidden state
        instruction_ref ih{};
        if(args.size() == 6 && args[5]->name() != "undefined")
        {
            ih = args[5];
        }
        else
        {
            ih = prog.add_literal(migraphx::literal{ih_shape, data});
        }

        auto ret = gru_cell(is_forward,
                            prog,
                            ins,
                            {args[0], w, r, bias, ih},
                            gru_op.linear_before_reset,
                            actv_funcs.at(0),
                            actv_funcs.at(1));

        last_output = prog.insert_instruction(ins, op::squeeze{{0}}, ret[1]);

        if(ret[0] == prog.end())
        {
            prog.replace_instruction(ins, op::concat{0}, ret[1]);
        }
        else
        {
            auto concat_arg0 = is_forward ? ret[0] : ret[1];
            auto concat_arg1 = is_forward ? ret[1] : ret[0];
            prog.replace_instruction(ins, op::concat{0}, concat_arg0, concat_arg1);
        }
    }

    // replace the corresponding rnn_last_output instruction
    // with last_output, if an rnn_last_output exists;
    // a while loop handles the case of multiple rnn_last_output operators
    auto last_output_it = ins->outputs().begin();
    while(last_output_it != ins->outputs().end())
    {
        last_output_it = std::find_if(last_output_it, ins->outputs().end(), [](auto i) {
            return i->name() == "rnn_last_output";
        });

        if(last_output_it != ins->outputs().end())
        {
            prog.replace_instruction(*last_output_it, last_output);
            last_output_it++;
        }
    }
}
std::vector<instruction_ref> rewrite_rnn::gru_cell(bool is_forward,
                                                   program& prog,
                                                   instruction_ref ins,
                                                   std::vector<instruction_ref> inputs,
                                                   int linear_before_reset,
                                                   const operation& actv_func1,
                                                   const operation& actv_func2) const
{
    assert(inputs.size() == 5);
    auto seq  = inputs.at(0);
    auto w    = inputs.at(1);
    auto r    = inputs.at(2);
    auto bias = inputs.at(3);
    auto ih   = inputs.at(4);

    instruction_ref hidden_states = prog.end();
    instruction_ref last_output{};
    migraphx::shape seq_shape = seq->get_shape();
    migraphx::shape r_shape   = r->get_shape();
    long seq_len              = static_cast<long>(seq_shape.lens()[0]);
    long hs                   = static_cast<long>(r_shape.lens()[2]);

    migraphx::shape s(seq_shape.type(), {seq_shape.lens()[1], r_shape.lens()[2]});
    std::vector<int> data(s.elements(), 1);
    auto l1 = prog.add_literal(migraphx::literal{s, data});

    // weight matrix
    std::vector<int64_t> perm{1, 0};
    auto sw      = prog.insert_instruction(ins, op::squeeze{{0}}, w);
    auto wz      = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sw);
    auto tran_wz = prog.insert_instruction(ins, op::transpose{perm}, wz);

    auto wr      = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sw);
    auto tran_wr = prog.insert_instruction(ins, op::transpose{perm}, wr);

    auto wh      = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sw);
    auto tran_wh = prog.insert_instruction(ins, op::transpose{perm}, wh);

    auto sr      = prog.insert_instruction(ins, op::squeeze{{0}}, r);
    auto rz      = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sr);
    auto tran_rz = prog.insert_instruction(ins, op::transpose{perm}, rz);

    auto rr      = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sr);
    auto tran_rr = prog.insert_instruction(ins, op::transpose{perm}, rr);

    auto rh      = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sr);
    auto tran_rh = prog.insert_instruction(ins, op::transpose{perm}, rh);

    // initial states
    auto sih = prog.insert_instruction(ins, op::squeeze{{0}}, ih);

    // bias
    instruction_ref brcst_bz{};
    instruction_ref brcst_br{};
    instruction_ref brcst_wbh{};
    instruction_ref brcst_rbh{};
    instruction_ref brcst_bh{};
    if(bias != prog.end())
    {
        auto sbias = prog.insert_instruction(ins, op::squeeze{{0}}, bias);
        auto wbz   = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sbias);
        auto wbr   = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sbias);
        auto wbh   = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sbias);
        brcst_wbh  = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, wbh);

        auto rbz  = prog.insert_instruction(ins, op::slice{{0}, {3 * hs}, {4 * hs}}, sbias);
        auto rbr  = prog.insert_instruction(ins, op::slice{{0}, {4 * hs}, {5 * hs}}, sbias);
        auto rbh  = prog.insert_instruction(ins, op::slice{{0}, {5 * hs}, {6 * hs}}, sbias);
        brcst_rbh = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, rbh);

        auto bz  = prog.insert_instruction(ins, op::add{}, wbz, rbz);
        brcst_bz = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, bz);

        auto br  = prog.insert_instruction(ins, op::add{}, wbr, rbr);
        brcst_br = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, br);

        auto bh  = prog.insert_instruction(ins, op::add{}, wbh, rbh);
        brcst_bh = prog.insert_instruction(ins, op::broadcast{1, sih->get_shape()}, bh);
    }

    for(long i = 0; i < seq_len; i++)
    {
        long seq_index = is_forward ? i : (seq_len - 1 - i);
        auto xt = prog.insert_instruction(ins, op::slice{{0}, {seq_index}, {seq_index + 1}}, seq);
        xt      = prog.insert_instruction(ins, op::squeeze{{0}}, xt);

        // equation zt = f(Xt*(Wz^T) + Ht-1*(Rz^T) + Wbz + Rbz)
        auto xt_wz = prog.insert_instruction(ins, op::dot{}, xt, tran_wz);
        auto ht_rz = prog.insert_instruction(ins, op::dot{}, sih, tran_rz);
        auto xht_z = prog.insert_instruction(ins, op::add{}, xt_wz, ht_rz);
        if(bias != prog.end())
        {
            xht_z = prog.insert_instruction(ins, op::add{}, xht_z, brcst_bz);
        }
        auto zt = prog.insert_instruction(ins, actv_func1, xht_z);

        // equation rt = f(Xt*(Wr^T) + Ht-1*(Rr^T) + Wbr + Rbr)
        auto xt_wr = prog.insert_instruction(ins, op::dot{}, xt, tran_wr);
        auto ht_rr = prog.insert_instruction(ins, op::dot{}, sih, tran_rr);
        auto xht_r = prog.insert_instruction(ins, op::add{}, xt_wr, ht_rr);
        if(bias != prog.end())
        {
            xht_r = prog.insert_instruction(ins, op::add{}, xht_r, brcst_br);
        }
        auto rt = prog.insert_instruction(ins, actv_func1, xht_r);

        instruction_ref xht_h;
        if(linear_before_reset == 0)
        {
            // equation ht = g(Xt*(Wh^T) + (rt (.) Ht-1)*(Rh^T) + Rbh + Wbh)
            auto xt_wh  = prog.insert_instruction(ins, op::dot{}, xt, tran_wh);
            auto rt_ht1 = prog.insert_instruction(ins, op::mul{}, rt, sih);
            auto rt_rh  = prog.insert_instruction(ins, op::dot{}, rt_ht1, tran_rh);
            xht_h       = prog.insert_instruction(ins, op::add{}, xt_wh, rt_rh);
            if(bias != prog.end())
            {
                xht_h = prog.insert_instruction(ins, op::add{}, xht_h, brcst_bh);
            }
        }
        else
        {
            // equation ht = g(Xt*(Wh^T) + (rt (.) (Ht-1*(Rh^T) + Rbh)) + Wbh)
            auto xt_wh  = prog.insert_instruction(ins, op::dot{}, xt, tran_wh);
            auto ht1_rh = prog.insert_instruction(ins, op::dot{}, sih, tran_rh);
            if(bias != prog.end())
            {
                ht1_rh = prog.insert_instruction(ins, op::add{}, ht1_rh, brcst_rbh);
            }
            auto rt_rh = prog.insert_instruction(ins, op::mul{}, rt, ht1_rh);
            xht_h      = prog.insert_instruction(ins, op::add{}, xt_wh, rt_rh);
            if(bias != prog.end())
            {
                xht_h = prog.insert_instruction(ins, op::add{}, xht_h, brcst_wbh);
            }
        }
        auto ht = prog.insert_instruction(ins, actv_func2, xht_h);

        // equation Ht = (1 - zt) (.) ht + zt (.) Ht-1
        auto one_minus_zt    = prog.insert_instruction(ins, op::sub{}, l1, zt);
        auto one_minus_zt_ht = prog.insert_instruction(ins, op::mul{}, one_minus_zt, ht);
        auto zt_ht1          = prog.insert_instruction(ins, op::mul{}, zt, sih);
        sih                  = prog.insert_instruction(ins, op::add{}, one_minus_zt_ht, zt_ht1);

        last_output = prog.insert_instruction(ins, op::unsqueeze{{0, 1}}, sih);
        if(i < seq_len - 1)
        {
            if(is_forward)
            {
                hidden_states =
                    (seq_index == 0)
                        ? last_output
                        : prog.insert_instruction(ins, op::concat{0}, hidden_states, last_output);
            }
            else
            {
                hidden_states =
                    (seq_index == seq_len - 1)
                        ? last_output
                        : prog.insert_instruction(ins, op::concat{0}, last_output, hidden_states);
            }
        }
    }

    return {hidden_states, last_output};
}
std::vector<operation> rewrite_rnn::gru_actv_funcs(instruction_ref ins) const
{
    auto gru_op = any_cast<op::gru>(ins->get_operator());
    // before rewriting the gru operator, we need to ensure there are
    // 4 actv funcs, even if a user does not specify any. If fewer than
    // 4 are given, use the algorithm from parse_gru to expand them to 4
    if(gru_op.direction == op::rnn_direction::bidirectional)
    {
        if(gru_op.actv_funcs.empty())
            return {op::sigmoid{}, op::tanh{}, op::sigmoid{}, op::tanh{}};
        else if(gru_op.actv_funcs.size() == 1)
            return {gru_op.actv_funcs.at(0),
                    gru_op.actv_funcs.at(0),
                    gru_op.actv_funcs.at(0),
                    gru_op.actv_funcs.at(0)};
        else if(gru_op.actv_funcs.size() == 2)
            return {gru_op.actv_funcs.at(0),
                    gru_op.actv_funcs.at(1),
                    gru_op.actv_funcs.at(0),
                    gru_op.actv_funcs.at(1)};
        else if(gru_op.actv_funcs.size() == 3)
            return {gru_op.actv_funcs.at(0),
                    gru_op.actv_funcs.at(1),
                    gru_op.actv_funcs.at(2),
                    gru_op.actv_funcs.at(0)};
        else
            return gru_op.actv_funcs;
    }
    else
    {
        if(gru_op.actv_funcs.empty())
            return {op::sigmoid{}, op::tanh{}};
        else if(gru_op.actv_funcs.size() == 1)
            return {gru_op.actv_funcs.at(0), gru_op.actv_funcs.at(0)};
        else
            return gru_op.actv_funcs;
    }
}

} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
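For reference, the per-timestep equations gru_cell builds, collected here from the in-code comments (f is actv_func1, g is actv_func2, and \odot denotes elementwise multiplication):

z_t = f(X_t W_z^T + H_{t-1} R_z^T + W_{bz} + R_{bz})
r_t = f(X_t W_r^T + H_{t-1} R_r^T + W_{br} + R_{br})
h_t = g(X_t W_h^T + (r_t \odot H_{t-1}) R_h^T + W_{bh} + R_{bh})    (linear_before_reset = 0)
h_t = g(X_t W_h^T + r_t \odot (H_{t-1} R_h^T + R_{bh}) + W_{bh})    (otherwise)
H_t = (1 - z_t) \odot h_t + z_t \odot H_{t-1}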
src/simplify_reshapes.cpp

@@ -9,65 +9,89 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

-// Reshapers that can't handle nonstandard input shapes
-bool is_nonstandard_reshaper(instruction_ref ins)
-{
-    // clang-format off
-    static const std::unordered_set<std::string> names = {
-        "reshape"
-    };
-    // clang-format on
-    return contains(names, ins->name()) and
-           ins->inputs().front()->name() == "contiguous";
-}
-
 bool is_reshaper(instruction_ref ins)
 {
     // clang-format off
     static const std::unordered_set<std::string> names = {
         "reshape",
         "transpose",
         // "broadcast",
         "contiguous"
     };
     // clang-format on
-    return contains(names, ins->name()) and not is_nonstandard_reshaper(ins);
+    return contains(names, ins->name());
 }

+bool is_transpose_output(instruction_ref ins)
+{
+    if(ins->outputs().size() != 1)
+        return false;
+    if(ins->outputs().front()->name() == "contiguous")
+        return is_transpose_output(ins->outputs().front());
+    return ins->outputs().front()->name() == "transpose";
+}
+
+instruction_ref find_transpose_input(instruction_ref ins)
+{
+    if(ins->inputs().size() != 1)
+        return ins;
+    if(ins->inputs().front()->name() == "contiguous")
+        return find_transpose_input(ins->inputs().front());
+    if(ins->inputs().front()->name() == "transpose")
+        return ins->inputs().front();
+    return ins;
+}
+
 void simplify_reshapes::apply(program& p) const
 {
+    auto end = std::prev(p.end());
     for(auto ins : iterator_for(p))
     {
-        if(not is_reshaper(ins))
-            continue;
-        if(ins->outputs().size() != 1)
+        if(ins->outputs().empty() and ins != end)
             continue;
-        if(is_reshaper(ins->outputs().front()))
-            continue;
-        // Gather reshapes
-        std::vector<instruction_ref> reshapes{ins};
-        while(is_reshaper(reshapes.back()))
+        if(is_reshaper(ins))
         {
-            assert(!reshapes.back()->inputs().empty());
-            assert(p.has_instruction(reshapes.back()->inputs().front()));
-            auto input = reshapes.back()->inputs().front();
-            reshapes.push_back(input);
-        }
+            if(std::any_of(ins->outputs().begin(), ins->outputs().end(), &is_reshaper))
+                continue;
+            // Gather reshapes
+            std::vector<instruction_ref> reshapes{ins};
+            while(is_reshaper(reshapes.back()))
+            {
+                assert(!reshapes.back()->inputs().empty());
+                assert(p.has_instruction(reshapes.back()->inputs().front()));
+                auto input = reshapes.back()->inputs().front();
+                reshapes.push_back(input);
+            }

-        std::pair<instruction_ref, instruction_ref> r{p.end(), p.end()};
-        for(auto start : iterator_for(reshapes))
-        {
-            auto last = std::find_if(reshapes.rbegin(), reshapes.rend(), [&](auto&& i) {
-                return i->get_shape() == (*start)->get_shape() and i != (*start);
-            });
-            if(last != reshapes.rend())
+            std::pair<instruction_ref, instruction_ref> r{p.end(), p.end()};
+            for(auto start : iterator_for(reshapes))
             {
-                r = std::make_pair(*start, *last);
-                break;
+                auto last = std::find_if(reshapes.rbegin(), reshapes.rend(), [&](auto&& i) {
+                    return i->get_shape() == (*start)->get_shape() and i != (*start);
+                });
+                if(last != reshapes.rend())
+                {
+                    r = std::make_pair(*start, *last);
+                    break;
+                }
+            }
+            if(r.first != r.second)
+            {
+                p.replace_instruction(r.first, r.second);
             }
         }
-        if(r.first != r.second)
+        else if(ins->name() == "transpose")
         {
-            p.replace_instruction(r.first, r.second);
+            if(is_transpose_output(ins))
+                continue;
+
+            auto x = ins;
+            auto t = ins;
+            do
+            {
+                x = t;
+                t = find_transpose_input(x);
+            } while(x != t and t->name() == "transpose");
+
+            if(t == ins or t->name() != "transpose")
+                continue;
+            p.replace_instruction(ins, t->inputs().front());
+        }
     }

     // Replace all reshapes with as_shape
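To illustrate the reshaper branch above: it looks for two instructions in a reshape/transpose/contiguous chain that already produce the same shape and rewires the later one to the earlier one. A rough sketch (assumed setup, not from this commit):

#include <migraphx/program.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/simplify_reshapes.hpp>

void example()
{
    migraphx::program p;
    auto x = p.add_parameter("x", {migraphx::shape::float_type, {2, 3, 4}});
    // reshape to {6, 4} and back to {2, 3, 4}: the chain ends with the same
    // shape as x, so the pass replaces the final reshape with x itself
    auto r1 = p.add_instruction(migraphx::op::reshape{{6, 4}}, x);
    p.add_instruction(migraphx::op::reshape{{2, 3, 4}}, r1);
    migraphx::simplify_reshapes{}.apply(p);
}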
src/targets/cpu/lowering.cpp

@@ -103,6 +103,43 @@ struct cpu_batch_norm_inference
     }
 };

+struct cpu_lrn
+{
+    op::lrn op;
+    std::string name() const { return "cpu::lrn"; }
+    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
+    argument compute(context&, shape output_shape, std::vector<argument> args) const
+    {
+        argument result{output_shape};
+        visit_all(result, args[0])([&](auto output, auto input) {
+            int n_batch         = output_shape.lens()[0];
+            int channels        = output_shape.lens()[1];
+            int height          = output_shape.lens()[2];
+            int width           = output_shape.lens()[3];
+            float alphaoverarea = op.alpha / op.size;
+            int radius          = (op.size - 1) / 2;
+
+            par_dfor(n_batch, height, width)([&](int b, int h, int w) {
+                float scale = 0;
+                dfor(channels)([&](int c) {
+                    auto start = (c - radius) < 0 ? 0 : (c - radius);
+                    auto end   = (c + radius) > channels ? channels : (c + radius);
+                    for(auto k = start; k < end; ++k)
+                    {
+                        scale += std::pow(input(b, k, h, w), 2);
+                    }
+                    scale *= alphaoverarea;
+                    scale += op.bias;
+                    scale = std::pow(scale, -op.beta);
+                    output(b, c, h, w) = input(b, c, h, w) * scale;
+                });
+            });
+        });
+        return result;
+    }
+};
+
 struct cpu_convolution
 {
     op::convolution op;
@@ -287,14 +324,7 @@ struct cpu_contiguous
     shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
     argument compute(context&, const shape& output_shape, std::vector<argument> args) const
     {
-        assert(output_shape.standard());
-        argument result{output_shape};
-        visit_all(result, args[0])([&](auto output, auto input) {
-            shape_for_each(output.get_shape(), [&](const auto& idx) {
-                output(idx.begin(), idx.end()) = input(idx.begin(), idx.end());
-            });
-        });
-        return result;
+        return op.compute(output_shape, std::move(args));
     }
 };
@@ -688,6 +718,7 @@ struct cpu_apply
         apply_map["dot"] = extend_op<cpu_gemm, op::dot>();
         apply_map["batch_norm_inference"] =
            extend_op<cpu_batch_norm_inference, op::batch_norm_inference>();
+        apply_map["lrn"]        = extend_op<cpu_lrn, op::lrn>();
         apply_map["contiguous"] = extend_op<cpu_contiguous, op::contiguous>();
         apply_map["pad"]        = extend_op<cpu_pad, op::pad>();
         apply_map["concat"]     = extend_op<cpu_concat, op::concat>();
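For reference, the window computation in cpu_lrn corresponds to the usual local response normalization formula (sketched from the code above; the sum runs over the channel window [max(0, c - radius), min(channels, c + radius)) with radius = (size - 1)/2):

y_{b,c,h,w} = x_{b,c,h,w} \cdot \left( \mathit{bias} + \frac{\alpha}{\mathit{size}} \sum_{k \in \mathrm{window}(c)} x_{b,k,h,w}^2 \right)^{-\beta}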
src/targets/cpu/target.cpp

@@ -2,6 +2,8 @@
 #include <migraphx/cpu/target.hpp>
 #include <migraphx/cpu/lowering.hpp>
 #include <migraphx/auto_contiguous.hpp>
+#include <migraphx/rewrite_rnn.hpp>
+#include <migraphx/dead_code_elimination.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

@@ -11,7 +13,11 @@ std::string target::name() const { return "cpu"; }
 std::vector<pass> target::get_passes(migraphx::context&) const
 {
-    return {auto_contiguous{}, lowering{}};
+    return {auto_contiguous{},
+            rewrite_rnn{},
+            dead_code_elimination{},
+            lowering{},
+            dead_code_elimination{}};
 }

 } // namespace cpu
src/targets/gpu/CMakeLists.txt

@@ -30,6 +30,7 @@ add_library(migraphx_device
     device/concat.cpp
     device/pad.cpp
     device/gather.cpp
+    device/sub.cpp
 )
 set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
 rocm_clang_tidy_check(migraphx_device)

@@ -60,6 +61,7 @@ add_library(migraphx_gpu
     elu.cpp
     pad.cpp
     gather.cpp
+    lrn.cpp
 )
 set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
 rocm_clang_tidy_check(migraphx_gpu)
src/targets/gpu/device/sub.cpp (new file)

#include <migraphx/gpu/device/sub.hpp>
#include <migraphx/gpu/device/nary.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

void sub(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2)
{
    nary(stream, result, arg1, arg2)([](auto x, auto y) { return y - x; });
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
src/targets/gpu/gemm.cpp

@@ -107,6 +107,7 @@ argument miopen_gemm::compute(context& ctx,
                      ldc);
     });
+    return args[2];
 }
src/targets/gpu/include/migraphx/gpu/device/sub.hpp (new file)

#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_SUB_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_SUB_HPP

#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

void sub(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2);

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif
src/targets/gpu/include/migraphx/gpu/lrn.hpp (new file)

#ifndef MIGRAPHX_GUARD_RTGLIB_LRN_HPP
#define MIGRAPHX_GUARD_RTGLIB_LRN_HPP

#include <migraphx/gpu/lowering.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/config.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <migraphx/gpu/hip.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/gpu/device/contiguous.hpp>
#include <migraphx/gpu/device/add.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/gpu/rocblas.hpp>
#include <migraphx/gpu/context.hpp>
#include <utility>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

struct miopen_lrn
{
    shared<lrn_descriptor> ldesc;
    std::string name() const { return "gpu::lrn"; }
    shape compute_shape(const std::vector<shape>& inputs) const;
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
};

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx

#endif