Commit 20b1d690 authored by Paul

Merge branch 'develop' into tests

parents 17aaaa1e ba729cfc
......@@ -11,6 +11,8 @@
#include <migraphx/context.hpp>
#include <migraphx/pass.hpp>
#include <migraphx/config.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/rank.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......@@ -34,10 +36,86 @@ struct target
* @return The context to be used during compilation and execution.
*/
context get_context() const;
/**
* @brief Copy an argument to the current target.
*
* @param arg Input argument to be copied to the target
* @return Argument in the target.
*/
argument copy_to(const argument& arg) const;
/**
* @brief Copy an argument from the current target.
*
* @param arg Input argument to be copied from the target
* @return Argument on the host.
*/
argument copy_from(const argument& arg) const;
/**
* @brief Allocate an argument based on the input shape
*
* @param s Shape of the argument to be allocated in the target
* @return Allocated argument in the target.
*/
argument allocate(const shape& s) const;
};
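The three new members give a target a uniform way to move data in and out of its memory space. A minimal usage sketch follows (hypothetical calling code, not part of this change; it assumes a constructed target `t` and an existing host-side argument `host_arg`):

    migraphx::shape s{migraphx::shape::float_type, {2, 3}};
    migraphx::argument buf     = t.allocate(s);        // uninitialized buffer owned by the target
    migraphx::argument on_dev  = t.copy_to(host_arg);  // host -> target
    migraphx::argument on_host = t.copy_from(on_dev);  // target -> host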
#else
template <class T>
auto target_allocate(rank<1>, T& x, const shape& s) -> decltype(x.allocate(s))
{
return x.allocate(s);
}
template <class T>
argument target_allocate(rank<0>, T& x, const shape&)
{
std::string name = x.name();
MIGRAPHX_THROW("Not computable: " + name);
}
template <class T>
argument target_allocate(T& x, const shape& s)
{
return target_allocate(rank<1>{}, x, s);
}
template <class T>
auto copy_to_target(rank<1>, T& x, const argument& arg) -> decltype(x.copy_to(arg))
{
return x.copy_to(arg);
}
template <class T>
argument copy_to_target(rank<0>, T&, const argument& arg)
{
return arg;
}
template <class T>
argument copy_to_target(T& x, const argument& arg)
{
return copy_to_target(rank<1>{}, x, arg);
}
template <class T>
auto copy_from_target(rank<1>, T& x, const argument& arg) -> decltype(x.copy_from(arg))
{
return x.copy_from(arg);
}
template <class T>
argument copy_from_target(rank<0>, T&, const argument& arg)
{
return arg;
}
template <class T>
argument copy_from_target(T& x, const argument& arg)
{
return copy_from_target(rank<1>{}, x, arg);
}
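These helpers use rank-based tag dispatch: the rank<1> overload is preferred and only participates in overload resolution when the wrapped target type actually provides the member (via the trailing decltype), otherwise the rank<0> fallback is chosen. That is why copy_to/copy_from default to returning the argument unchanged, while a missing allocate produces a clear error. A standalone sketch of the same idiom, with made-up types (illustrative only, not MIGraphX code):

#include <iostream>
#include <string>

template <int N> struct rank : rank<N - 1> {};
template <>      struct rank<0> {};

struct with_copy    { std::string copy_to(const std::string& s) const { return s + " (on device)"; } };
struct without_copy {};

// preferred overload: only viable when T has a copy_to member
template <class T>
auto do_copy(rank<1>, const T& t, const std::string& s) -> decltype(t.copy_to(s))
{
    return t.copy_to(s);
}

// fallback: chosen when the rank<1> overload fails to substitute
template <class T>
std::string do_copy(rank<0>, const T&, const std::string& s)
{
    return s;
}

template <class T>
std::string do_copy(const T& t, const std::string& s)
{
    return do_copy(rank<1>{}, t, s);
}

int main()
{
    std::cout << do_copy(with_copy{}, "x") << "\n";    // prints "x (on device)"
    std::cout << do_copy(without_copy{}, "x") << "\n"; // prints "x" (pass-through fallback)
}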
/*
* Type-erased interface for:
*
......@@ -46,6 +124,9 @@ struct target
* std::string name() const;
* std::vector<pass> get_passes(context& ctx) const;
* context get_context() const;
* argument copy_to(const argument& input) const;
* argument copy_from(const argument& input) const;
* argument allocate(const shape& s) const;
* };
*
*/
......@@ -125,6 +206,24 @@ struct target
return (*this).private_detail_te_get_handle().get_context();
}
argument copy_to(const argument& input) const
{
assert((*this).private_detail_te_handle_mem_var);
return (*this).private_detail_te_get_handle().copy_to(input);
}
argument copy_from(const argument& input) const
{
assert((*this).private_detail_te_handle_mem_var);
return (*this).private_detail_te_get_handle().copy_from(input);
}
argument allocate(const shape& s) const
{
assert((*this).private_detail_te_handle_mem_var);
return (*this).private_detail_te_get_handle().allocate(s);
}
friend bool is_shared(const target& private_detail_x, const target& private_detail_y)
{
return private_detail_x.private_detail_te_handle_mem_var ==
......@@ -141,6 +240,9 @@ struct target
virtual std::string name() const = 0;
virtual std::vector<pass> get_passes(context& ctx) const = 0;
virtual context get_context() const = 0;
virtual argument copy_to(const argument& input) const = 0;
virtual argument copy_from(const argument& input) const = 0;
virtual argument allocate(const shape& s) const = 0;
};
template <typename PrivateDetailTypeErasedT>
......@@ -181,6 +283,24 @@ struct target
context get_context() const override { return private_detail_te_value.get_context(); }
argument copy_to(const argument& input) const override
{
return copy_to_target(private_detail_te_value, input);
}
argument copy_from(const argument& input) const override
{
return copy_from_target(private_detail_te_value, input);
}
argument allocate(const shape& s) const override
{
return target_allocate(private_detail_te_value, s);
}
PrivateDetailTypeErasedT private_detail_te_value;
};
......
......@@ -132,7 +132,11 @@ struct tensor_view
return m_data + this->size();
}
template <class U = T>
std::vector<U> to_vector() const
{
return std::vector<U>(this->begin(), this->end());
}
friend std::ostream& operator<<(std::ostream& os, const tensor_view<T>& x)
{
......
......@@ -168,6 +168,7 @@ bool verify_range(R1&& r1, R2&& r2, double tolerance = 80, double* out_error = n
{
double threshold = std::numeric_limits<range_value<R1>>::epsilon() * tolerance;
auto error = rms_range(r1, r2);
// cppcheck-suppress uninitvar
if(out_error != nullptr)
*out_error = error;
return error <= threshold;
......
......@@ -9,6 +9,7 @@ set_target_properties(onnx-proto PROPERTIES POSITION_INDEPENDENT_CODE On)
add_library(migraphx_onnx onnx.cpp)
set_target_properties(migraphx_onnx PROPERTIES EXPORT_NAME onnx)
rocm_set_soversion(migraphx_onnx ${PROJECT_VERSION})
rocm_clang_tidy_check(migraphx_onnx)
target_link_libraries(migraphx_onnx PRIVATE onnx-proto)
target_link_libraries(migraphx_onnx PUBLIC migraphx)
......@@ -19,7 +20,7 @@ rocm_install_targets(
add_executable(read_onnx read_onnx.cpp)
rocm_clang_tidy_check(read_onnx)
target_link_libraries(read_onnx migraphx_cpu migraphx_onnx)
if(MIGRAPHX_ENABLE_GPU)
......
......@@ -40,6 +40,7 @@ struct onnx_parser
add_generic_op("Sigmoid", op::sigmoid{});
add_generic_op("Abs", op::abs{});
add_generic_op("Exp", op::exp{});
add_generic_op("Erf", op::erf{});
add_generic_op("Log", op::log{});
// disable dropout for inference
add_generic_op("Dropout", op::identity{});
......@@ -53,21 +54,29 @@ struct onnx_parser
add_generic_op("Asin", op::asin{});
add_generic_op("Acos", op::acos{});
add_generic_op("Atan", op::atan{});
add_generic_op("Sqrt", op::sqrt{});
add_generic_op("Round", op::round{});
add_generic_op("Sign", op::sign{});
add_binary_op("Add", op::add{});
add_binary_op("Div", op::div{});
add_binary_op("Mul", op::mul{});
add_binary_op("Sub", op::sub{});
add_binary_op("Pow", op::pow{});
add_variadic_op("Sum", op::add{});
add_variadic_op("Max", op::max{});
add_variadic_op("Min", op::min{});
add_mem_op("ArgMax", &onnx_parser::parse_arg_op<op::argmax>);
add_mem_op("ArgMin", &onnx_parser::parse_arg_op<op::argmin>);
add_mem_op("Cast", &onnx_parser::parse_cast);
add_mem_op("Clip", &onnx_parser::parse_clip);
add_mem_op("LRN", &onnx_parser::parse_lrn);
add_mem_op("ImageScaler", &onnx_parser::parse_imagescaler);
add_mem_op("LeakyRelu", &onnx_parser::parse_leaky_relu);
add_mem_op("Elu", &onnx_parser::parse_elu);
add_mem_op("Expand", &onnx_parser::parse_expand);
add_mem_op("Constant", &onnx_parser::parse_constant);
add_mem_op("Conv", &onnx_parser::parse_conv);
add_mem_op("MaxPool", &onnx_parser::parse_pooling);
......@@ -79,8 +88,8 @@ struct onnx_parser
add_mem_op("Gemm", &onnx_parser::parse_gemm);
add_mem_op("MatMul", &onnx_parser::parse_matmul);
add_mem_op("BatchNormalization", &onnx_parser::parse_batchnorm);
add_mem_op("Softmax", &onnx_parser::parse_softmax);
add_mem_op("LogSoftmax", &onnx_parser::parse_logsoftmax);
add_mem_op("Softmax", &onnx_parser::parse_softmax<op::softmax>);
add_mem_op("LogSoftmax", &onnx_parser::parse_softmax<op::logsoftmax>);
add_mem_op("Squeeze", &onnx_parser::parse_squeeze);
add_mem_op("Unsqueeze", &onnx_parser::parse_unsqueeze);
add_mem_op("Slice", &onnx_parser::parse_slice);
......@@ -88,11 +97,14 @@ struct onnx_parser
add_mem_op("Gather", &onnx_parser::parse_gather);
add_mem_op("Shape", &onnx_parser::parse_shape);
add_mem_op("ConstantFill", &onnx_parser::parse_constant_fill);
add_mem_op("ConstantOfShape", &onnx_parser::parse_constant_of_shape);
add_mem_op("Transpose", &onnx_parser::parse_transpose);
add_mem_op("RNN", &onnx_parser::parse_rnn);
add_mem_op("GRU", &onnx_parser::parse_gru);
add_mem_op("LSTM", &onnx_parser::parse_lstm);
add_mem_op("Pad", &onnx_parser::parse_pad);
add_mem_op("ReduceSum", &onnx_parser::parse_reduce_oper<op::reduce_sum>);
add_mem_op("ReduceMean", &onnx_parser::parse_reduce_oper<op::reduce_mean>);
// init the activation function map
init_actv_func();
......@@ -100,6 +112,7 @@ struct onnx_parser
void init_actv_func()
{
// Support names that are all lower case or have the first letter capitalized
map_actv_funcs.insert(std::make_pair("tanh", op::tanh{}));
map_actv_funcs.insert(std::make_pair("relu", op::relu{}));
map_actv_funcs.insert(std::make_pair("sigmoid", op::sigmoid{}));
......@@ -181,11 +194,29 @@ struct onnx_parser
s0.end(),
s1.begin() + offset,
out_lens.begin() + offset,
[&](auto a, auto b) {
if(a != b and a != 1 and b != 1)
{
MIGRAPHX_THROW("COMPUTE_BROADCASTLEN: shape {" +
to_string_range(s0) + "} and {" +
to_string_range(s1) + "} mismatch!");
}
return std::max(a, b);
});
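// Example (assuming s0 is the shorter shape, offset is the rank difference,
// and out_lens starts as a copy of s1): s0 = {4, 1}, s1 = {2, 4, 5} gives
// out_lens = {2, 4, 5}; s0 = {3, 5}, s1 = {2, 4, 5} throws, since 3 vs 4 is a
// mismatch and neither dimension is 1.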
return out_lens;
}
instruction_ref make_contiguous(instruction_ref ins)
{
if(ins->get_shape().standard())
{
return ins;
}
return prog.add_instruction(op::contiguous{}, ins);
}
template <class T>
instruction_ref add_broadcastable_binary_op(instruction_ref arg0, instruction_ref arg1, T x)
{
......@@ -242,27 +273,46 @@ struct onnx_parser
return prog.add_instruction(op, std::move(args));
}
template <class Op>
instruction_ref parse_softmax(const std::string&,
const attribute_map& attributes,
std::vector<instruction_ref> args)
{
int axis = 1;
if(contains(attributes, "axis"))
{
axis = parse_value(attributes.at("axis")).at<int>();
}
return prog.add_instruction(Op{axis}, std::move(args));
}
template <class Op>
instruction_ref parse_arg_op(const std::string&,
const attribute_map& attributes,
std::vector<instruction_ref> args)
{
int64_t axis = 0;
if(contains(attributes, "axis"))
{
axis = static_cast<int64_t>(parse_value(attributes.at("axis")).at<int>());
}
int keep_dims = 1;
if(contains(attributes, "keepdims"))
{
keep_dims = parse_value(attributes.at("keepdims")).at<int>();
}
if(keep_dims == 0)
{
auto ins = prog.add_instruction(Op{axis}, std::move(args));
return prog.add_instruction(op::squeeze{{axis}}, ins);
}
else
{
return prog.add_instruction(Op{axis}, std::move(args));
}
}
instruction_ref
......@@ -274,7 +324,11 @@ struct onnx_parser
{
if(contains(attributes, "auto_pad"))
{
MIGRAPHX_THROW("auto_pad and padding cannot be specified simultaneously");
auto s = attributes["auto_pad"].s();
if(contains(attributes, "pads") and to_upper(s) != "NOTSET")
{
MIGRAPHX_THROW("auto_pad and padding cannot be specified simultaneously");
}
}
std::vector<std::int64_t> padding;
copy(attributes["pads"].ints(), std::back_inserter(padding));
......@@ -322,7 +376,7 @@ struct onnx_parser
if(args.size() == 3)
{
uint64_t axis = 1;
auto l1 = prog.add_instruction(op, l0, args[1]);
auto l2 = prog.add_instruction(op::broadcast{axis, l1->get_shape().lens()}, args[2]);
return prog.add_instruction(op::add{}, l1, l2);
}
......@@ -352,7 +406,8 @@ struct onnx_parser
{
// insert zeros for pad op (args[0] has 4 dims)
padding = {0, 0, padding[0], padding[1], 0, 0, padding[2], padding[3]};
l0 = prog.add_instruction(op::pad{padding, std::numeric_limits<float>::lowest()},
l0);
}
else
{
......@@ -393,11 +448,11 @@ struct onnx_parser
if(args.size() == 2)
{
auto s = args[1]->eval();
check_arg_empty(s, "Reshape: dynamic shape is not supported");
s.visit([&](auto v) { copy(v, std::back_inserter(op.dims)); });
}
return prog.add_instruction(op, make_contiguous(args[0]));
}
instruction_ref
......@@ -445,23 +500,33 @@ struct onnx_parser
{
axis = parse_value(attributes.at("axis")).at<int>();
}
op::gather op{axis};
return prog.add_instruction(op, make_contiguous(args[0]), make_contiguous(args[1]));
}
instruction_ref
parse_slice(const std::string&, attribute_map attributes, std::vector<instruction_ref> args)
{
op::slice op;
std::vector<size_t> dims = args[0]->get_shape().lens();
size_t num_dims = dims.size();
if(contains(attributes, "axes"))
{
literal s = parse_value(attributes.at("axes"));
s.visit([&](auto v) { copy(v, std::back_inserter(op.axes)); });
}
else
{
op.axes = std::vector<int64_t>(num_dims);
std::iota(op.axes.begin(), op.axes.end(), 0);
}
if(contains(attributes, "ends"))
{
literal s = parse_value(attributes.at("ends"));
s.visit([&](auto v) { copy(v, std::back_inserter(op.ends)); });
op.ends = get_indices(attributes.at("ends"));
}
if(contains(attributes, "starts"))
{
literal s = parse_value(attributes.at("starts"));
s.visit([&](auto v) { copy(v, std::back_inserter(op.starts)); });
......@@ -473,7 +538,13 @@ struct onnx_parser
attribute_map attributes,
const std::vector<instruction_ref>&)
{
literal v = parse_value(attributes.at("value"));
// return empty literal
if(v.get_shape().elements() == 0)
{
return prog.add_literal(literal{});
}
auto dim_size = attributes.at("value").t().dims_size();
// if dim_size is 0, it is a scalar
if(dim_size == 0)
......@@ -604,7 +675,6 @@ struct onnx_parser
float epsilon = 1e-5f;
float momentum = 0.9f;
op::batch_norm_inference::bn_infer_mode_t bn_mode = op::batch_norm_inference::spatial;
bool is_test = false;
if(contains(attributes, "epsilon"))
{
epsilon = parse_value(attributes.at("epsilon")).at<float>();
......@@ -613,17 +683,12 @@ struct onnx_parser
{
momentum = parse_value(attributes.at("momentum")).at<float>();
}
if(contains(attributes, "is_test"))
{
is_test = parse_value(attributes.at("is_test")).at<uint64_t>() > 0;
}
if(contains(attributes, "spatial"))
{
bn_mode = (parse_value(attributes.at("spatial")).at<uint64_t>() > 0)
? op::batch_norm_inference::spatial
: op::batch_norm_inference::per_activation;
}
(void)is_test;
op::batch_norm_inference op{epsilon, momentum, bn_mode};
return prog.add_instruction(op, std::move(args));
}
......@@ -770,7 +835,7 @@ struct onnx_parser
{
dtype = parse_value(attributes.at("dtype")).at<int>();
}
shape::type_t type = get_type(dtype);
if(contains(attributes, "input_as_shape"))
{
......@@ -801,10 +866,7 @@ struct onnx_parser
}
migraphx::argument in = args[0]->eval();
check_arg_empty(in, "ConstantFill: dynamic shape is not supported");
std::vector<std::size_t> dims;
in.visit([&](auto input) { dims.assign(input.begin(), input.end()); });
......@@ -832,6 +894,73 @@ struct onnx_parser
}
}
instruction_ref parse_constant_of_shape(const std::string&,
attribute_map attributes,
std::vector<instruction_ref> args)
{
literal l_val{};
if(contains(attributes, "value"))
{
l_val = parse_value(attributes.at("value"));
if(l_val.get_shape().elements() != 1)
{
MIGRAPHX_THROW("ConstantOfShape: attribute value can contain only 1 elements!");
}
}
else
{
l_val = literal({shape::float_type, {1}, {0}}, {0.0f});
}
// input is empty, output is a scalar
auto type = l_val.get_shape().type();
if(args.empty())
{
MIGRAPHX_THROW("ConstantOfShape : must have 1 input!");
}
else
{
migraphx::shape s;
// empty input tensor, output is a scalar
if(args[0]->get_shape().elements() == 0)
{
s = migraphx::shape{type, {1}, {0}};
}
else
{
migraphx::argument in = args[0]->eval();
check_arg_empty(in, "ConstantOfShape: dynamic shape is not supported");
std::vector<std::size_t> dims;
in.visit([&](auto input) { dims.assign(input.begin(), input.end()); });
s = migraphx::shape{type, dims};
}
literal l_out{};
l_val.visit([&](auto val) {
using val_type = std::remove_cv_t<typename decltype(val)::value_type>;
// l_val contains only one element
std::vector<val_type> out_vec(s.elements(), *val.begin());
l_out = literal(s, out_vec);
});
return prog.add_literal(l_out);
}
}
instruction_ref
parse_expand(const std::string&, const attribute_map&, std::vector<instruction_ref> args)
{
auto in_lens = args[0]->get_shape().lens();
migraphx::argument arg_s = args[1]->eval();
check_arg_empty(arg_s, "Expand: dynamic shape is not supported");
std::vector<std::size_t> dims;
arg_s.visit([&](auto input) { dims.assign(input.begin(), input.end()); });
auto out_lens = compute_broadcasted_lens(in_lens, dims);
return prog.add_instruction(op::multibroadcast{out_lens}, args[0]);
}
std::vector<instruction_ref>
parse_rnn(const std::string&, attribute_map attributes, std::vector<instruction_ref> args)
{
......@@ -870,7 +999,9 @@ struct onnx_parser
auto names = attributes.at("activations").strings();
vec_names.clear();
vec_names.resize(names.size());
std::transform(names.begin(), names.end(), vec_names.begin(), [](auto name) {
return to_lower(name);
});
}
auto name_it = std::find_if(vec_names.begin(), vec_names.end(), [&](auto& name) {
......@@ -894,9 +1025,10 @@ struct onnx_parser
}
std::vector<operation> vec_actv_funcs(vec_names.size());
std::transform(vec_names.begin(),
vec_names.end(),
vec_actv_funcs.begin(),
[&](const auto& fn) { return map_actv_funcs[fn]; });
// To be added later
float clip = 0.0;
......@@ -961,7 +1093,9 @@ struct onnx_parser
auto names = attributes.at("activations").strings();
vec_names.clear();
vec_names.resize(names.size());
std::transform(names.begin(), names.end(), vec_names.begin(), [](auto name) {
return to_lower(name);
});
}
// need 4 activation functions
......@@ -1008,9 +1142,10 @@ struct onnx_parser
}
std::vector<operation> vec_actv_funcs(vec_names.size());
std::transform(vec_names.begin(),
vec_names.end(),
vec_actv_funcs.begin(),
[&](const auto& name) { return map_actv_funcs[name]; });
float clip = 0.0;
if(contains(attributes, "clip"))
......@@ -1088,7 +1223,9 @@ struct onnx_parser
auto names = attributes.at("activations").strings();
vec_names.clear();
vec_names.resize(names.size());
std::transform(names.begin(), names.end(), vec_names.begin(), [](auto name) {
return to_lower(name);
});
}
// need 6 activation functions for bidirectional directions
......@@ -1178,9 +1315,10 @@ struct onnx_parser
}
std::vector<operation> vec_actv_funcs(vec_names.size());
std::transform(vec_names.begin(),
vec_names.end(),
vec_actv_funcs.begin(),
[&](const auto& name) { return map_actv_funcs[name]; });
float clip = 0.0;
if(contains(attributes, "clip"))
......@@ -1214,6 +1352,53 @@ struct onnx_parser
return {hidden_states, last_output, last_cell_output};
}
template <class T>
instruction_ref parse_reduce_oper(const std::string&,
attribute_map attributes,
std::vector<instruction_ref> args)
{
std::size_t n_dim = args.front()->get_shape().lens().size();
// default to reduce over all dimensions
std::vector<int64_t> axes(n_dim);
std::iota(axes.begin(), axes.end(), 0);
if(contains(attributes, "axes"))
{
axes.clear();
auto&& attr_axes = attributes["axes"].ints();
axes = std::vector<int64_t>(attr_axes.begin(), attr_axes.end());
}
int keep_dims = 1;
if(contains(attributes, "keepdims"))
{
keep_dims = parse_value(attributes.at("keepdims")).at<int>();
}
if(keep_dims == 1)
{
return prog.add_instruction(T{axes}, std::move(args));
}
else
{
auto ins = prog.add_instruction(T{axes}, std::move(args));
return prog.add_instruction(op::squeeze{axes}, ins);
}
}
instruction_ref
parse_cast(const std::string&, attribute_map attributes, std::vector<instruction_ref> args)
{
if(!contains(attributes, "to"))
{
MIGRAPHX_THROW("PARSE_CAST: missing to type attribute!");
}
int to_type = parse_value(attributes.at("to")).at<int>();
shape::type_t type = get_type(to_type);
return prog.add_instruction(op::convert{type}, std::move(args));
}
void parse_from(std::istream& is)
{
onnx::ModelProto model;
......@@ -1349,6 +1534,20 @@ struct onnx_parser
return result;
}
static std::vector<int64_t> get_indices(const onnx::AttributeProto& attr)
{
std::vector<int64_t> result;
literal s = parse_value(attr);
s.visit([&](auto v) { copy(v, std::back_inserter(result)); });
// Clamp large indices to -1
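// (e.g. an ONNX Slice "ends" value of INT64_MAX, commonly used to mean
// "to the end of the axis", would otherwise overflow downstream index
// arithmetic; the INT32_MAX / 2 threshold is presumably chosen to catch
// such sentinel values)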
std::replace_if(
result.begin(),
result.end(),
[](auto x) { return x > int64_t{std::numeric_limits<std::int32_t>::max()} / 2; },
-1);
return result;
}
template <class T>
static literal from_repeated(shape::type_t t, const T& r)
{
......@@ -1360,16 +1559,16 @@ struct onnx_parser
{
switch(attr.type())
{
case onnx::AttributeProto::FLOAT: return literal{attr.f()};
case onnx::AttributeProto::INT: return literal{attr.i()};
case onnx::AttributeProto::TENSOR: return parse_tensor(attr.t());
case onnx::AttributeProto::FLOATS: return from_repeated(shape::float_type, attr.floats());
case onnx::AttributeProto::INTS: return from_repeated(shape::int64_type, attr.ints());
case onnx::AttributeProto::UNDEFINED:
case onnx::AttributeProto::GRAPH:
case onnx::AttributeProto::STRING:
case onnx::AttributeProto::STRINGS:
case onnx::AttributeProto::TENSORS:
case onnx::AttributeProto::GRAPHS: return {};
}
MIGRAPHX_THROW("Invalid attribute type");
......@@ -1383,47 +1582,41 @@ struct onnx_parser
const std::string& s = t.raw_data();
switch(t.data_type())
{
case onnx::TensorProto::FLOAT: return create_literal(shape::float_type, dims, s.data());
case onnx::TensorProto::FLOAT16:
return create_literal(shape::half_type, dims, s.data());
case onnx::TensorProto::DOUBLE:
return create_literal(shape::double_type, dims, s.data());
case onnx::TensorProto::INT64: return create_literal(shape::int64_type, dims, s.data());
case onnx::TensorProto::INT8:
case onnx::TensorProto::UINT16:
case onnx::TensorProto::INT16:
case onnx::TensorProto::INT32:
case onnx::TensorProto::BOOL: return create_literal(shape::int32_type, dims, s.data());
case onnx::TensorProto::UINT8:
case onnx::TensorProto::STRING:
case onnx::TensorProto::UNDEFINED:
case onnx::TensorProto::UINT32:
case onnx::TensorProto::UINT64:
case onnx::TensorProto::COMPLEX64:
case onnx::TensorProto::COMPLEX128: throw std::runtime_error("");
}
MIGRAPHX_THROW("Invalid tensor type");
}
switch(t.data_type())
{
case onnx::TensorProto::INT8:
return create_literal(shape::int32_type, dims, t.int32_data());
case onnx::TensorProto::UINT16:
return create_literal(shape::int32_type, dims, t.int32_data());
case onnx::TensorProto::INT16:
return create_literal(shape::int32_type, dims, t.int32_data());
case onnx::TensorProto::INT32:
case onnx::TensorProto::BOOL:
return create_literal(shape::int32_type, dims, t.int32_data());
case onnx::TensorProto::INT64:
return create_literal(shape::int64_type, dims, t.int64_data());
case onnx::TensorProto::DOUBLE:
return create_literal(shape::double_type, dims, t.double_data());
case onnx::TensorProto::FLOAT:
return create_literal(shape::float_type, dims, t.float_data());
case onnx::TensorProto::FLOAT16:
{
std::vector<uint16_t> data_uint16(t.int32_data().begin(), t.int32_data().end());
......@@ -1434,11 +1627,12 @@ struct onnx_parser
[](uint16_t raw_val) { return *reinterpret_cast<half*>(&raw_val); });
return create_literal(shape::half_type, dims, data_half);
}
case onnx::TensorProto::UNDEFINED:
case onnx::TensorProto::UINT8:
case onnx::TensorProto::STRING:
case onnx::TensorProto::UINT32:
case onnx::TensorProto::UINT64:
case onnx::TensorProto::COMPLEX64:
case onnx::TensorProto::COMPLEX128: throw std::runtime_error("");
}
MIGRAPHX_THROW("Invalid tensor type");
......@@ -1466,28 +1660,23 @@ struct onnx_parser
shape::type_t shape_type{};
switch(t.tensor_type().elem_type())
{
case onnx::TensorProto::FLOAT: shape_type = shape::float_type; break;
case onnx::TensorProto::INT8: shape_type = shape::int8_type; break;
case onnx::TensorProto::UINT16: shape_type = shape::uint16_type; break;
case onnx::TensorProto::INT16: shape_type = shape::int16_type; break;
case onnx::TensorProto::INT32: shape_type = shape::int32_type; break;
case onnx::TensorProto::INT64: shape_type = shape::int64_type; break;
case onnx::TensorProto::FLOAT16: shape_type = shape::half_type; break;
case onnx::TensorProto::DOUBLE: shape_type = shape::double_type; break;
case onnx::TensorProto::UINT32: shape_type = shape::uint32_type; break;
case onnx::TensorProto::UINT64: shape_type = shape::uint64_type; break;
case onnx::TensorProto::UINT8:
case onnx::TensorProto::STRING:
case onnx::TensorProto::BOOL:
case onnx::TensorProto::UNDEFINED:
case onnx::TensorProto::COMPLEX64:
case onnx::TensorProto::COMPLEX128:
break; // throw std::runtime_error("Unsupported type");
}
std::vector<std::size_t> dims;
auto&& tensor_dims = t.tensor_type().shape().dim();
......@@ -1526,6 +1715,14 @@ struct onnx_parser
}
}
}
void check_arg_empty(const argument& arg, const std::string& msg)
{
if(arg.empty())
{
MIGRAPHX_THROW(msg);
}
}
};
program parse_onnx(const std::string& name)
......
......@@ -85,6 +85,9 @@ bool memory_coloring_impl::allocate(interval_ptr interval)
offset += (element_size - (offset % element_size));
conflict_queue.pop();
}
// when the int8 type is used, the offset can be arbitrary;
// if it is not 4-byte aligned, MIOpen int8 convolution can crash
offset = (offset + 3) / 4 * 4;
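// e.g. offset 13 -> (13 + 3) / 4 * 4 = 16; offsets that are already a
// multiple of 4 are left unchanged (12 -> 12)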
segment.offset = offset;
MIGRAPHX_DEBUG(segment.dump());
required_bytes = std::max(required_bytes, offset + segment.size);
......
......@@ -107,7 +107,7 @@ struct memory_coloring_impl
return ins->name() == "check_context";
}
static bool is_disjoin(const live_range& range1, const live_range& range2)
{
if((range1.size == 0) || (range2.size == 0))
return false;
......
......@@ -2,7 +2,6 @@
#include <migraphx/pass_manager.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/target.hpp>
#include <migraphx/env.hpp>
#include <migraphx/ranges.hpp>
......
......@@ -241,7 +241,7 @@ instruction_ref program::remove_instructions(instruction_ref first, instruction_
// TODO: Check every element
assert(has_instruction(first));
std::for_each(first, last, [&](instruction& ins) { ins.clear_arguments(); });
assert(std::all_of(first, last, [&](const instruction& ins) { return ins.outputs().empty(); }));
return impl->instructions.erase(first, last);
}
......
......@@ -10,8 +10,8 @@ inline namespace MIGRAPHX_INLINE_NS {
bool skip_propogate(instruction_ref ins)
{
if(ins->name() == "@literal")
return true;
if(ins->name() == "contiguous")
return skip_propogate(ins->inputs().front());
auto&& s = ins->get_shape();
if(s.broadcasted() and not s.scalar())
return true;
......@@ -33,7 +33,7 @@ void propagate_constant::apply(program& p) const
ins->outputs().end());
for(auto child : children)
{
if(child->name() == "@literal" or skip_propogate(child))
{
self(child);
continue;
......
......@@ -12,12 +12,7 @@ if(MIGRAPHX_ENABLE_PYTHON)
C_VISIBILITY_PRESET hidden
CXX_VISIBILITY_PRESET hidden
)
target_link_libraries(migraphx_py PRIVATE migraphx migraphx_tf migraphx_onnx migraphx_cpu)
if(MIGRAPHX_ENABLE_GPU)
target_link_libraries(migraphx_py PRIVATE migraphx_gpu)
target_compile_definitions(migraphx_py PRIVATE -DHAVE_GPU)
......
......@@ -6,11 +6,9 @@
#include <migraphx/generate.hpp>
#include <migraphx/cpu/target.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/tf.hpp>
#include <migraphx/onnx.hpp>
#include <migraphx/type_name.hpp>
#ifdef HAVE_GPU
#include <migraphx/gpu/target.hpp>
......@@ -104,8 +102,13 @@ migraphx::shape to_shape(const py::buffer_info& info)
t = as.type_enum();
n = sizeof(as());
}
});
if(n == 0)
{
MIGRAPHX_THROW("MIGRAPHX PYTHON: Unsupported data type" + info.format);
}
auto strides = info.strides;
std::transform(strides.begin(), strides.end(), strides.begin(), [&](auto i) -> std::size_t {
return n > 0 ? i / n : 0;
......@@ -153,6 +156,7 @@ PYBIND11_MODULE(migraphx, m)
py::class_<migraphx::target>(m, "target");
py::class_<migraphx::program>(m, "program")
.def("clone", [](migraphx::program& p) { return *(new migraphx::program(p)); })
.def("get_parameter_shapes", &migraphx::program::get_parameter_shapes)
.def("get_shape", &migraphx::program::get_shape)
.def("compile", [](migraphx::program& p, const migraphx::target& t) { p.compile(t); })
......@@ -161,16 +165,13 @@ PYBIND11_MODULE(migraphx, m)
.def("__ne__", std::not_equal_to<migraphx::program>{})
.def("__repr__", [](const migraphx::program& p) { return migraphx::to_string(p); });
m.def("parse_tf",
&migraphx::parse_tf,
"Parse tf protobuf (default format is nhwc)",
py::arg("filename"),
py::arg("is_nhwc") = true);
m.def("parse_onnx", &migraphx::parse_onnx);
m.def("get_target", [](const std::string& name) -> migraphx::target {
if(name == "cpu")
return migraphx::cpu::target{};
......@@ -182,10 +183,16 @@ PYBIND11_MODULE(migraphx, m)
});
m.def("generate_argument", &migraphx::generate_argument, py::arg("s"), py::arg("seed") = 0);
m.def("quantize", [](migraphx::program& p, std::vector<std::string>& ins_names) {
migraphx::quantize(p, ins_names);
});
m.def("quantize", [](migraphx::program& p) { migraphx::quantize(p, {"all"}); });
m.def("quantize_fp16",
&migraphx::quantize_fp16,
py::arg("prog"),
py::arg("ins_names") = std::vector<std::string>{"all"});
m.def("quantize_int8",
&migraphx::quantize_int8,
py::arg("prog"),
py::arg("t"),
py::arg("calibration") = std::vector<migraphx::program::parameter_map>{},
py::arg("ins_names") = std::vector<std::string>{"dot", "convolution"});
#ifdef HAVE_GPU
m.def("allocate_gpu", &migraphx::gpu::allocate_gpu, py::arg("s"), py::arg("host") = false);
......
......@@ -3,33 +3,104 @@
#include <migraphx/instruction.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/op/convert.hpp>
#include <migraphx/op/clip.hpp>
#include <migraphx/op/round.hpp>
#include <migraphx/op/dot.hpp>
#include <migraphx/op/mul.hpp>
#include <migraphx/op/add.hpp>
#include <migraphx/op/quant_dot.hpp>
#include <migraphx/op/capture.hpp>
#include <migraphx/op/convolution.hpp>
#include <migraphx/op/quant_convolution.hpp>
#include <migraphx/op/multibroadcast.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/target.hpp>
#include <utility>
#include <set>
#include <iomanip>
#include <fstream>
#include <algorithm>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
instruction_ref insert_fp16(program& prog,
instruction_ref& ins,
shape::type_t type,
std::unordered_map<instruction_ref, instruction_ref>& map_fp16)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_INT8_QUANTIZATION_PARAMS)
instruction_ref insert_quant_ins(program& prog,
instruction_ref& ins,
shape::type_t type,
std::unordered_map<instruction_ref, instruction_ref>& map_ins,
float scale = 1.0f,
float shift = 0.0f)
{
if(map_fp16.count(ins) > 0)
if(map_ins.count(ins) > 0)
{
return map_ins[ins];
}
if(ins->name() == "undefined")
{
return ins;
}
assert(ins->get_shape().type() == shape::float_type or
ins->get_shape().type() == shape::double_type or
ins->get_shape().type() == shape::int32_type or
ins->get_shape().type() == shape::half_type);
instruction_ref quant_ins{};
auto insert_loc = std::next(ins);
if(type == shape::int8_type)
{
auto scaled_ins = ins;
if(scale != 1.0f)
{
auto float_ins = scaled_ins;
if(scaled_ins->get_shape().type() != shape::float_type)
{
float_ins =
prog.insert_instruction(insert_loc, op::convert{shape::float_type}, scaled_ins);
}
std::vector<float> vec_scale(scaled_ins->get_shape().elements(), scale);
auto l_scale = prog.add_literal(literal(float_ins->get_shape(), vec_scale));
scaled_ins = prog.insert_instruction(insert_loc, op::mul{}, l_scale, float_ins);
}
auto shifted_ins = scaled_ins;
if(shift != 0.0f)
{
auto float_ins = shifted_ins;
if(shifted_ins->get_shape().type() != shape::float_type)
{
float_ins = prog.insert_instruction(
insert_loc, op::convert{shape::float_type}, shifted_ins);
}
std::vector<float> vec_shift(shifted_ins->get_shape().elements(), shift);
auto l_shift = prog.add_literal(literal(float_ins->get_shape(), vec_shift));
shifted_ins = prog.insert_instruction(insert_loc, op::add{}, l_shift, float_ins);
}
auto rounded_ins = prog.insert_instruction(insert_loc, op::round{}, shifted_ins);
auto clipped_ins =
prog.insert_instruction(insert_loc, op::clip{127.0f, -128.0f}, rounded_ins);
quant_ins = prog.insert_instruction(insert_loc, op::convert{type}, clipped_ins);
}
else
{
return map_fp16[ins];
quant_ins = prog.insert_instruction(insert_loc, op::convert{type}, ins);
}
assert(ins->get_shape().type() == shape::float_type ||
ins->get_shape().type() == shape::double_type);
instruction_ref ins_fp16{};
ins_fp16 = prog.insert_instruction(std::next(ins), op::convert{type}, ins);
map_fp16[ins] = ins_fp16;
map_ins[ins] = quant_ins;
return ins_fp16;
return quant_ins;
}
void quantize(program& prog, const std::vector<std::string>& ins_names)
// This function converts any instructions specified in the input
// from double or float to float16 by inserting a convert operator.
// The conversion could overflow in principle, but that is very rare
// in deep learning workloads, so we simply truncate the input to fp16.
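// (e.g. 65504 is the largest normal fp16 value, so a float such as 1.0e5f
// becomes inf after the convert, while typical weights and activations in a
// small range only lose a little precision)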
void quantize_fp16(program& prog, const std::vector<std::string>& ins_names)
{
std::unordered_map<instruction_ref, instruction_ref> map_fp16;
for(auto ins : iterator_for(prog))
......@@ -53,13 +124,14 @@ void quantize(program& prog, const std::vector<std::string>& ins_names)
// if the input is a convert operator, uses its input
// as its current input
instruction_ref input_fp16{};
if(input->name() == "convert")
if(input->name() == "convert" and
input->inputs().front()->get_shape().type() == shape::half_type)
{
input_fp16 = input->inputs().front();
}
else
{
input_fp16 = insert_fp16(prog, input, shape::half_type, map_fp16);
input_fp16 = insert_quant_ins(prog, input, shape::half_type, map_fp16);
}
converted_inputs.push_back(input_fp16);
}
......@@ -79,29 +151,390 @@ void quantize(program& prog, const std::vector<std::string>& ins_names)
auto ins_shape = compute_shape(op, converted_inputs);
if(ins_shape.type() != orig_type)
{
// check the dead code case to avoid assert
bool output_empty = ins->outputs().empty();
auto ins_orig_type =
prog.insert_instruction(std::next(ins), op::convert{orig_type}, ins);
if(!output_empty)
{
prog.replace_instruction(ins, ins_orig_type);
}
}
prog.replace_instruction(ins, op, converted_inputs);
}
}
static void ins_quantize_int8(program& prog,
instruction_ref ins,
std::vector<instruction_ref>& converted_inputs,
const std::vector<std::pair<float, float>>& ins_quant_params)
{
auto orig_type = ins->get_shape().type();
auto inputs = ins->inputs();
if(ins->name() == "dot")
{
auto dot_op = any_cast<op::dot>(ins->get_operator());
float new_alpha = dot_op.alpha / (ins_quant_params[0].first * ins_quant_params[1].first);
float new_beta = dot_op.beta;
// Additional checking of the scaled alpha value is needed. If
// abs(new_alpha) is at least the threshold (a temporary value of 50
// is used here), it can be rounded to an integer and used in quant_dot
float threshold = 50.0f;
if(fabs(new_alpha) >= threshold && fabs(new_beta) >= threshold)
{
int32_t quant_alpha = static_cast<int32_t>(std::round(new_alpha));
int32_t quant_beta = static_cast<int32_t>(std::round(new_beta));
if(shape::int32_type == orig_type)
{
prog.replace_instruction(
ins, op::quant_dot{quant_alpha, quant_beta}, converted_inputs);
}
else
{
auto quant_dot = prog.insert_instruction(
ins, op::quant_dot{quant_alpha, quant_beta}, converted_inputs);
prog.replace_instruction(ins, op::convert{orig_type}, quant_dot);
}
}
// either alpha or beta cannot be quantized because the relative
// rounding error would be too large
else
{
if(converted_inputs.size() == 3)
{
converted_inputs.pop_back();
}
auto q_dot = prog.insert_instruction(ins, op::quant_dot{1, 0}, converted_inputs);
auto f_dot = prog.insert_instruction(ins, op::convert{shape::float_type}, q_dot);
auto c_shape = q_dot->get_shape();
std::vector<float> vec_alpha(c_shape.elements(), new_alpha);
auto l_alpha =
prog.add_literal(literal({shape::float_type, c_shape.lens()}, vec_alpha));
if(inputs.size() == 3 and dot_op.beta != 0.0f)
{
auto alpha_ab = prog.insert_instruction(ins, op::mul{}, l_alpha, f_dot);
std::vector<float> vec_beta(c_shape.elements(), dot_op.beta);
auto l_beta =
prog.add_literal(literal({shape::float_type, c_shape.lens()}, vec_beta));
instruction_ref beta_c{};
if(orig_type != shape::float_type)
{
auto fp32_c =
prog.insert_instruction(ins, op::convert{shape::float_type}, inputs.back());
beta_c = prog.insert_instruction(ins, op::mul{}, l_beta, fp32_c);
}
else
{
beta_c = prog.insert_instruction(ins, op::mul{}, l_beta, inputs.back());
}
if(orig_type == shape::float_type)
{
prog.replace_instruction(ins, op::add{}, alpha_ab, beta_c);
}
else
{
auto f_res = prog.insert_instruction(ins, op::add{}, alpha_ab, beta_c);
prog.replace_instruction(ins, op::convert{orig_type}, f_res);
}
}
else
{
if(orig_type == shape::float_type)
{
prog.replace_instruction(ins, op::mul{}, l_alpha, f_dot);
}
else
{
auto alpha_ab = prog.insert_instruction(ins, op::mul{}, l_alpha, f_dot);
prog.replace_instruction(ins, op::convert{orig_type}, alpha_ab);
}
}
}
}
else if(ins->name() == "convolution")
{
// Current MIOpen convolution does not support alpha and beta,
// so we need a separate multiply to adjust the output
auto conv_op = any_cast<op::convolution>(ins->get_operator());
auto padding = conv_op.padding;
auto stride = conv_op.stride;
auto dilation = conv_op.dilation;
auto padding_mode = conv_op.padding_mode;
auto group = conv_op.group;
auto adjust_factor = 1.0f / (ins_quant_params[0].first * ins_quant_params[1].first);
auto quant_conv = prog.insert_instruction(
ins,
op::quant_convolution{padding, stride, dilation, padding_mode, group},
converted_inputs);
float threshold = 50.0f;
std::vector<float> vec_factor(quant_conv->get_shape().elements(), adjust_factor);
if(quant_conv->get_shape().type() == orig_type and adjust_factor >= threshold)
{
auto l_factor = prog.add_literal(
literal(quant_conv->get_shape(), vec_factor.begin(), vec_factor.end()));
prog.replace_instruction(ins, op::mul{}, quant_conv, l_factor);
}
// convert the quant_conv output to float type, multiply by the factor and
// convert back to the original type
else
{
auto float_conv =
prog.insert_instruction(ins, op::convert{shape::float_type}, quant_conv);
auto l_factor = prog.add_literal(literal(float_conv->get_shape(), vec_factor));
if(orig_type == shape::float_type)
{
prog.replace_instruction(ins, op::mul{}, l_factor, float_conv);
}
else
{
auto adjusted_conv = prog.insert_instruction(ins, op::mul{}, l_factor, float_conv);
prog.replace_instruction(ins, op::convert{orig_type}, adjusted_conv);
}
}
}
else
{
MIGRAPHX_THROW("QUANTIZE_INT8: does not support operator " + ins->name());
}
}
void quantize(program& prog) { quantize(prog, {"all"}); }
// int8 quantization differs from fp16 since int8 can only represent
// values in the range -128 to 127. To convert a float or double to int8,
// we need a scale and a shift; the conversion is v_int8 = fp * scale + shift.
// To simplify the changes, we treat the shift as 0.0f for now.
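// For example (illustrative numbers only): if the largest magnitude observed
// for a tensor is 6.35, the scale chosen by the calibration below is
// 127 / 6.35 = 20, so a value of 3.1 becomes round(3.1 * 20 + 0.0) = 62 and
// 6.35 maps to the int8 maximum 127.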
void quantize_int8_impl(program& prog,
const std::vector<std::pair<float, float>>& quant_params,
const std::vector<std::string>& ins_names)
{
if(enabled(MIGRAPHX_INT8_QUANTIZATION_PARAMS{}))
{
for(std::size_t i = 0; i < quant_params.size(); ++i)
{
auto param = quant_params.at(i);
std::cout << "ins_index = " << i << ", scale = " << param.first
<< ", shift = " << param.second << std::endl;
}
std::cout << std::endl;
}
// For now, we only support the int8 quantization of gemm and convolution
std::set<std::string> op_names = {"convolution", "dot"};
std::set<std::string> input_ins_names(ins_names.begin(), ins_names.end());
if(!std::includes(
op_names.begin(), op_names.end(), input_ins_names.begin(), input_ins_names.end()))
{
MIGRAPHX_THROW("QUANTIZE_INT8: only support DOT and CONVOLUTION operation");
}
std::size_t quant_param_index = 0;
std::unordered_map<instruction_ref, instruction_ref> map_quant_ins;
std::unordered_map<instruction_ref, std::size_t> map_ins_index;
for(auto ins : iterator_for(prog))
{
if(not contains(ins_names, ins->name()))
{
continue;
}
// for the dot operator, there could be 2 or 3 input arguments
// if the 3rd argument is available, convert it to an int32.
std::vector<instruction_ref> converted_inputs;
// process all inputs; if an input is fp32 or fp64, convert it to
// int8 by adding a convert operator, then replace the operator
// with the corresponding int8 version
auto inputs = ins->inputs();
std::vector<std::pair<float, float>> ins_quant_params;
for(auto input : inputs)
{
// calculate the index of each instruction to be quantized
std::size_t ins_index =
(map_ins_index.count(input) > 0) ? map_ins_index[input] : quant_param_index++;
map_ins_index[input] = ins_index;
auto param = quant_params[map_ins_index[input]];
ins_quant_params.push_back(param);
// In general, the target_type is int8, but for the dot
// operation, if it has 3 inputs, then the last one should
// be converted to int32_type
shape::type_t quant_type = shape::int8_type;
if((ins->name() == "dot") and (inputs.size() == 3) and (input == inputs.back()))
{
quant_type = shape::int32_type;
}
auto s = input->get_shape();
if((s.type() == shape::float_type or s.type() == shape::double_type or
s.type() == shape::half_type or s.type() == shape::int32_type) and
s.type() != quant_type)
{
// if the input is a convert operator, uses its input
// as its current input
instruction_ref quant_input{};
if(input->name() == "convert" and
input->inputs().front()->get_shape().type() == quant_type)
{
quant_input = input->inputs().front();
// the scale in this case is not used, so set it
// to 1.0f for this parameter
ins_quant_params.back() = std::pair<float, float>(1.0f, 0.0f);
}
else
{
quant_input = insert_quant_ins(
prog, input, quant_type, map_quant_ins, param.first, param.second);
}
converted_inputs.push_back(quant_input);
}
else
{
converted_inputs.push_back(input);
}
}
// no change for the input, go to the next instruction
if(inputs == converted_inputs)
{
continue;
}
ins_quantize_int8(prog, ins, converted_inputs, ins_quant_params);
}
if(quant_param_index != quant_params.size())
{
MIGRAPHX_THROW("QUANTIZE_INT8: number of scales does not match");
}
}
void quantize_int8(program& prog,
const target& t,
const std::vector<program::parameter_map>& calibration,
const std::vector<std::string>& ins_names)
{
// insert capture operator
auto cap_prog = prog;
auto int8_quant_params = capture_arguments(cap_prog, t, ins_names);
// use the calibration data to compute the quantization scale
cap_prog.compile(t);
// use all calibration data to run the program to calculate the
// quantization scale and shift
for(auto&& arg : calibration)
{
program::parameter_map m;
for(auto&& x : cap_prog.get_parameter_shapes())
{
if(arg.count(x.first) > 0)
{
assert(x.second == arg.at(x.first).get_shape());
m[x.first] = t.copy_to(arg.at(x.first));
}
else
{
m[x.first] = t.allocate(x.second);
}
}
cap_prog.eval(m);
}
quantize_int8_impl(prog, *int8_quant_params, ins_names);
}
// For each input argument of the instructions to be quantized, insert a
// capture operator to compute the scale and shift
std::size_t capture_arguments(program& prog,
const std::vector<std::string>& ins_names,
const std::function<void(std::size_t, std::vector<argument>)>& func)
{
size_t num_quant_params = 0;
// the int8 quantization only support dot and convolution
std::set<std::string> op_names = {"dot", "convolution"};
std::set<std::string> input_ins_names(ins_names.begin(), ins_names.end());
if(!std::includes(
op_names.begin(), op_names.end(), input_ins_names.begin(), input_ins_names.end()))
{
MIGRAPHX_THROW("CAPTURE_ARGUMENTS: input operator is not supported");
}
std::unordered_map<instruction_ref, instruction_ref> ins_map;
for(auto ins : iterator_for(prog))
{
if(not contains(ins_names, ins->name()))
{
continue;
}
auto inputs = ins->inputs();
std::vector<instruction_ref> new_args;
for(auto input : inputs)
{
instruction_ref new_ins{};
if(ins_map.count(input) > 0)
{
new_ins = ins_map[input];
}
else
{
new_ins = prog.insert_instruction(
std::next(input), op::capture{num_quant_params++, func}, input);
ins_map[input] = new_ins;
}
new_args.push_back(new_ins);
}
instruction::replace(ins, ins->get_operator(), ins->get_shape(), new_args);
}
return num_quant_params;
}
std::shared_ptr<std::vector<std::pair<float, float>>>
capture_arguments_impl(program& prog, const target& t, const std::vector<std::string>& ins_names)
{
std::shared_ptr<std::vector<std::pair<float, float>>> int8_quant_params =
std::make_shared<std::vector<std::pair<float, float>>>();
std::shared_ptr<std::vector<float>> max_abs_vals = std::make_shared<std::vector<float>>();
auto calc_quant_params = [int8_quant_params, max_abs_vals, &t](std::size_t ins_index,
std::vector<argument> args) {
std::pair<float, float> param_pair{64.0f, 0.0f};
// scale and shift are needed only for the int8 type, and we do not
// consider shift for now, so it is set to 0
std::vector<float> vec_val;
argument arg = t.copy_from(args.front());
arg.visit([&](auto output) { vec_val.assign(output.begin(), output.end()); });
auto max_val = *std::max_element(vec_val.begin(), vec_val.end());
auto min_val = *std::min_element(vec_val.begin(), vec_val.end());
auto max_abs = std::max(std::fabs(max_val), std::fabs(min_val));
max_abs_vals->at(ins_index) = std::max(max_abs_vals->at(ins_index), max_abs);
// if all values are 0, no need to do scaling
if(max_abs_vals->at(ins_index) == 0.0f)
{
param_pair.first = 1.0f;
}
else
{
param_pair.first = 127.0f / max_abs_vals->at(ins_index);
}
int8_quant_params->at(ins_index) = param_pair;
};
auto num_params = capture_arguments(prog, ins_names, calc_quant_params);
int8_quant_params->resize(num_params, std::pair<float, float>(64.0f, 0.0f));
max_abs_vals->resize(num_params, 0.0f);
return int8_quant_params;
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
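The scale captured above is 127 divided by the largest absolute value seen for a tensor during calibration, and quantization then rounds and clamps into the int8 range. A self-contained sketch of that arithmetic (illustrative only; the real pass builds the equivalent mul/add/round/clip/convert instructions shown in insert_quant_ins):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

// quantize one value with the given scale (shift is kept at 0, as in the pass)
int8_t quantize_value(float x, float scale, float shift = 0.0f)
{
    float v = std::round(x * scale + shift);
    v       = std::min(127.0f, std::max(-128.0f, v)); // clip to the int8 range
    return static_cast<int8_t>(v);
}

int main()
{
    std::vector<float> calibration = {-0.5f, 1.25f, -6.35f, 3.1f};
    float max_abs = 0.0f;
    for(float x : calibration)
        max_abs = std::max(max_abs, std::fabs(x));
    // same rule as calc_quant_params: 127 / max|x|, or 1 if everything is zero
    float scale = (max_abs == 0.0f) ? 1.0f : 127.0f / max_abs; // 127 / 6.35 = 20
    for(float x : calibration)
        std::cout << x << " -> " << int(quantize_value(x, scale)) << "\n";
}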
#include <migraphx/rewrite_batchnorm.hpp>
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/op/batch_norm.hpp>
#include <migraphx/op/broadcast.hpp>
#include <migraphx/op/add.hpp>
#include <migraphx/op/mul.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/dfor.hpp>
......@@ -11,7 +12,7 @@
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
void rewrite_batchnorm::apply(program& p) const
{
for(auto ins : iterator_for(p))
{
......@@ -25,46 +26,30 @@ void fwd_conv_batchnorm_rewrite::apply(program& p) const
if(any_of({gamma, bias, mean, variance}, [](auto arg) { return arg.empty(); }))
continue;
auto s = shape{ins->get_shape().type(), {ins->get_shape().lens()[1]}};
// Get epsilon
auto bn_op = any_cast<op::batch_norm_inference>(ins->get_operator());
auto epsilon = bn_op.epsilon;
argument a{s};
argument b{s};
visit_all(gamma, bias, mean, variance, a, b)(
[&](auto gamma2, auto bias2, auto mean2, auto variance2, auto a2, auto b2) {
dfor(a.get_shape().elements())(
[&](std::size_t c) { a2[c] = gamma2[c] / std::sqrt(variance2[c] + epsilon); });
dfor(b.get_shape().elements())([&](std::size_t c) {
b2[c] = bias2[c] - (gamma2[c] * mean2[c] / std::sqrt(variance2[c] + epsilon));
});
});
auto broadcast = op::broadcast{1, ins->get_shape().lens()};
auto a_ins = p.add_literal({a.get_shape(), a.data()});
auto a_broadcast = p.insert_instruction(ins, broadcast, a_ins);
auto mul = p.insert_instruction(ins, op::mul{}, ins->inputs().front(), a_broadcast);
auto b_ins = p.add_literal({b.get_shape(), b.data()});
auto b_broadcast = p.insert_instruction(ins, broadcast, b_ins);
auto add = p.insert_instruction(ins, op::add{}, mul, b_broadcast);
p.replace_instruction(ins, add);
}
}
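The rewrite folds the batch-norm affine transform into a single per-channel multiply and add. As a worked equation, for each channel c,

    bn(x) = gamma[c] * (x - mean[c]) / sqrt(variance[c] + epsilon) + bias[c]
          = a[c] * x + b[c]

with a[c] = gamma[c] / sqrt(variance[c] + epsilon) and b[c] = bias[c] - gamma[c] * mean[c] / sqrt(variance[c] + epsilon), which is exactly what the a2/b2 literals above compute before they are broadcast along axis 1 and applied with mul and add.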
......
#include <migraphx/rewrite_pooling.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/op/pooling.hpp>
#include <migraphx/op/reshape.hpp>
#include <migraphx/op/reduce_mean.hpp>
#include <migraphx/program.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
void rewrite_pooling::apply(program& prog) const
{
for(auto ins : iterator_for(prog))
{
if(ins->name() != "pooling")
continue;
if(ins->get_shape().lens().size() != 4)
continue;
if(ins->inputs().empty())
continue;
auto&& s = ins->inputs().front()->get_shape();
auto&& op = any_cast<op::pooling>(ins->get_operator());
if(op.mode != "average")
continue;
if(op.padding[0] != 0 and op.padding[1] != 0)
continue;
if(op.stride[0] != 1 and op.stride[1] != 1)
continue;
if(s.lens()[2] != op.lengths[0] and s.lens()[3] != op.lengths[1])
continue;
std::int64_t n = s.lens()[0];
std::int64_t c = s.lens()[1];
auto reshape =
prog.insert_instruction(ins, op::reshape{{n * c, -1}}, ins->inputs().front());
auto pooling = prog.insert_instruction(ins, op::reduce_mean{{1}}, reshape);
prog.replace_instruction(ins, op::reshape{{n, c, 1, 1}}, pooling);
}
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
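This pass turns a global average pooling (window equal to the full spatial extent, stride 1, no padding) into a mean reduction. As a worked shape example: an input of shape {n, c, h, w} is reshaped to {n * c, h * w}, reduce_mean over axis 1 gives {n * c, 1}, and the final reshape restores {n, c, 1, 1}, matching the pooling output; a {2, 3, 7, 7} input becomes {6, 49} -> {6, 1} -> {2, 3, 1, 1}.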
#include <migraphx/rewrite_rnn.hpp>
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/op/add.hpp>
#include <migraphx/op/broadcast.hpp>
#include <migraphx/op/concat.hpp>
#include <migraphx/op/dot.hpp>
#include <migraphx/op/gru.hpp>
#include <migraphx/op/lstm.hpp>
#include <migraphx/op/mul.hpp>
#include <migraphx/op/rnn.hpp>
#include <migraphx/op/rnn_last_output.hpp>
#include <migraphx/op/slice.hpp>
#include <migraphx/op/squeeze.hpp>
#include <migraphx/op/sub.hpp>
#include <migraphx/op/transpose.hpp>
#include <migraphx/op/unsqueeze.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/op/common.hpp>
......@@ -204,17 +217,19 @@ std::vector<instruction_ref> rewrite_rnn::vanilla_rnn_cell(bool is_forward,
auto tran_sr = prog.insert_instruction(ins, op::transpose{perm}, sr);
// initial hidden state
auto sih = prog.insert_instruction(ins, op::squeeze{{0}}, ih);
auto sih_lens = sih->get_shape().lens();
// bias
instruction_ref bb{};
if(bias != prog.end())
{
long hs = static_cast<long>(r->get_shape().lens()[2]);
auto sbias = prog.insert_instruction(ins, op::squeeze{{0}}, bias);
auto wb = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sbias);
auto rb = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sbias);
auto wrb = prog.insert_instruction(ins, op::add{}, wb, rb);
bb = prog.insert_instruction(ins, op::broadcast{1, sih_lens}, wrb);
}
instruction_ref hidden_out = prog.end();
......@@ -228,20 +243,15 @@ std::vector<instruction_ref> rewrite_rnn::vanilla_rnn_cell(bool is_forward,
xt = prog.insert_instruction(ins, op::squeeze{{0}}, xt);
auto xt_wi = prog.insert_instruction(ins, op::dot{}, xt, tran_sw);
auto ht_ri = prog.insert_instruction(ins, op::dot{}, sih, tran_sr);
if(bias != prog.end())
{
xt_wi = prog.insert_instruction(ins, op::add{}, xt_wi, bb);
}
auto xt_ht = prog.insert_instruction(ins, op::add{}, xt_wi, ht_ri);
// apply activation function
ht = prog.insert_instruction(ins, actv_func, ht);
sih = ht;
auto ht = prog.insert_instruction(ins, actv_func, xt_ht);
sih = ht;
// add the dimensions of sequence length (axis 0 for sequence length,
// axis 1 for num_directions)

......@@ -485,62 +495,41 @@ std::vector<instruction_ref> rewrite_rnn::gru_cell(bool is_forward,
long hs = static_cast<long>(r_shape.lens()[2]);
migraphx::shape s(seq_shape.type(), {seq_shape.lens()[1], r_shape.lens()[2]});
std::vector<int> data(s.elements(), 1);
std::vector<float> data(s.elements(), 1.0f);
auto l1 = prog.add_literal(migraphx::literal{s, data});
// weight matrix
// squeeze the w matrix to 2 dims and transpose it
std::vector<int64_t> perm{1, 0};
auto sw = prog.insert_instruction(ins, op::squeeze{{0}}, w);
auto wz = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sw);
auto tran_wz = prog.insert_instruction(ins, op::transpose{perm}, wz);
auto wr = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sw);
auto tran_wr = prog.insert_instruction(ins, op::transpose{perm}, wr);
auto sw = prog.insert_instruction(ins, op::squeeze{{0}}, w);
auto tw = prog.insert_instruction(ins, op::transpose{perm}, sw);
auto wh = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sw);
auto tran_wh = prog.insert_instruction(ins, op::transpose{perm}, wh);
auto sr = prog.insert_instruction(ins, op::squeeze{{0}}, r);
auto rz = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sr);
auto tran_rz = prog.insert_instruction(ins, op::transpose{perm}, rz);
// slice r into two parts, zr and h
auto sr = prog.insert_instruction(ins, op::squeeze{{0}}, r);
auto rzr = prog.insert_instruction(ins, op::slice{{0}, {0}, {2 * hs}}, sr);
auto trzr = prog.insert_instruction(ins, op::transpose{perm}, rzr);
auto rr = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sr);
auto tran_rr = prog.insert_instruction(ins, op::transpose{perm}, rr);
auto rh = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sr);
auto tran_rh = prog.insert_instruction(ins, op::transpose{perm}, rh);
auto rh = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sr);
auto trh = prog.insert_instruction(ins, op::transpose{perm}, rh);
// initial states
auto sih = prog.insert_instruction(ins, op::squeeze{{0}}, ih);
auto sih = prog.insert_instruction(ins, op::squeeze{{0}}, ih);
size_t bs = ih->get_shape().lens()[1];
// bias
instruction_ref brcst_bz{};
instruction_ref brcst_br{};
instruction_ref brcst_wbh{};
instruction_ref brcst_rbh{};
instruction_ref brcst_bh{};
instruction_ref bwb{};
instruction_ref brb_zr{};
instruction_ref brb_h{};
if(bias != prog.end())
{
auto broadcast_lens = sih->get_shape().lens();
auto sbias = prog.insert_instruction(ins, op::squeeze{{0}}, bias);
auto wbz = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sbias);
auto wbr = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sbias);
auto wbh = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sbias);
brcst_wbh = prog.insert_instruction(ins, op::broadcast{1, broadcast_lens}, wbh);
auto rbz = prog.insert_instruction(ins, op::slice{{0}, {3 * hs}, {4 * hs}}, sbias);
auto rbr = prog.insert_instruction(ins, op::slice{{0}, {4 * hs}, {5 * hs}}, sbias);
auto rbh = prog.insert_instruction(ins, op::slice{{0}, {5 * hs}, {6 * hs}}, sbias);
brcst_rbh = prog.insert_instruction(ins, op::broadcast{1, broadcast_lens}, rbh);
auto bz = prog.insert_instruction(ins, op::add{}, wbz, rbz);
brcst_bz = prog.insert_instruction(ins, op::broadcast{1, broadcast_lens}, bz);
auto br = prog.insert_instruction(ins, op::add{}, wbr, rbr);
brcst_br = prog.insert_instruction(ins, op::broadcast{1, broadcast_lens}, br);
auto bh = prog.insert_instruction(ins, op::add{}, wbh, rbh);
brcst_bh = prog.insert_instruction(ins, op::broadcast{1, broadcast_lens}, bh);
auto sbias = prog.insert_instruction(ins, op::squeeze{{0}}, bias);
auto wb = prog.insert_instruction(ins, op::slice{{0}, {0}, {3 * hs}}, sbias);
bwb = prog.insert_instruction(ins, op::broadcast{1, {bs, static_cast<size_t>(3 * hs)}}, wb);
auto rb_zr = prog.insert_instruction(ins, op::slice{{0}, {3 * hs}, {5 * hs}}, sbias);
auto rb_h = prog.insert_instruction(ins, op::slice{{0}, {5 * hs}, {6 * hs}}, sbias);
brb_zr = prog.insert_instruction(
ins, op::broadcast{1, {bs, static_cast<size_t>(2 * hs)}}, rb_zr);
brb_h = prog.insert_instruction(ins, op::broadcast{1, {bs, static_cast<size_t>(hs)}}, rb_h);
}
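// Note: the gru bias carries six hs-wide segments in the order Wbz, Wbr, Wbh, Rbz, Rbr, Rbh.
// The whole Wb part ({0, 3*hs}) is broadcast once and added to the fused xt * W^T product,
// while the Rb part is split into its zr slice (added to Ht-1 * Rzr^T) and its h slice
// (folded into the candidate gate below), matching the three broadcasts created here.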
for(long i = 0; i < seq_len; i++)
......@@ -549,56 +538,51 @@ std::vector<instruction_ref> rewrite_rnn::gru_cell(bool is_forward,
auto xt = prog.insert_instruction(ins, op::slice{{0}, {seq_index}, {seq_index + 1}}, seq);
xt = prog.insert_instruction(ins, op::squeeze{{0}}, xt);
// equation f(xt*(Wz^T) + Ht-1 * (Rz^T) + Wbz + Rbz)
auto xt_wz = prog.insert_instruction(ins, op::dot{}, xt, tran_wz);
auto ht_rz = prog.insert_instruction(ins, op::dot{}, sih, tran_rz);
auto xht_z = prog.insert_instruction(ins, op::add{}, xt_wz, ht_rz);
auto xt_w = prog.insert_instruction(ins, op::dot{}, xt, tw);
auto ih1_rzr = prog.insert_instruction(ins, op::dot{}, sih, trzr);
if(bias != prog.end())
{
xht_z = prog.insert_instruction(ins, op::add{}, xht_z, brcst_bz);
xt_w = prog.insert_instruction(ins, op::add{}, xt_w, bwb);
ih1_rzr = prog.insert_instruction(ins, op::add{}, ih1_rzr, brb_zr);
}
auto zt = prog.insert_instruction(ins, actv_func1, xht_z);
// equation f(Xt*(Wr^T) + Ht-1*(Rr^T) + Wbr + Rbr)
auto xt_wr = prog.insert_instruction(ins, op::dot{}, xt, tran_wr);
auto ht_rr = prog.insert_instruction(ins, op::dot{}, sih, tran_rr);
auto xht_r = prog.insert_instruction(ins, op::add{}, xt_wr, ht_rr);
if(bias != prog.end())
{
xht_r = prog.insert_instruction(ins, op::add{}, xht_r, brcst_br);
}
auto rt = prog.insert_instruction(ins, actv_func1, xht_r);
auto xw_z = prog.insert_instruction(ins, op::slice{{1}, {0}, {hs}}, xt_w);
auto xw_r = prog.insert_instruction(ins, op::slice{{1}, {hs}, {2 * hs}}, xt_w);
auto xw_h = prog.insert_instruction(ins, op::slice{{1}, {2 * hs}, {3 * hs}}, xt_w);
auto hr_z = prog.insert_instruction(ins, op::slice{{1}, {0}, {hs}}, ih1_rzr);
auto hr_r = prog.insert_instruction(ins, op::slice{{1}, {hs}, {2 * hs}}, ih1_rzr);
instruction_ref xht_h;
auto xw_hr_z = prog.insert_instruction(ins, op::add{}, xw_z, hr_z);
auto zt = prog.insert_instruction(ins, actv_func1, xw_hr_z);
auto xw_hr_r = prog.insert_instruction(ins, op::add{}, xw_r, hr_r);
auto rt = prog.insert_instruction(ins, actv_func1, xw_hr_r);
instruction_ref hr_h{};
if(linear_before_reset == 0)
{
// equation g(Xt*(Wh^T) + (rt (.) Ht-1)*(Rh^T) + Rbh + Wbh)
auto xt_wh = prog.insert_instruction(ins, op::dot{}, xt, tran_wh);
auto rt_ht1 = prog.insert_instruction(ins, op::mul{}, rt, sih);
auto rt_rh = prog.insert_instruction(ins, op::dot{}, rt_ht1, tran_rh);
xht_h = prog.insert_instruction(ins, op::add{}, xt_wh, rt_rh);
hr_h = prog.insert_instruction(ins, op::dot{}, rt_ht1, trh);
if(bias != prog.end())
{
xht_h = prog.insert_instruction(ins, op::add{}, xht_h, brcst_bh);
hr_h = prog.insert_instruction(ins, op::add{}, hr_h, brb_h);
}
}
else
{
// equation ht = g(Xt*(Wh^T) + (rt (.) (Ht-1*(Rh^T) + Rbh)) + Wbh)
auto xt_wh = prog.insert_instruction(ins, op::dot{}, xt, tran_wh);
auto ht1_rh = prog.insert_instruction(ins, op::dot{}, sih, tran_rh);
if(bias != prog.end())
{
ht1_rh = prog.insert_instruction(ins, op::add{}, ht1_rh, brcst_rbh);
}
auto rt_rh = prog.insert_instruction(ins, op::mul{}, rt, ht1_rh);
xht_h = prog.insert_instruction(ins, op::add{}, xt_wh, rt_rh);
auto ht1_rh = prog.insert_instruction(ins, op::dot{}, sih, trh);
if(bias != prog.end())
{
xht_h = prog.insert_instruction(ins, op::add{}, xht_h, brcst_wbh);
ht1_rh = prog.insert_instruction(ins, op::add{}, ht1_rh, brb_h);
}
hr_h = prog.insert_instruction(ins, op::mul{}, rt, ht1_rh);
}
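// Note: both branches feed the same update below and differ only in where the reset gate
// is applied, matching the two equations in the comments above:
//   linear_before_reset == 0: hr_h = (rt (.) Ht-1) * Rh^T + Rbh
//   otherwise:                hr_h = rt (.) (Ht-1 * Rh^T + Rbh)
// In either case, when a bias is present, its Wbh part is already folded into xw_h via the
// earlier xt_w + bwb add.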
auto ht = prog.insert_instruction(ins, actv_func2, xht_h);
auto xw_hr_h = prog.insert_instruction(ins, op::add{}, xw_h, hr_h);
auto ht = prog.insert_instruction(ins, actv_func2, xw_hr_h);
// equation Ht = (1 - zt) (.) ht + zt (.) Ht-1
auto one_minus_zt = prog.insert_instruction(ins, op::sub{}, l1, zt);
......@@ -683,7 +667,6 @@ void rewrite_rnn::apply_lstm(program& prog, instruction_ref ins) const
std::vector<float> ihc_data(ihc_shape.elements(), 0.0);
migraphx::shape pph_shape{type, {1, 3 * hidden_size}};
std::vector<float> pph_data(pph_shape.elements(), 0.0);
auto actv_funcs = lstm_actv_funcs(ins);
auto lstm_op = any_cast<op::lstm>(ins->get_operator());
......@@ -913,35 +896,16 @@ std::vector<instruction_ref> rewrite_rnn::lstm_cell(bool is_forward,
migraphx::shape r_shape = r->get_shape();
long seq_len = static_cast<long>(seq_shape.lens()[0]);
long hs = static_cast<long>(r_shape.lens()[2]);
auto bs = ih->get_shape().lens()[1];
std::vector<int64_t> perm{1, 0};
// w matrix
auto sw = prog.insert_instruction(ins, op::squeeze{{0}}, w);
auto wi = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sw);
auto tran_wi = prog.insert_instruction(ins, op::transpose{perm}, wi);
auto wo = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sw);
auto tran_wo = prog.insert_instruction(ins, op::transpose{perm}, wo);
auto wf = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sw);
auto tran_wf = prog.insert_instruction(ins, op::transpose{perm}, wf);
auto wc = prog.insert_instruction(ins, op::slice{{0}, {3 * hs}, {4 * hs}}, sw);
auto tran_wc = prog.insert_instruction(ins, op::transpose{perm}, wc);
// w matrix, squeeze and transpose
auto sw = prog.insert_instruction(ins, op::squeeze{{0}}, w);
auto tsw = prog.insert_instruction(ins, op::transpose{perm}, sw);
// r matrix
auto sr = prog.insert_instruction(ins, op::squeeze{{0}}, r);
auto ri = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sr);
auto tran_ri = prog.insert_instruction(ins, op::transpose{perm}, ri);
auto ro = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sr);
auto tran_ro = prog.insert_instruction(ins, op::transpose{perm}, ro);
auto rf = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sr);
auto tran_rf = prog.insert_instruction(ins, op::transpose{perm}, rf);
auto rc = prog.insert_instruction(ins, op::slice{{0}, {3 * hs}, {4 * hs}}, sr);
auto tran_rc = prog.insert_instruction(ins, op::transpose{perm}, rc);
// r matrix, squeeze and transpose
auto sr = prog.insert_instruction(ins, op::squeeze{{0}}, r);
auto tsr = prog.insert_instruction(ins, op::transpose{perm}, sr);
// initial hidden state
auto sih = prog.insert_instruction(ins, op::squeeze{{0}}, ih);
......@@ -951,40 +915,23 @@ std::vector<instruction_ref> rewrite_rnn::lstm_cell(bool is_forward,
auto ic_lens = sic->get_shape().lens();
// bias
instruction_ref bi_brcst{};
instruction_ref bo_brcst{};
instruction_ref bf_brcst{};
instruction_ref bc_brcst{};
instruction_ref wrb{};
if(bias != prog.end())
{
auto sbias = prog.insert_instruction(ins, op::squeeze{{0}}, bias);
auto bxi = prog.insert_instruction(ins, op::slice{{0}, {0}, {hs}}, sbias);
auto bhi = prog.insert_instruction(ins, op::slice{{0}, {4 * hs}, {5 * hs}}, sbias);
auto bi = prog.insert_instruction(ins, op::add{}, bxi, bhi);
bi_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, bi);
auto bxo = prog.insert_instruction(ins, op::slice{{0}, {hs}, {2 * hs}}, sbias);
auto bho = prog.insert_instruction(ins, op::slice{{0}, {5 * hs}, {6 * hs}}, sbias);
auto bo = prog.insert_instruction(ins, op::add{}, bxo, bho);
bo_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, bo);
auto bxf = prog.insert_instruction(ins, op::slice{{0}, {2 * hs}, {3 * hs}}, sbias);
auto bhf = prog.insert_instruction(ins, op::slice{{0}, {6 * hs}, {7 * hs}}, sbias);
auto bf = prog.insert_instruction(ins, op::add{}, bxf, bhf);
bf_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, bf);
auto bxc = prog.insert_instruction(ins, op::slice{{0}, {3 * hs}, {4 * hs}}, sbias);
auto bhc = prog.insert_instruction(ins, op::slice{{0}, {7 * hs}, {8 * hs}}, sbias);
auto bc = prog.insert_instruction(ins, op::add{}, bxc, bhc);
bc_brcst = prog.insert_instruction(ins, op::broadcast{1, ic_lens}, bc);
auto sbias = prog.insert_instruction(ins, op::squeeze{{0}}, bias);
auto ub_wb = prog.insert_instruction(ins, op::slice{{0}, {0}, {4 * hs}}, sbias);
auto ub_rb = prog.insert_instruction(ins, op::slice{{0}, {4 * hs}, {8 * hs}}, sbias);
auto ub_wrb = prog.insert_instruction(ins, op::add{}, ub_wb, ub_rb);
wrb = prog.insert_instruction(
ins, op::broadcast{1, {bs, 4 * static_cast<size_t>(hs)}}, ub_wrb);
}
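// Note: the lstm bias holds eight hs-wide segments, Wb[iofc] followed by Rb[iofc]. Each gate
// adds Wb + Rb, so the two 4*hs halves are summed once and broadcast to {bs, 4*hs}; a single
// add after the fused xt * W^T + Ht-1 * R^T product then covers all four gates at once.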
// peephole
instruction_ref pphi_brcst{};
instruction_ref ppho_brcst{};
instruction_ref pphf_brcst{};
if(pph != prog.end())
{
auto spph = prog.insert_instruction(ins, op::squeeze{{0}}, pph);
......@@ -1004,44 +951,31 @@ std::vector<instruction_ref> rewrite_rnn::lstm_cell(bool is_forward,
auto xt = prog.insert_instruction(ins, op::slice{{0}, {seq_index}, {seq_index + 1}}, seq);
xt = prog.insert_instruction(ins, op::squeeze{{0}}, xt);
// equation it = f(Xt*(Wi^T) + Ht-1*(Ri^T) + Pi (.) Ct-1 + Wbi + Rbi)
auto xt_wi = prog.insert_instruction(ins, op::dot{}, xt, tran_wi);
auto ht_ri = prog.insert_instruction(ins, op::dot{}, sih, tran_ri);
auto it_before_actv = prog.insert_instruction(ins, op::add{}, xt_wi, ht_ri);
if(pph != prog.end())
{
auto pphi_ct = prog.insert_instruction(ins, op::mul{}, pphi_brcst, sic);
it_before_actv = prog.insert_instruction(ins, op::add{}, it_before_actv, pphi_ct);
}
auto xt_tsw = prog.insert_instruction(ins, op::dot{}, xt, tsw);
auto sih_tsr = prog.insert_instruction(ins, op::dot{}, sih, tsr);
auto xt_sih = prog.insert_instruction(ins, op::add{}, xt_tsw, sih_tsr);
if(bias != prog.end())
{
it_before_actv = prog.insert_instruction(ins, op::add{}, it_before_actv, bi_brcst);
xt_sih = prog.insert_instruction(ins, op::add{}, xt_sih, wrb);
}
auto it = prog.insert_instruction(ins, actv_func1, it_before_actv);
// equation ft = f(Xt*(Wf^T) + Ht-1*(Rf^T) + Pf (.) Ct-1 + Wbf + Rbf)
auto xt_wf = prog.insert_instruction(ins, op::dot{}, xt, tran_wf);
auto ht_rf = prog.insert_instruction(ins, op::dot{}, sih, tran_rf);
auto ft_before_actv = prog.insert_instruction(ins, op::add{}, xt_wf, ht_rf);
auto it_before_actv = prog.insert_instruction(ins, op::slice{{1}, {0}, {hs}}, xt_sih);
auto ot_before_actv = prog.insert_instruction(ins, op::slice{{1}, {hs}, {2 * hs}}, xt_sih);
auto ft_before_actv =
prog.insert_instruction(ins, op::slice{{1}, {2 * hs}, {3 * hs}}, xt_sih);
auto ct_before_actv =
prog.insert_instruction(ins, op::slice{{1}, {3 * hs}, {4 * hs}}, xt_sih);
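// Note: keeping w and r as single [4*hs, ...] matrices means one dot per operand yields all
// gate pre-activations at once; the four slices above then split them back out in the
// i, o, f, c order of the weight layout, replacing the per-gate dot products of the old code.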
if(pph != prog.end())
{
auto pphi_ct = prog.insert_instruction(ins, op::mul{}, pphi_brcst, sic);
it_before_actv = prog.insert_instruction(ins, op::add{}, it_before_actv, pphi_ct);
auto pphf_ct = prog.insert_instruction(ins, op::mul{}, pphf_brcst, sic);
ft_before_actv = prog.insert_instruction(ins, op::add{}, ft_before_actv, pphf_ct);
}
if(bias != prog.end())
{
ft_before_actv = prog.insert_instruction(ins, op::add{}, ft_before_actv, bf_brcst);
}
auto it = prog.insert_instruction(ins, actv_func1, it_before_actv);
auto ft = prog.insert_instruction(ins, actv_func1, ft_before_actv);
// equation ct = g(Xt*(Wc^T) + Ht-1*(Rc^T) + Wbc + Rbc)
auto xt_wc = prog.insert_instruction(ins, op::dot{}, xt, tran_wc);
auto ht_rc = prog.insert_instruction(ins, op::dot{}, sih, tran_rc);
auto ct_before_actv = prog.insert_instruction(ins, op::add{}, xt_wc, ht_rc);
if(bias != prog.end())
{
ct_before_actv = prog.insert_instruction(ins, op::add{}, ct_before_actv, bc_brcst);
}
auto ct = prog.insert_instruction(ins, actv_func2, ct_before_actv);
// equation Ct = ft (.) Ct-1 + it (.) ct
......@@ -1050,19 +984,11 @@ std::vector<instruction_ref> rewrite_rnn::lstm_cell(bool is_forward,
auto cellt = prog.insert_instruction(ins, op::add{}, ft_cell, it_ct);
last_cell_output = cellt;
// ot = f(Xt*(Wo^T) + Ht-1*(Ro^T) + Po (.) Ct + Wbo + Rbo)
auto xt_wo = prog.insert_instruction(ins, op::dot{}, xt, tran_wo);
auto ht_ro = prog.insert_instruction(ins, op::dot{}, sih, tran_ro);
auto ot_before_actv = prog.insert_instruction(ins, op::add{}, xt_wo, ht_ro);
if(pph != prog.end())
{
auto ppho_cellt = prog.insert_instruction(ins, op::mul{}, ppho_brcst, cellt);
ot_before_actv = prog.insert_instruction(ins, op::add{}, ot_before_actv, ppho_cellt);
}
if(bias != prog.end())
{
ot_before_actv = prog.insert_instruction(ins, op::add{}, ot_before_actv, bo_brcst);
}
auto ot = prog.insert_instruction(ins, actv_func1, ot_before_actv);
// Ht = ot (.) h(Ct)
......
......@@ -138,6 +138,24 @@ std::size_t shape::index(std::size_t i) const
return result;
}
}
std::vector<std::size_t> shape::multi(std::size_t i) const
{
assert(this->standard());
std::vector<std::size_t> indices(lens().size());
std::transform(strides().begin(),
strides().end(),
lens().begin(),
indices.begin(),
[&](std::size_t stride, std::size_t len) {
assert(len > 0 and stride > 0);
return (i / stride) % len;
});
return indices;
}
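// Illustration (values assumed): for a standard shape with lens {2, 3, 4} and strides
// {12, 4, 1}, multi(17) yields {(17/12)%2, (17/4)%3, (17/1)%4} = {1, 1, 1}, and
// 1*12 + 1*4 + 1*1 == 17, so the multi-index maps back to the same flat offset.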
bool shape::packed() const { return this->elements() == this->element_space(); }
bool shape::transposed() const
......
#include <migraphx/simplify_algebra.hpp>
#include <migraphx/dead_code_elimination.hpp>
#include <migraphx/program.hpp>
#include <migraphx/op/add.hpp>
#include <migraphx/op/mul.hpp>
#include <migraphx/op/broadcast.hpp>
#include <migraphx/matcher.hpp>
#include <migraphx/literal.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct find_add_lit_broadcast
auto lit_broadcast() { return match::any_of(match::is_constant(), match::name("broadcast")); }
auto not_lit_broadcast() { return match::none_of(match::is_constant(), match::name("broadcast")); }
auto op_lit_broadcast(std::string op, std::string x, std::string y)
{
return match::name(std::move(op))(match::either_arg(0, 1)(
lit_broadcast().bind(std::move(x)), not_lit_broadcast().bind(std::move(y))));
}
auto conv_const_weights()
{
return match::name("convolution")(match::used_once(),
match::args(match::any(), match::is_constant().bind("w")));
}
struct find_mul_conv
{
auto lit_broadcast() const
auto matcher() const
{
return match::any_of(match::name("@literal"), match::name("broadcast"));
return match::name("mul")(match::either_arg(0, 1)(conv_const_weights().bind("conv"),
match::name("broadcast").bind("a")));
}
auto not_lit_broadcast() const
void apply(program& p, match::matcher_result r) const
{
return match::none_of(match::name("@literal"), match::name("broadcast"));
auto ins = r.result;
auto conv_ins = r.instructions["conv"];
auto a_ins = r.instructions["a"];
auto w_ins = r.instructions["w"];
auto broadcast_op = any_cast<op::broadcast>(a_ins->get_operator());
if(broadcast_op.axis != 1)
return;
auto new_a = p.insert_instruction(
ins, op::broadcast{0, w_ins->get_shape().lens()}, a_ins->inputs().front());
auto new_mul = p.insert_instruction(ins, op::mul{}, new_a, w_ins);
auto new_conv = p.insert_instruction(
ins, conv_ins->get_operator(), conv_ins->inputs().front(), new_mul);
p.replace_instruction(ins, new_conv);
}
auto add_lit_broadcast(std::string x, std::string y) const
};
// a * (x + b) => a * x + a * b
struct find_mul_add
{
auto matcher() const
{
return match::name("add")(match::either_arg(0, 1)(lit_broadcast().bind(std::move(x)),
not_lit_broadcast().bind(std::move(y))));
return match::name("mul")(match::either_arg(0, 1)(
match::name("add")(
match::either_arg(0, 1)(
match::any().bind("x"),
match::any_of(conv_const_weights(), match::is_constant()).bind("b")),
match::none_of(match::args(match::is_constant(), match::is_constant())),
match::used_once()),
match::is_constant().bind("a")));
}
void apply(program& p, match::matcher_result r) const
{
auto ins = r.result;
auto a_ins = r.instructions["a"];
auto b_ins = r.instructions["b"];
auto x_ins = r.instructions["x"];
assert(x_ins != b_ins);
auto ax_ins = p.insert_instruction(ins, op::mul{}, a_ins, x_ins);
auto ab_ins = p.insert_instruction(ins, op::mul{}, a_ins, b_ins);
p.replace_instruction(ins, op::add{}, ax_ins, ab_ins);
}
};
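// Note: the guards above make `a` constant and `b` constant (or a convolution with constant
// weights), so after distributing, the a * b term is built from constants and can be folded
// or matched again (e.g. by find_mul_conv), while a * x may simplify further on a later
// pass; apply() below reruns the matchers several times so such follow-up rewrites fire.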
struct find_add_lit_broadcast
{
auto matcher() const
{
return match::name("add")(
match::args(add_lit_broadcast("a", "x"), add_lit_broadcast("b", "y")));
match::either_arg(0, 1)(op_lit_broadcast("add", "a", "x"), lit_broadcast().bind("b")));
}
void apply(program& p, match::matcher_result r) const
{
auto ins = r.result;
auto x_ins = r.instructions["x"];
auto a_ins = r.instructions["a"];
auto b_ins = r.instructions["b"];
auto sumab = p.insert_instruction(ins, op::add{}, a_ins, b_ins);
p.replace_instruction(ins, op::add{}, x_ins, sumab);
}
};
struct find_double_add_lit_broadcast
{
auto matcher() const
{
return match::name("add")(
match::args(op_lit_broadcast("add", "a", "x"), op_lit_broadcast("add", "b", "y")));
}
void apply(program& p, match::matcher_result r) const
......@@ -36,11 +117,9 @@ struct find_add_lit_broadcast
auto a_ins = r.instructions["a"];
auto b_ins = r.instructions["b"];
if(a_ins->name() != b_ins->name())
return;
instruction_ref sumab;
if(a_ins->name() == "broadcast")
if(a_ins->name() == "broadcast" and b_ins->name() == "broadcast")
{
if(a_ins->inputs().at(0)->get_shape() != b_ins->inputs().at(0)->get_shape())
return;
......@@ -59,7 +138,46 @@ struct find_add_lit_broadcast
}
};
void simplify_algebra::apply(program& p) const { match::find_matches(p, find_add_lit_broadcast{}); }
struct find_inner_broadcast
{
auto matcher() const
{
return match::name("mul", "add")(
match::args(match::name("broadcast").bind("x"), match::name("broadcast").bind("y")));
}
void apply(program& p, match::matcher_result r) const
{
auto ins = r.result;
auto x_ins = r.instructions["x"];
auto y_ins = r.instructions["y"];
auto xbroadcast = any_cast<op::broadcast>(x_ins->get_operator());
auto ybroadcast = any_cast<op::broadcast>(y_ins->get_operator());
if(xbroadcast.axis != ybroadcast.axis)
return;
auto op = p.insert_instruction(
ins, ins->get_operator(), x_ins->inputs().front(), y_ins->inputs().front());
p.replace_instruction(ins, xbroadcast, op);
}
};
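// Note: when both operands of the mul/add are broadcasts along the same axis, the
// elementwise op commutes with the broadcast, so it is applied to the smaller pre-broadcast
// inputs and a single broadcast of the result is emitted instead, e.g.
// add(broadcast{1}(a), broadcast{1}(b)) becomes broadcast{1}(add(a, b)).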
void simplify_algebra::apply(program& p) const
{
// Run simplifications multiple times
for(int i = 0; i < 4; i++)
{
match::find_matches(p,
find_inner_broadcast{},
find_double_add_lit_broadcast{},
find_add_lit_broadcast{},
find_mul_conv{},
find_mul_add{});
dead_code_elimination{}.apply(p);
}
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -2,14 +2,17 @@
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/op/as_shape.hpp>
#include <migraphx/op/transpose.hpp>
#include <migraphx/op/concat.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/matcher.hpp>
#include <unordered_set>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
bool is_reshaper(instruction_ref ins)
const auto& reshaper_names()
{
// clang-format off
static const std::unordered_set<std::string> names = {
......@@ -19,17 +22,10 @@ bool is_reshaper(instruction_ref ins)
"unsqueeze"
};
// clang-format on
return contains(names, ins->name());
return names;
}
bool is_transpose_output(instruction_ref ins)
{
if(ins->outputs().size() != 1)
return false;
if(ins->outputs().front()->name() == "contiguous")
return is_transpose_output(ins->outputs().front());
return ins->outputs().front()->name() == "transpose";
}
bool is_reshaper(instruction_ref ins) { return contains(reshaper_names(), ins->name()); }
instruction_ref find_transpose_input(instruction_ref ins)
{
......@@ -42,62 +38,189 @@ instruction_ref find_transpose_input(instruction_ref ins)
return ins;
}
void simplify_reshapes::apply(program& p) const
auto get_transpose_dims(instruction_ref ins)
{
auto end = std::prev(p.end());
for(auto ins : iterator_for(p))
return any_cast<const op::transpose&>(ins->get_operator()).dims;
}
std::vector<int64_t> reorder_dims(std::vector<int64_t> dims, std::vector<int64_t> permutation)
{
std::vector<int64_t> result(dims.size());
assert(dims.size() == permutation.size());
for(std::size_t i = 0; i < dims.size(); i++)
{
if(ins == end and ins->name() == "contiguous")
continue;
// Skip possible dead instructions
if(ins->outputs().empty() and ins != end)
continue;
if(is_reshaper(ins))
result[i] = dims[permutation[i]];
}
return result;
}
bool is_no_transpose(const std::vector<int64_t>& dims)
{
if(dims.empty())
return true;
if(dims.front() != 0)
return false;
return std::adjacent_find(
dims.begin(), dims.end(), [](auto x, auto y) { return (y - x) != 1; }) == dims.end();
}
template <class Vector, class Op>
std::vector<int64_t> sort_permutation(const Vector& data, Op op)
{
std::vector<std::int64_t> result(data.size());
std::iota(result.begin(), result.end(), 0);
std::sort(result.begin(), result.end(), [&](auto x, auto y) { return op(data[x], data[y]); });
return result;
}
std::vector<int64_t> invert_permutation(const std::vector<int64_t>& permutation)
{
return sort_permutation(permutation, std::less<>{});
}
std::vector<int64_t> find_permutation(const shape& s)
{
return sort_permutation(s.strides(), std::greater<>{});
}
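// Illustration (values assumed): for a transposed shape with strides {1, 12, 4},
// find_permutation() orders the axes by decreasing stride and returns {1, 2, 0};
// invert_permutation({1, 2, 0}) gives {2, 0, 1}, and composing the two is the identity.
// find_concat_transpose below relies on exactly this pair to hop into the standard
// layout and back.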
struct find_reshaper
{
auto matcher() const
{
return match::name(reshaper_names())(
match::any_of[match::outputs()](match::name(reshaper_names())));
}
void apply(program& p, const match::matcher_result& mr) const
{
auto ins = mr.result;
std::vector<instruction_ref> reshapes{ins};
while(is_reshaper(reshapes.back()))
{
if(std::any_of(ins->outputs().begin(), ins->outputs().end(), &is_reshaper))
continue;
// Gather reshapes
std::vector<instruction_ref> reshapes{ins};
while(is_reshaper(reshapes.back()))
{
assert(!reshapes.back()->inputs().empty());
assert(p.has_instruction(reshapes.back()->inputs().front()));
auto input = reshapes.back()->inputs().front();
reshapes.push_back(input);
}
assert(!reshapes.back()->inputs().empty());
assert(p.has_instruction(reshapes.back()->inputs().front()));
auto input = reshapes.back()->inputs().front();
reshapes.push_back(input);
}
std::pair<instruction_ref, instruction_ref> r{p.end(), p.end()};
for(auto start : iterator_for(reshapes))
{
auto last = std::find_if(reshapes.rbegin(), reshapes.rend(), [&](auto&& i) {
return i->get_shape() == (*start)->get_shape() and i != (*start);
});
if(last != reshapes.rend())
{
r = std::make_pair(*start, *last);
break;
}
}
if(r.first != r.second)
std::pair<instruction_ref, instruction_ref> r{p.end(), p.end()};
for(auto start : iterator_for(reshapes))
{
auto last = std::find_if(reshapes.rbegin(), reshapes.rend(), [&](auto&& i) {
return i->get_shape() == (*start)->get_shape() and i != (*start);
});
if(last != reshapes.rend())
{
p.replace_instruction(r.first, r.second);
r = std::make_pair(*start, *last);
break;
}
}
else if(ins->name() == "transpose")
if(r.first != r.second)
{
p.replace_instruction(r.first, r.second);
}
}
};
struct find_nop_reshapes
{
auto matcher() const
{
auto reshapes = reshaper_names();
reshapes.insert("transpose");
reshapes.insert("slice");
return match::name(reshapes)(match::same_shape(match::arg(0)));
}
void apply(program& p, const match::matcher_result& mr) const
{
auto ins = mr.result;
p.replace_instruction(ins, ins->inputs().front());
}
};
struct find_transpose
{
auto matcher() const
{
return match::name("transpose")(match::none_of(
match::skip_output(match::name("contiguous"))(match::name("transpose"))));
}
void apply(program& p, const match::matcher_result& mr) const
{
auto ins = mr.result;
auto x = ins;
auto t = ins;
std::vector<std::int64_t> dims(ins->get_shape().lens().size());
std::iota(dims.begin(), dims.end(), 0);
do
{
dims = reorder_dims(get_transpose_dims(t), dims);
x = t;
t = find_transpose_input(x);
} while(x != t and t->name() == "transpose");
if(t == ins or t->name() != "transpose")
return;
if(is_no_transpose(dims))
{
if(is_transpose_output(ins))
continue;
auto x = ins;
auto t = ins;
do
{
x = t;
t = find_transpose_input(x);
} while(x != t and t->name() == "transpose");
if(t == ins or t->name() != "transpose")
continue;
p.replace_instruction(ins, t->inputs().front());
}
else
{
p.replace_instruction(ins, op::transpose{{dims}}, t->inputs().front());
}
}
};
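// Illustration (permutations assumed): for two stacked transposes with dims {0, 2, 1}
// (inner) and {2, 0, 1} (outer), the loop accumulates reorder_dims({0, 2, 1}, {2, 0, 1})
// = {1, 0, 2}, so the pair collapses into a single transpose{{1, 0, 2}} on the original
// input; if the accumulated dims come out as {0, 1, 2}, is_no_transpose() holds and the
// chain is removed entirely.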
struct find_concat_transpose
{
auto matcher() const
{
return match::name("concat")(match::same_input_shapes(),
match::all_of[match::inputs()](match::transpose_shape()));
}
void apply(program& p, const match::matcher_result& mr) const
{
auto ins = mr.result;
auto s = ins->inputs().front()->get_shape();
assert(s.transposed());
auto op = any_cast<op::concat>(ins->get_operator());
auto permutation = find_permutation(s);
auto ipermutation = invert_permutation(permutation);
op.axis = ipermutation[op.axis];
std::vector<instruction_ref> inputs;
std::transform(
ins->inputs().begin(), ins->inputs().end(), std::back_inserter(inputs), [&](auto i) {
if(i->name() == "transpose" and i->inputs().front()->get_shape().standard())
return i->inputs().front();
return p.insert_instruction(ins, op::transpose{permutation}, i);
});
auto concat = p.insert_instruction(ins, op, inputs);
auto t = p.insert_instruction(ins, op::transpose{ipermutation}, concat);
assert(ins->get_shape().lens() == t->get_shape().lens());
p.replace_instruction(ins, t);
}
};
void simplify_reshapes::apply(program& p) const
{
auto end = std::prev(p.end());
for(auto ins : iterator_for(p))
{
if(ins == end and ins->name() == "contiguous")
continue;
// Skip possible dead instructions
if(ins->outputs().empty() and ins != end)
continue;
match::find_matches(p,
ins,
find_nop_reshapes{},
find_reshaper{},
find_transpose{},
find_concat_transpose{});
}
}
......
......@@ -5,6 +5,7 @@ add_library(migraphx_cpu
gemm.cpp
)
set_target_properties(migraphx_cpu PROPERTIES EXPORT_NAME cpu)
rocm_set_soversion(migraphx_cpu ${PROJECT_VERSION})
find_path(BLAZE_INCLUDE blaze/Blaze.h)
find_package(Threads)
......