Merge branch 'develop' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into dyn_check_shapes

5d236dfc · charlie · 42601741 · bd503d89 · 5d236dfc · 5d236dfc
Commit 5d236dfc authored Jul 07, 2022 by charlie
20 changed files
--- a/src/include/migraphx/module_ref.hpp
+++ b/src/include/migraphx/module_ref.hpp
@@ -32,7 +32,8 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

 struct module;
-using module_ref = module*;
+using module_ref       = module*;
+using const_module_ref = const module*;

 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/include/migraphx/op/nonmaxsuppression.hpp
+++ b/src/include/migraphx/op/nonmaxsuppression.hpp
@@ -56,14 +56,21 @@ struct nonmaxsuppression
    shape compute_shape(std::vector<shape> inputs) const
    {
        // requires at least 2 inputs
-        check_shapes{inputs, *this}.standard();
        check_shapes{{inputs.at(0), inputs.at(1)}, *this}.only_dims(3);
        auto lens = inputs.front().lens();

        // check input shape
        if(lens[1] != inputs.at(1).lens()[2])
        {
-            MIGRAPHX_THROW("NonMaxSuppression: dimension mismatch between first and second input!");
+            MIGRAPHX_THROW(
+                "NonMaxSuppression: spatial dimension mismatch between boxes and scores input");
+        }
+
+        // check batch sizes
+        if(lens[0] != inputs.at(1).lens()[0])
+        {
+            MIGRAPHX_THROW(
+                "NonMaxSuppression: number of batches mismatch between boxes and scores input");
        }

        std::vector<int64_t> out_lens(2);
@@ -74,8 +81,8 @@ struct nonmaxsuppression

    struct box
    {
-        std::array<float, 2> x;
-        std::array<float, 2> y;
+        std::array<double, 2> x;
+        std::array<double, 2> y;

        void sort()
        {
@@ -83,9 +90,9 @@ struct nonmaxsuppression
            std::sort(y.begin(), y.end());
        }

-        std::array<float, 2>& operator[](std::size_t i) { return i == 0 ? x : y; }
+        std::array<double, 2>& operator[](std::size_t i) { return i == 0 ? x : y; }

-        float area() const
+        double area() const
        {
            assert(std::is_sorted(x.begin(), x.end()));
            assert(std::is_sorted(y.begin(), y.end()));
@@ -94,29 +101,29 @@ struct nonmaxsuppression
    };

    template <class T>
-    box batch_box(const T* boxes, std::size_t bidx) const
+    box batch_box(T boxes, std::size_t box_idx) const
    {
        box result{};
-        const T* start = boxes + 4 * bidx;
+        auto start = boxes + 4 * box_idx;
        if(center_point_box)
        {
-            float half_width  = start[2] / 2.0f;
-            float half_height = start[3] / 2.0f;
-            float x_center    = start[0];
-            float y_center    = start[1];
-            result.x          = {x_center - half_width, x_center + half_width};
-            result.y          = {y_center - half_height, y_center + half_height};
+            double half_width  = start[2] / 2.0;
+            double half_height = start[3] / 2.0;
+            double x_center    = start[0];
+            double y_center    = start[1];
+            result.x           = {x_center - half_width, x_center + half_width};
+            result.y           = {y_center - half_height, y_center + half_height};
        }
        else
        {
-            result.x = {start[1], start[3]};
-            result.y = {start[0], start[2]};
+            result.x = {static_cast<double>(start[1]), static_cast<double>(start[3])};
+            result.y = {static_cast<double>(start[0]), static_cast<double>(start[2])};
        }

        return result;
    }

-    inline bool suppress_by_iou(box b1, box b2, float iou_threshold) const
+    inline bool suppress_by_iou(box b1, box b2, double iou_threshold) const
    {
        b1.sort();
        b2.sort();
@@ -128,7 +135,7 @@ struct nonmaxsuppression
            intersection[i][1] = std::min(b1[i][1], b2[i][1]);
        }

-        std::vector<std::array<float, 2>> bbox = {intersection.x, intersection.y};
+        std::vector<std::array<double, 2>> bbox = {intersection.x, intersection.y};
        if(std::any_of(bbox.begin(), bbox.end(), [](auto bx) {
               return not std::is_sorted(bx.begin(), bx.end());
           }))
@@ -136,115 +143,124 @@ struct nonmaxsuppression
            return false;
        }

-        const float area1             = b1.area();
-        const float area2             = b2.area();
-        const float intersection_area = intersection.area();
-        const float union_area        = area1 + area2 - intersection_area;
+        const double area1             = b1.area();
+        const double area2             = b2.area();
+        const double intersection_area = intersection.area();
+        const double union_area        = area1 + area2 - intersection_area;

        if(area1 <= .0f or area2 <= .0f or union_area <= .0f)
        {
            return false;
        }

-        const float intersection_over_union = intersection_area / union_area;
+        const double intersection_over_union = intersection_area / union_area;

        return intersection_over_union > iou_threshold;
    }

-    argument compute(const shape& output_shape, std::vector<argument> args) const
+    // collect [score, index] pairs of boxes scoring at least score_threshold into a max-heap
+    template <class T>
+    std::priority_queue<std::pair<double, int64_t>>
+    filter_boxes_by_score(T scores_start, std::size_t num_boxes, double score_threshold) const
    {
-        argument result{output_shape};
-
-        result.visit([&](auto out) { std::fill(out.begin(), out.end(), 0); });
-
-        std::size_t max_output_boxes_per_class = 0;
-        float iou_threshold                    = 0.0f;
-        float score_threshold                  = 0.0f;
-
-        if(args.size() > 2)
-        {
-            max_output_boxes_per_class = args.at(2).at<std::size_t>();
-        }
-        // max_output_boxes_per_class is 0, no output
-        if(max_output_boxes_per_class == 0)
-        {
-            return result;
-        }
-
-        if(args.size() > 3)
-        {
-            iou_threshold = args.at(3).at<float>();
-        }
-
-        if(args.size() > 4)
-        {
-            score_threshold = args.at(4).at<float>();
-        }
-
-        const auto& lens = args.at(1).get_shape().lens();
-        auto batch_num   = lens[0];
-        auto class_num   = lens[1];
-        auto box_num     = args.at(0).get_shape().lens()[1];
+        std::priority_queue<std::pair<double, int64_t>> boxes_heap;
+        auto insert_to_boxes_heap =
+            make_function_output_iterator([&](const auto& x) { boxes_heap.push(x); });
+        int64_t box_idx = 0;
+        transform_if(
+            scores_start,
+            scores_start + num_boxes,
+            insert_to_boxes_heap,
+            [&](auto sc) {
+                box_idx++;
+                return sc >= score_threshold;
+            },
+            [&](auto sc) { return std::make_pair(sc, box_idx - 1); });
+        return boxes_heap;
+    }

-        std::vector<std::pair<float, int64_t>> selected_boxes_inside_class;
+    template <class Output, class Boxes, class Scores>
+    void compute_nms(Output output,
+                     Boxes boxes,
+                     Scores scores,
+                     const shape& output_shape,
+                     std::size_t max_output_boxes_per_class,
+                     double iou_threshold,
+                     double score_threshold) const
+    {
+        std::fill(output.begin(), output.end(), 0);
+        const auto& lens       = scores.get_shape().lens();
+        const auto num_batches = lens[0];
+        const auto num_classes = lens[1];
+        const auto num_boxes   = lens[2];
+        // boxes of a class with NMS applied [score, index]
+        std::vector<std::pair<double, int64_t>> selected_boxes_inside_class;
        std::vector<int64_t> selected_indices;
        selected_boxes_inside_class.reserve(output_shape.elements());
-
-        auto scores        = make_view<float>(args.at(1).get_shape(), args.at(1).cast<float>());
-        const float* boxes = args.at(0).cast<float>();
-        shape comp_s{shape::float_type, {batch_num, class_num}};
+        // iterate over batches and classes
+        shape comp_s{shape::double_type, {num_batches, num_classes}};
        shape_for_each(comp_s, [&](auto idx) {
-            auto bidx = idx[0];
-            auto cidx = idx[1];
-
-            std::size_t score_offset = (bidx * class_num + cidx) * box_num;
-            const float* batch_boxes = boxes + bidx * box_num * 4;
-            std::priority_queue<std::pair<float, int64_t>> sorted_boxes;
-            auto insert_to_sorted_boxes =
-                make_function_output_iterator([&](const auto& x) { sorted_boxes.push(x); });
-
-            int64_t box_idx = 0;
-            transform_if(
-                scores.begin() + score_offset,
-                scores.begin() + score_offset + box_num,
-                insert_to_sorted_boxes,
-                [&](auto sc) {
-                    box_idx++;
-                    return sc >= score_threshold;
-                },
-                [&](auto sc) { return std::make_pair(sc, box_idx - 1); });
-
+            auto batch_idx = idx[0];
+            auto class_idx = idx[1];
+            // iterator to the first score of this batch and class
+            auto scores_start = scores.begin() + (batch_idx * num_classes + class_idx) * num_boxes;
+            // iterator to the first box coordinate of this batch (4 values per box)
+            auto batch_boxes_start = boxes.begin() + batch_idx * num_boxes * 4;
+            auto boxes_heap = filter_boxes_by_score(scores_start, num_boxes, score_threshold);
            selected_boxes_inside_class.clear();
            // Get the next box with top score, filter by iou_threshold
-            while(!sorted_boxes.empty() &&
+            while(!boxes_heap.empty() &&
                  selected_boxes_inside_class.size() < max_output_boxes_per_class)
            {
-                const std::pair<float, int64_t>& next_top_score = sorted_boxes.top();
-
-                // Check with existing selected boxes for this class, suppress if exceed the IOU
-                // (Intersection Over Union) threshold
-                bool not_selected = std::any_of(
-                    selected_boxes_inside_class.begin(),
-                    selected_boxes_inside_class.end(),
-                    [&](auto selected_index) {
-                        return this->suppress_by_iou(batch_box(batch_boxes, next_top_score.second),
-                                                     batch_box(batch_boxes, selected_index.second),
-                                                     iou_threshold);
-                    });
+                // Check with existing selected boxes for this class, remove box if it
+                // exceeds the IOU (Intersection Over Union) threshold
+                const auto next_top_score = boxes_heap.top();
+                bool not_selected =
+                    std::any_of(selected_boxes_inside_class.begin(),
+                                selected_boxes_inside_class.end(),
+                                [&](auto selected_index) {
+                                    return this->suppress_by_iou(
+                                        batch_box(batch_boxes_start, next_top_score.second),
+                                        batch_box(batch_boxes_start, selected_index.second),
+                                        iou_threshold);
+                                });

                if(not not_selected)
                {
                    selected_boxes_inside_class.push_back(next_top_score);
-                    selected_indices.push_back(bidx);
-                    selected_indices.push_back(cidx);
+                    selected_indices.push_back(batch_idx);
+                    selected_indices.push_back(class_idx);
                    selected_indices.push_back(next_top_score.second);
                }
-                sorted_boxes.pop();
+                boxes_heap.pop();
            }
        });
+        std::copy(selected_indices.begin(), selected_indices.end(), output.begin());
+    }
+
+    argument compute(const shape& output_shape, std::vector<argument> args) const
+    {
+        argument result{output_shape};

-        result.visit([&](auto out) {
-            std::copy(selected_indices.begin(), selected_indices.end(), out.begin());
+        std::size_t max_output_boxes_per_class =
+            (args.size() > 2) ? (args.at(2).at<std::size_t>()) : 0;
+        if(max_output_boxes_per_class == 0)
+        {
+            return result;
+        }
+        double iou_threshold   = (args.size() > 3) ? (args.at(3).at<double>()) : 0.0f;
+        double score_threshold = (args.size() > 4) ? (args.at(4).at<double>()) : 0.0f;
+
+        result.visit([&](auto output) {
+            visit_all(args[0], args[1])([&](auto boxes, auto scores) {
+                compute_nms(output,
+                            boxes,
+                            scores,
+                            output_shape,
+                            max_output_boxes_per_class,
+                            iou_threshold,
+                            score_threshold);
+            });
        });

        return result;

--- a/src/include/migraphx/op/unsqueeze.hpp
+++ b/src/include/migraphx/op/unsqueeze.hpp
@@ -42,11 +42,12 @@ namespace op {
 struct unsqueeze
 {
    std::vector<int64_t> axes;
+    std::vector<int64_t> steps;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
-        return pack(f(self.axes, "axes"));
+        return pack(f(self.axes, "axes"), f(self.steps, "steps"));
    }

    value attributes() const
@@ -73,6 +74,9 @@ struct unsqueeze
                MIGRAPHX_THROW("UNSQUEEZE: Input must be a scalar");
        }

+        if(steps.size() > axes.size())
+            MIGRAPHX_THROW("UNSQUEEZE: Steps provided with no axis");
+
        std::size_t new_size = old_lens.size() + axes.size();

        std::vector<std::size_t> new_lens(new_size);
@@ -80,16 +84,27 @@ struct unsqueeze
        std::size_t p = 0;
        for(auto i : range(new_size))
        {
-            if(std::find(axes.begin(), axes.end(), i) != axes.end())
+            auto axis_idx = std::find(axes.begin(), axes.end(), i) - axes.begin();
+            if(axis_idx < axes.size())
            {
-                new_lens[i] = 1;
-                if(p == 0) // unsqueeze on the first axes
+                std::int64_t step = 1;
+                if(axis_idx < steps.size())
+                    step = steps[axis_idx];
+                if(step == 0)
+                    MIGRAPHX_THROW("UNSQUEEZE: step must be non-zero");
+                new_lens[i] = step;
+                if(p < old_strides.size())
                {
-                    new_strides[i] = old_lens[0] * old_strides[0];
+                    if((old_lens[p] % step) != 0)
+                        MIGRAPHX_THROW("UNSQUEEZE: Axis dimenstion is not divisible by step");
+                    old_lens[p] /= step;
+                    new_strides[i] = old_strides[p] * old_lens[p];
                }
-                else // unsqueeze on middle or last axes
+                else
                {
-                    new_strides[i] = (p < old_strides.size()) ? old_strides[p - 1] : 1;
+                    if(step != 1)
+                        MIGRAPHX_THROW("UNSQUEEZE: Step must be 1 for extra axes");
+                    new_strides[i] = 1;
                }
            }
            else

--- a/src/include/migraphx/ranges.hpp
+++ b/src/include/migraphx/ranges.hpp
@@ -198,6 +198,12 @@ void transform(Range&& r, Iterator it, F f)
    std::transform(r.begin(), r.end(), it, f);
 }

+template <class Range1, class Range2, class Iterator, class F>
+void transform(Range1&& r1, Range2&& r2, Iterator it, F f)
+{
+    std::transform(r1.begin(), r1.end(), r2.begin(), it, f);
+}
+
 template <class Range>
 auto reverse(Range& r)
 {

--- a/src/include/migraphx/shape.hpp
+++ b/src/include/migraphx/shape.hpp
@@ -256,6 +256,10 @@ struct shape

        std::size_t size(std::size_t n = 1) const { return sizeof(type) * n; }

+        auto is_integral() const { return std::is_integral<type>{}; }
+        auto is_signed() const { return std::is_signed<type>{}; }
+        auto is_unsigned() const { return std::is_unsigned<type>{}; }
+
        template <class U>
        type* from(U* buffer, std::size_t n = 0) const
        {

--- a/src/include/migraphx/stringutils.hpp
+++ b/src/include/migraphx/stringutils.hpp
@@ -44,8 +44,8 @@ auto with_char(F f)
    return [=](unsigned char c) -> bool { return f(c); };
 }

-inline std::string
-replace_string(std::string subject, const std::string& search, const std::string& replace)
+inline void
+replace_string_inplace(std::string& subject, const std::string& search, const std::string& replace)
 {
    size_t pos = 0;
    while((pos = subject.find(search, pos)) != std::string::npos)
@@ -53,6 +53,12 @@ replace_string(std::string subject, const std::string& search, const std::string
        subject.replace(pos, search.length(), replace);
        pos += replace.length();
    }
+}
+
+inline std::string
+replace_string(std::string subject, const std::string& search, const std::string& replace)
+{
+    replace_string_inplace(subject, search, replace);
    return subject;
 }


--- a/src/inline_module.cpp
+++ b/src/inline_module.cpp
@@ -35,7 +35,7 @@ static void inline_submodule(module& m, instruction_ref ins, bool cond)
 {
    const auto& mod_inputs = ins->module_inputs();
    module_ref smod        = cond ? mod_inputs.at(0) : mod_inputs.at(1);
-    auto mod_outputs       = m.insert_module_instructions(ins, smod);
+    auto mod_outputs       = m.insert_instructions(ins, smod);

    auto ins_outputs = ins->outputs();
    assert(mod_outputs.size() >= ins_outputs.size());

--- a/src/module.cpp
+++ b/src/module.cpp
@@ -35,6 +35,7 @@
 #include <migraphx/make_op.hpp>
 #include <migraphx/register_target.hpp>
 #include <migraphx/make_op.hpp>
+#include <migraphx/json.hpp>
 #include <iostream>
 #include <sstream>
 #include <algorithm>
@@ -196,6 +197,62 @@ void module::assign(const module& m)
    }
 }

+template <class Range>
+static std::vector<instruction_ref>
+insert_generic_instructions(module& m,
+                            instruction_ref ins,
+                            Range&& instructions,
+                            std::unordered_map<instruction_ref, instruction_ref> map_ins)
+{
+    assert(m.has_instruction(ins) or is_end(ins, m.end()));
+    std::vector<instruction_ref> mod_outputs;
+    instruction_ref last;
+    for(instruction_ref sins : instructions)
+    {
+        last = sins;
+        if(contains(map_ins, sins))
+            continue;
+        instruction_ref copy_ins;
+        if(sins->name() == "@literal")
+        {
+            auto l   = sins->get_literal();
+            copy_ins = m.add_literal(l);
+        }
+        else if(sins->name() == "@param")
+        {
+            auto&& name = any_cast<builtin::param>(sins->get_operator()).parameter;
+            auto s      = sins->get_shape();
+            copy_ins    = m.add_parameter(name, s);
+        }
+        else if(sins->name() == "@outline")
+        {
+            auto s   = sins->get_shape();
+            copy_ins = m.add_outline(s);
+        }
+        else
+        {
+            auto mod_args = sins->module_inputs();
+            auto inputs   = sins->inputs();
+            std::vector<instruction_ref> copy_inputs(inputs.size());
+            std::transform(inputs.begin(), inputs.end(), copy_inputs.begin(), [&](auto i) {
+                return contains(map_ins, i) ? map_ins[i] : i;
+            });
+
+            if(sins->name() == "@return")
+            {
+                mod_outputs = copy_inputs;
+                break;
+            }
+
+            copy_ins = m.insert_instruction(ins, sins->get_operator(), copy_inputs, mod_args);
+        }
+        map_ins[sins] = copy_ins;
+    }
+    if(mod_outputs.empty() and instructions.begin() != instructions.end())
+        mod_outputs = {map_ins.at(last)};
+    return mod_outputs;
+}
+
 instruction_ref module::add_instruction(const operation& op, std::vector<instruction_ref> args)
 {
    return insert_instruction(impl->instructions.end(), op, std::move(args));
@@ -334,61 +391,56 @@ instruction_ref module::move_instructions(instruction_ref src, instruction_ref d
    return src;
 }

-std::vector<instruction_ref> module::insert_module_instructions(
-    instruction_ref ins, module_ref m, std::unordered_map<instruction_ref, instruction_ref> map_ins)
+std::vector<instruction_ref>
+module::add_instructions(const std::vector<instruction_ref>& instructions,
+                         std::unordered_map<instruction_ref, instruction_ref> map_ins)
 {
-    std::vector<instruction_ref> mod_outputs;
-    for(auto sins : iterator_for(*m))
-    {
-        if(contains(map_ins, sins))
-            continue;
-        instruction_ref copy_ins;
-        if(sins->name() == "@literal")
-        {
-            auto l   = sins->get_literal();
-            copy_ins = this->add_literal(l);
-        }
-        else if(sins->name() == "@param")
-        {
-            auto&& name = any_cast<builtin::param>(sins->get_operator()).parameter;
-            auto s      = sins->get_shape();
-            copy_ins    = this->add_parameter(name, s);
-        }
-        else if(sins->name() == "@outline")
-        {
-            auto s   = sins->get_shape();
-            copy_ins = this->add_outline(s);
-        }
-        else
-        {
-            auto mod_args = sins->module_inputs();
-            auto inputs   = sins->inputs();
-            std::vector<instruction_ref> copy_inputs(inputs.size());
-            std::transform(inputs.begin(), inputs.end(), copy_inputs.begin(), [&](auto i) {
-                return contains(map_ins, i) ? map_ins[i] : i;
-            });
+    return this->insert_instructions(this->end(), instructions, std::move(map_ins));
+}

-            if(sins->name() == "@return")
-            {
-                mod_outputs = copy_inputs;
-                break;
-            }
+std::vector<instruction_ref>
+module::add_instructions(const_module_ref m,
+                         std::unordered_map<instruction_ref, instruction_ref> map_ins)
+{
+    return this->insert_instructions(this->end(), m, std::move(map_ins));
+}

-            copy_ins = this->insert_instruction(ins, sins->get_operator(), copy_inputs, mod_args);
-        }
-        map_ins[sins] = copy_ins;
-    }
-    if(mod_outputs.empty())
-        mod_outputs = {map_ins.at(std::prev(m->end()))};
-    return mod_outputs;
+std::vector<instruction_ref>
+module::add_instructions(instruction_ref start,
+                         instruction_ref last,
+                         std::unordered_map<instruction_ref, instruction_ref> map_ins)
+{
+    return this->insert_instructions(this->end(), start, last, std::move(map_ins));
 }

-instruction_ref module::add_literal(literal l)
+std::vector<instruction_ref>
+module::insert_instructions(instruction_ref ins,
+                            const std::vector<instruction_ref>& instructions,
+                            std::unordered_map<instruction_ref, instruction_ref> map_ins)
 {
-    impl->emplace_front(std::move(l));
-    return impl->instructions.begin();
+    return insert_generic_instructions(*this, ins, instructions, std::move(map_ins));
+}
+
+std::vector<instruction_ref>
+module::insert_instructions(instruction_ref ins,
+                            const_module_ref m,
+                            std::unordered_map<instruction_ref, instruction_ref> map_ins)
+{
+    return insert_generic_instructions(*this, ins, iterator_for(*m), std::move(map_ins));
+}
+
+std::vector<instruction_ref>
+module::insert_instructions(instruction_ref ins,
+                            instruction_ref start,
+                            instruction_ref last,
+                            std::unordered_map<instruction_ref, instruction_ref> map_ins)
+{
+    auto r = range(start, last);
+    return insert_generic_instructions(*this, ins, iterator_for(r), std::move(map_ins));
 }

+instruction_ref module::add_literal(literal l) { return insert_literal(begin(), std::move(l)); }
+
 instruction_ref module::add_outline(const shape& s)
 {
    impl->push_front({builtin::outline{s}, s, {}});
@@ -397,10 +449,7 @@ instruction_ref module::add_outline(const shape& s)

 instruction_ref module::add_parameter(std::string name, shape s)
 {
-    assert(get_parameter_shape(name) == shape{});
-    impl->push_front({builtin::param{std::move(name), impl->nparams}, std::move(s), {}});
-    impl->nparams++;
-    return impl->instructions.begin();
+    return insert_parameter(begin(), std::move(name), std::move(s));
 }

 instruction_ref module::add_return(std::vector<instruction_ref> args)
@@ -413,6 +462,20 @@ instruction_ref module::add_return(std::vector<instruction_ref> args)
    return result;
 }

+instruction_ref module::insert_literal(instruction_ref ins, literal l)
+{
+    impl->emplace(ins, std::move(l));
+    return std::prev(ins);
+}
+
+instruction_ref module::insert_parameter(instruction_ref ins, std::string name, shape s)
+{
+    assert(get_parameter_shape(name) == shape{});
+    impl->insert(ins, {builtin::param{std::move(name), impl->nparams}, std::move(s), {}});
+    impl->nparams++;
+    return std::prev(ins);
+}
+
 instruction_ref module::replace_return(std::vector<instruction_ref> args)
 {
    auto last = std::prev(this->end());
@@ -706,44 +769,33 @@ void module::print_graph(std::ostream& os, bool brief) const
    os << "}" << std::endl;
 }

-static std::string cpp_var_name(const std::string& name)
+static std::string to_c_id(const std::string& name, char rep = '_')
 {
-    return "m" + replace_string(name, "@", "x");
+    std::string id = transform_string(name, [&](auto c) {
+        if(with_char(::isalnum)(c) or c == '_')
+            return c;
+        return rep;
+    });
+    while(contains(id, "__"))
+        replace_string_inplace(id, "__", "_");
+    return id;
 }

-static std::string cpp_op_var(const std::string& name, instruction_ref ins)
+static std::string cpp_var_name(const std::string& name)
 {
-    return replace_string(name, "@", ins->name());
+    return to_c_id("x_" + replace_string(name, ":", "_module_"));
 }

-static void print_op_attributes(std::ostream& os, const std::string& name, const operation& op)
+static void print_make_op(std::ostream& os, const operation& op)
 {
-    std::string x = to_string(op);
-    if(contains(x, "["))
+    os << "migraphx::make_op(" << enclose_name(op.name());
+    auto v = op.to_value();
+    if(not v.empty())
    {
-        auto start                 = x.find('[');
-        auto end                   = x.find(']');
-        std::string attribute_text = x.substr(start + 1, end - start - 1);
-        std::vector<std::string> attributes;
-        for(auto&& attribute : split_string(attribute_text, ','))
-        {
-            if(contains(attribute, '='))
-                attributes.push_back(attribute);
-            else
-                attributes.back() += "," + attribute;
-        }
-        for(auto&& attribute : attributes)
-        {
-            auto p     = split_string(attribute, '=');
-            auto key   = p.front();
-            auto value = p.back();
-            if(contains({"bn_mode", "padding_mode"}, key))
-                continue;
-            if(key == "mode")
-                value = enclose_name(trim(value));
-            os << name << "." << key << " = " << value << ";" << std::endl;
-        }
+        os << ", "
+           << "migraphx::from_json_string(" << enclose_name(to_json_string(v)) << ")";
    }
+    os << ")";
 }

 static void print_cpp_shape(std::ostream& os, const migraphx::shape& s)
@@ -756,22 +808,25 @@ static void print_cpp_shape(std::ostream& os, const migraphx::shape& s)
 }

 std::unordered_map<instruction_ref, std::string>
-module::print_cpp(std::ostream& os, std::unordered_map<instruction_ref, std::string> names) const
+module::print_cpp(std::ostream& os,
+                  const std::string& mname,
+                  std::unordered_map<instruction_ref, std::string> names) const
 {
-    os << "migraphx::module p;" << std::endl;
-    unsigned long seed = 0;
+    // cppcheck-suppress variableScope
+    unsigned long seed = names.size();
+    auto last          = std::prev(this->end());
    names              = this->print(
        [&](auto ins, auto ins_names) {
-            auto op = cpp_op_var(ins_names.at(ins), ins);
-            if(ins->name().front() != '@')
-            {
-                os << "migraphx::op::" << ins->name() << " " << op << ";" << std::endl;
-                print_op_attributes(os, op, ins->get_operator());
-            }
-            os << "auto " << cpp_var_name(ins_names.at(ins)) << " = ";
+            std::vector<std::string> input_vars;
+            std::transform(ins->inputs().begin(),
+                           ins->inputs().end(),
+                           std::back_inserter(input_vars),
+                           [&](auto input) { return cpp_var_name(ins_names.at(input)); });
+            if(ins != last)
+                os << "auto " << cpp_var_name(ins_names.at(ins)) << " = ";
            if(ins->name() == "@literal")
            {
-                os << "p.add_literal(";
+                os << mname << "->add_literal(";
                bool use_abs = false;
                ins->get_literal().visit([&](auto v) {
                    use_abs = std::none_of(v.begin(), v.end(), [](auto x) { return x < 0; });
@@ -789,17 +844,22 @@ module::print_cpp(std::ostream& os, std::unordered_map<instruction_ref, std::str
            else if(ins->name() == "@param")
            {
                std::string name = any_cast<builtin::param>(ins->get_operator()).parameter;
-                os << "p.add_parameter(" << enclose_name(name) << ",";
+                os << mname << "->add_parameter(" << enclose_name(name) << ",";
                print_cpp_shape(os, ins->get_shape());
                os << ");" << std::endl;
            }
+            else if(ins->name() == "@return")
+            {
+                os << mname << "->add_return({";
+                os << join_strings(input_vars, ", ");
+                os << "});" << std::endl;
+            }
            else
            {
-                os << "p.add_instruction(" << op;
-                for(auto input : ins->inputs())
-                {
-                    os << ", " << cpp_var_name(ins_names.at(input));
-                }
+                assert(ins->name().front() != '@');
+                os << mname << "->add_instruction(";
+                print_make_op(os, ins->get_operator());
+                os << ", " << join_strings(input_vars, ", ");
                os << ");" << std::endl;
            }
        },
@@ -808,7 +868,7 @@ module::print_cpp(std::ostream& os, std::unordered_map<instruction_ref, std::str
    return names;
 }

-void module::print_cpp(std::ostream& os) const { this->print_cpp(os, {}); }
+void module::print_cpp(std::ostream& os) const { this->print_cpp(os, this->name(), {}); }

 void module::annotate(std::ostream& os, std::function<void(instruction_ref)> a) const
 {

--- a/src/program.cpp
+++ b/src/program.cpp
@@ -504,12 +504,14 @@ static void mod_from_val(module_ref mod,

        if(name == "@param")
        {
-            output = mod->add_parameter(fields["parameter"].to<std::string>(),
-                                        migraphx::from_value<shape>(node.at("shape")));
+            output = mod->insert_parameter(mod->end(),
+                                           fields["parameter"].to<std::string>(),
+                                           migraphx::from_value<shape>(node.at("shape")));
        }
        else if(name == "@literal")
        {
-            output = mod->add_literal(migraphx::from_value<literal>(node.at("literal")));
+            output =
+                mod->insert_literal(mod->end(), migraphx::from_value<literal>(node.at("literal")));
        }
        else
        {
@@ -544,11 +546,11 @@ static void mod_from_val(module_ref mod,
            }
            else if(module_inputs.empty())
            {
-                output = mod->add_instruction(op, inputs);
+                output = mod->insert_instruction(mod->end(), op, inputs);
            }
            else
            {
-                output = mod->add_instruction(op, inputs, module_inputs);
+                output = mod->insert_instruction(mod->end(), op, inputs, module_inputs);
            }
        }
        output->set_normalized(normalized);
@@ -790,10 +792,17 @@ void program::print_cpp(std::ostream& os) const
 {
    auto vec_modules = this->get_modules();
    std::unordered_map<instruction_ref, std::string> names;
+    os << "migraphx::program p;\n";
    for(auto& mod : vec_modules)
    {
-        os << "module: \"" << mod->name() << "\"" << std::endl;
-        names = mod->print_cpp(os, names);
+        std::string var_name = "m" + mod->name();
+        os << "migraphx::module_ref " << var_name << " = ";
+        if(mod->name() == "main")
+            os << "p.get_main_module();";
+        else
+            os << "p.create_module(\"" << mod->name() << "\");";
+        os << std::endl;
+        names = mod->print_cpp(os, var_name, names);
        os << std::endl;
    }
 }

--- a/src/serialize.cpp
+++ b/src/serialize.cpp
@@ -36,7 +36,7 @@ void raw_data_to_value(value& v, const RawData& rd)
    result["shape"] = migraphx::to_value(rd.get_shape());
    if(rd.get_shape().type() == shape::tuple_type)
        result["sub"] = migraphx::to_value(rd.get_sub_objects());
-    else
+    else if(not rd.empty())
        result["data"] = migraphx::value::binary(rd.data(), rd.get_shape().bytes());
    v = result;
 }
@@ -56,7 +56,7 @@ void migraphx_from_value(const value& v, argument& a)
        literal l = migraphx::from_value<literal>(v);
        a         = l.get_argument();
    }
-    else
+    else if(v.contains("sub"))
    {
        a = migraphx::from_value<std::vector<argument>>(v.at("sub"));
    }

--- a/src/simplify_reshapes.cpp
+++ b/src/simplify_reshapes.cpp
@@ -272,7 +272,7 @@ struct find_concat_transpose
 {
    auto matcher() const
    {
-        return match::name("concat")(match::all_of[match::inputs()](match::transpose_shape()));
+        return match::name("concat")(match::all_of[match::inputs()](match::name("transpose")));
    }

    void apply(module& m, const match::matcher_result& mr) const
@@ -601,6 +601,69 @@ struct find_transpose_contiguous_reshaper_unary
    }
 };

+struct find_slice_transpose // Rewrites sibling slice->transpose chains into one shared transpose followed by the slices
+{
+    auto matcher() const // Matches any instruction having a "slice" output that in turn feeds a "transpose"
+    {
+        return match::any(match::any_of[match::outputs()](
+            match::name("slice")(match::output(match::name("transpose")))));
+    }
+
+    static std::vector<int64_t> find_common_perm(const std::vector<instruction_ref>& transposes) // Returns the permutation with the highest occurrence count among the given transposes
+    {
+        std::map<std::vector<int64_t>, int64_t> count; // permutation -> number of transposes using it
+        for(auto t : transposes)
+        {
+            auto perm = t->get_operator().to_value()["permutation"].to_vector<int64_t>();
+            count[perm]++;
+        }
+        return std::max_element(
+                   count.begin(), count.end(), by(std::less<>{}, [](auto&& p) { return p.second; })) // compare map entries by their count
+            ->first;
+    }
+
+    void apply(module& m, const match::matcher_result& r) const // Inserts the common transpose after the matched instruction and re-creates each slice on top of it
+    {
+        auto ins = r.result;
+        std::vector<instruction_ref> splits; // slice outputs of ins whose only output is a transpose
+        std::copy_if(ins->outputs().begin(),
+                     ins->outputs().end(),
+                     std::back_inserter(splits),
+                     [&](instruction_ref out) {
+                         return out->name() == "slice" and out->outputs().size() == 1 and
+                                out->outputs().front()->name() == "transpose";
+                     });
+        if(splits.size() < 2) // the rewrite is only applied when at least two slices qualify
+            return;
+        std::vector<instruction_ref> transposes; // transposes[i] is the single consumer of splits[i]
+        std::transform(splits.begin(),
+                       splits.end(),
+                       std::back_inserter(transposes),
+                       [](auto split) { return split->outputs().front(); });
+        auto perm  = find_common_perm(transposes);
+        auto iperm = invert_permutation(perm);
+        auto pre   = m.insert_instruction( // transpose of ins with the most common permutation, placed right after ins
+            std::next(ins), make_op("transpose", {{"permutation", perm}}), ins);
+        for(auto i : range(transposes.size()))
+        {
+            auto split = splits[i];
+            auto t     = transposes[i];
+            auto op    = any_cast<op::slice>(split->get_operator()); // copy of the slice operator so its axes can be rewritten
+            std::transform(op.axes.begin(), op.axes.end(), op.axes.begin(), [&](auto axis) { // remap each slice axis through the inverse permutation so it addresses the transposed input
+                return iperm[axis];
+            });
+            auto new_ins = m.insert_instruction(t, op, pre); // slice now reads from the shared transpose
+            if(t->get_operator() != pre->get_operator()) // this transpose differs from the common one, so a follow-up transpose is still needed
+            {
+                auto curr = t->get_operator().to_value()["permutation"].to_vector<int64_t>();
+                new_ins   = m.insert_instruction(
+                    t, make_op("transpose", {{"permutation", reorder_dims(iperm, curr)}}), new_ins); // residual permutation computed by reorder_dims(iperm, curr)
+            }
+            m.replace_instruction(t, new_ins); // consumers of the old transpose now use the rewritten chain
+        }
+    }
+};
+
 void simplify_reshapes::apply(module& m) const
 {
    for(int i = 0; i < 2; i++)
@@ -616,6 +679,7 @@ void simplify_reshapes::apply(module& m) const
                            find_nested_convert{},
                            find_nested_slice{},
                            find_nested_concat{},
+                            find_slice_transpose{},
                            find_transpose_contiguous_reshaper_unary{});
        dead_code_elimination{}.apply(m);
    }

--- a/src/targets/cpu/write_literals.cpp
+++ b/src/targets/cpu/write_literals.cpp
@@ -25,6 +25,7 @@
 #include <migraphx/module.hpp>
 #include <migraphx/instruction.hpp>
 #include <migraphx/iterator_for.hpp>
+#include <migraphx/register_op.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -52,6 +53,7 @@ struct cpu_literal
        return os;
    }
 };
+MIGRAPHX_REGISTER_OP(cpu_literal);

 void write_literals::apply(module& m) const
 {

--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -164,6 +164,7 @@ add_library(migraphx_gpu
    deconvolution.cpp
    device_name.cpp
    elu.cpp
+    fuse_mlir.cpp
    fuse_ops.cpp
    gather.cpp
    gemm_impl.cpp
@@ -176,7 +177,7 @@ add_library(migraphx_gpu
    loop.cpp
    lrn.cpp
    leaky_relu.cpp
-    mlir_conv.cpp
+    mlir.cpp
    multinomial.cpp
    nonzero.cpp
    pack_args.cpp
@@ -320,16 +321,26 @@ message(STATUS "extractkernel: ${MIGRAPHX_EXTRACT_KERNEL}")

 set(MIGRAPHX_ENABLE_MLIR OFF CACHE BOOL "")
 if(MIGRAPHX_ENABLE_MLIR)
-    find_library(LIBMLIRMIOPEN MLIRMIOpenThin REQUIRED)
+    find_library(MLIRAPI_LIBRARY MLIRMIOpen 
+        PATH_SUFFIXES
+        # Workaround broken mlir install
+        lib/ lib/lib)
    # REQUIRED is not supported before cmake 3.18
-    if(NOT LIBMLIRMIOPEN)
-        message(FATAL_ERROR "libMLIRMIOpenThin not found")
+    if(NOT MLIRAPI_LIBRARY)
+        message(FATAL_ERROR "libMLIRMIOpen not found")
    else()
-        message(STATUS "Build with libMLIRMIOpenThin: " ${LIBMLIRMIOPEN})
+        message(STATUS "Build with libMLIRMIOpen: " ${MLIRAPI_LIBRARY})
    endif()

-    target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_MLIR_MIOPEN_SUPPORT")
-    target_link_libraries(migraphx_gpu PUBLIC ${LIBMLIRMIOPEN})
+    find_path(MLIRAPI_HEADERS NAMES mlir-c/Dialect/MIGraphX.h)
+    # Workaround MLIR broken installation
+    find_path(MLIRAPI_HEADERS2 NAMES mlir-c/Registration.h
+        PATH_SUFFIXES 
+        include/external/include external/include)
+
+    target_compile_definitions(migraphx_gpu PRIVATE "-DMIGRAPHX_MLIR")
+    target_include_directories(migraphx_gpu SYSTEM PRIVATE ${MLIRAPI_HEADERS} ${MLIRAPI_HEADERS2})
+    target_link_libraries(migraphx_gpu PUBLIC ${MLIRAPI_LIBRARY})
 endif()

 set(MIGRAPHX_USE_HIPRTC OFF CACHE BOOL "")

--- a/src/targets/gpu/code_object_op.cpp
+++ b/src/targets/gpu/code_object_op.cpp
@@ -52,7 +52,7 @@ code_object_op::compute(context& ctx, const shape&, const std::vector<argument>&
    std::transform(
        args.begin(), args.end(), kargs.begin(), [](const argument& a) { return a.data(); });
    k.launch(ctx.get_stream().get(), global, local, std::move(kargs));
-    return args.back();
+    return args[get_output_arg(args.size())];
 }
 void code_object_op::finalize(context&, const shape&, const std::vector<shape>&)
 {

--- a/src/targets/gpu/compile_gen.cpp
+++ b/src/targets/gpu/compile_gen.cpp
@@ -43,6 +43,9 @@ static std::vector<std::size_t> vector_sizes(const std::vector<shape>& inputs)

 vectorize vectorize::elements(std::size_t axis, const std::vector<shape>& inputs)
 {
+    if(std::all_of(
+           inputs.begin(), inputs.end(), [&](const auto& s) { return s.lens()[axis] == 1; }))
+        return {1, axis};
    auto sizes = vector_sizes(inputs);
    std::vector<std::size_t> max_vec_size;
    std::transform(inputs.begin(),

--- a/src/targets/gpu/deconvolution.cpp
+++ b/src/targets/gpu/deconvolution.cpp
@@ -59,31 +59,30 @@ argument miopen_deconvolution::compute(context& ctx,
    auto w_desc = make_tensor(reshape_if_1d(args[1].get_shape()));
    auto y_desc = make_tensor(reshape_if_1d(output_shape));

-    float alpha = 1;
-    float beta  = 0;
-    auto status = miopenConvolutionForward(ctx.get_stream().get_miopen(),
-                                           &alpha,
-                                           x_desc.get(),
-                                           args[0].implicit(),
-                                           w_desc.get(),
-                                           args[1].implicit(),
-                                           cd.get(),
-                                           algo,
-                                           &beta,
-                                           y_desc.get(),
-                                           args[3].implicit(),
-                                           args[2].implicit(),
-                                           args[2].get_shape().bytes());
+    if(solution_id == 0)
+        MIGRAPHX_THROW("MIOpen Deconvolution: invalid solution ID");
+
+    auto status = miopenConvolutionForwardImmediate(ctx.get_stream().get_miopen(),
+                                                    w_desc.get(),
+                                                    args[1].implicit(),
+                                                    x_desc.get(),
+                                                    args[0].implicit(),
+                                                    cd.get(),
+                                                    y_desc.get(),
+                                                    args[3].implicit(),
+                                                    args[2].implicit(),
+                                                    args[2].get_shape().bytes(),
+                                                    solution_id);
+
    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("Running deconvolution failed");
+        MIGRAPHX_THROW("MIOpen Deconvolution: running convolution failed");
    return args[3];
 }

-shape miopen_deconvolution::compile(context& ctx,
-                                    const shape& output_shape,
-                                    std::vector<shape> inputs)
+shape miopen_deconvolution::find(context& ctx, const shape& output_shape, std::vector<shape> inputs)
 {
    shape workspace_shape{};
+
    auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
    auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
    auto y_desc = make_tensor(reshape_if_1d(output_shape));
@@ -119,9 +118,35 @@ shape miopen_deconvolution::compile(context& ctx,
                                                        workspace_size,
                                                        false);
    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("Find deconvolution failed");
-    handle = ctx.get_stream().get_miopen();
-    algo   = perf.fwd_algo;
+        MIGRAPHX_THROW("MIOpen Deconvolution: find convolution failed");
+    algo = perf.fwd_algo;
+
+    size_t solution_count;
+
+    status = miopenConvolutionForwardGetSolutionCount(ctx.get_stream().get_miopen(),
+                                                      w_desc.get(),
+                                                      x_desc.get(),
+                                                      cd.get(),
+                                                      y_desc.get(),
+                                                      &solution_count);
+    if(status != miopenStatusSuccess)
+        MIGRAPHX_THROW("MIOpen Deconvolution: get solution count failed");
+
+    std::vector<miopenConvSolution_t> solutions(solution_count);
+
+    status = miopenConvolutionForwardGetSolution(ctx.get_stream().get_miopen(),
+                                                 w_desc.get(),
+                                                 x_desc.get(),
+                                                 cd.get(),
+                                                 y_desc.get(),
+                                                 solution_count,
+                                                 &solution_count,
+                                                 solutions.data());
+    if(status != miopenStatusSuccess)
+        MIGRAPHX_THROW("MIOpen Deconvolution: get solution failed");
+
+    solution_id = solutions.front().solution_id;
+
    return shape{shape::int8_type, {perf.memory}};
 }

@@ -129,13 +154,29 @@ void miopen_deconvolution::finalize(context& ctx,
                                    const shape& output_shape,
                                    std::vector<shape> inputs)
 {
-    if(handle == ctx.get_stream().get_miopen())
-        return;
-    // Check that workspace hasn't changed
-    auto size = inputs.at(2).bytes();
-    auto ws   = compile(ctx, output_shape, std::move(inputs));
-    if(ws.bytes() > size)
-        MIGRAPHX_THROW("Workspace has changed during finalization.");
+    if(cd == nullptr)
+        cd = make_deconv(op);
+    if(solution_id == 0)
+    {
+        // Check that workspace hasn't changed
+        auto size = inputs.at(2).bytes();
+        auto ws   = find(ctx, output_shape, inputs);
+        if(ws.bytes() > size)
+            MIGRAPHX_THROW("MIOpen Deconvolution: workspace has changed during finalization.");
+    }
+
+    auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
+    auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
+    auto y_desc = make_tensor(reshape_if_1d(output_shape));
+
+    auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
+                                                          w_desc.get(),
+                                                          x_desc.get(),
+                                                          cd.get(),
+                                                          y_desc.get(),
+                                                          solution_id);
+    if(status != miopenStatusSuccess)
+        MIGRAPHX_THROW("MIOpen Deconvolution: compile solution failed");
 }

 } // namespace gpu

--- a/src/targets/gpu/fuse_mlir.cpp
+++ b/src/targets/gpu/fuse_mlir.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/gpu/fuse_mlir.hpp>
+#include <migraphx/gpu/mlir.hpp>
+#include <migraphx/matcher.hpp>
+#include <migraphx/pass_manager.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/register_op.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+struct module;
+
+namespace gpu {
+
+#ifdef MIGRAPHX_MLIR
+struct mlir_conv // Operator named "gpu::mlir_conv" that wraps another operation (default: "convolution") and expects one submodule
+{
+    operation op = make_op("convolution"); // wrapped operation used for shape computation
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f) // exposes the wrapped operation under the field name "op"
+    {
+        return pack(f(self.op, "op"));
+    }
+
+    std::string name() const { return "gpu::mlir_conv"; }
+    shape compute_shape(std::vector<shape> inputs, const std::vector<module_ref>& mods) const // Returns the wrapped op's output shape for the last two inputs; throws unless there is exactly one submodule and at least two inputs
+    {
+        check_shapes{inputs, *this}.standard();
+        if(mods.size() != 1)
+            MIGRAPHX_THROW("should have one submodule.");
+        if(inputs.size() < 2)
+            MIGRAPHX_THROW("should have at least two inputs.");
+        auto n = inputs.size();
+        return op.compute_shape({inputs[n - 2], inputs[n - 1]}); // only the trailing two inputs are passed to the wrapped op
+    }
+};
+MIGRAPHX_REGISTER_OP(mlir_conv);
+
+namespace {
+struct find_conv_pointwise // Replaces a pointwise instruction fed by a convolution with a single mlir_conv carrying a fused submodule
+{
+    // Find a convolution followed by a pointwise operation.
+    auto matcher() const
+    {
+        auto convolution =
+            match::skip(match::name("contiguous"))(match::name("convolution").bind("convolution")); // "contiguous" instructions between the pointwise and the convolution are skipped
+        return match::name("pointwise")(match::any_of[match::inputs()](convolution.bind("x")));
+    }
+
+    void apply(module_pass_manager& mpm, const match::matcher_result& r) const // Builds the fused module and swaps the matched pointwise for mlir_conv; returns early when the whitelist or fp32 checks fail
+    {
+        auto ins      = r.result;
+        auto conv_ins = r.instructions["convolution"];
+        auto x_ins    = r.instructions["x"]; // input after contiguous
+        auto* pm      = ins->module_inputs().front(); // first submodule of the matched pointwise instruction
+        auto names    = pm->get_parameter_names();
+        // Whitelist pointwise operators
+        if(std::any_of(pm->begin(), pm->end(), [](const auto& i) {
+               return not contains({"@literal", "@param", "@return", "convolution", "add", "relu"},
+                                   i.name());
+           }))
+            return;
+        // Only fuse with fp32 for now
+        if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [&](auto i) {
+               return i->get_shape().type() != shape::type_t::float_type;
+           }))
+            return;
+        std::sort(names.begin(), names.end()); // sorted names are paired positionally with ins->inputs() in the transform below
+        module_ref mm = mpm.create_module("mlir_" + pm->name());
+        mm->set_bypass();
+        std::unordered_map<instruction_ref, instruction_ref> param_map; // pointwise-module parameter -> its replacement inside mm
+        auto x    = mm->add_parameter("x" + std::to_string(names.size()), // convolution operands become extra parameters numbered after the existing names
+                                   conv_ins->inputs().at(0)->get_shape());
+        auto w    = mm->add_parameter("x" + std::to_string(names.size() + 1),
+                                   conv_ins->inputs().at(1)->get_shape());
+        auto conv = mm->add_instruction(conv_ins->get_operator(), {x, w}); // the convolution is re-created inside the fused module
+        std::transform(names.begin(),
+                       names.end(),
+                       ins->inputs().begin(),
+                       std::inserter(param_map, param_map.end()),
+                       [&](auto name, auto input) {
+                           if(input == x_ins) // the parameter that carried the convolution result maps to the in-module convolution
+                               return std::make_pair(pm->get_parameter(name), conv);
+                           return std::make_pair(pm->get_parameter(name),
+                                                 mm->add_parameter(name, input->get_shape())); // every other parameter is re-declared in mm under the same name
+                       });
+        mm->add_return(mm->insert_instructions(mm->end(), pm, param_map)); // copy the pointwise body into mm and return its results
+
+        std::vector<instruction_ref> inputs;
+        std::copy_if(ins->inputs().begin(),
+                     ins->inputs().end(),
+                     std::back_inserter(inputs),
+                     [&](auto input) { return input != conv_ins; }); // NOTE(review): this filters on conv_ins while the mapping above tests x_ins; confirm they agree when a contiguous was skipped
+        inputs.insert(inputs.end(), conv_ins->inputs().begin(), conv_ins->inputs().end()); // convolution operands go last, which is where mlir_conv::compute_shape reads them
+        mpm.get_module().replace_instruction(
+            ins, mlir_conv{conv_ins->get_operator()}, inputs, {mm});
+    }
+};
+} // namespace
+
+#endif
+
+void fuse_mlir::apply(module_pass_manager& mpm) const // Runs find_conv_pointwise over the module when MIGRAPHX_MLIR is defined; otherwise does nothing
+{
+#ifdef MIGRAPHX_MLIR
+    match::find_matches(mpm, find_conv_pointwise{});
+#else
+    (void)mpm; // mpm is otherwise unreferenced in this branch
+#endif
+}
+
+} // namespace gpu
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/fuse_ops.cpp
+++ b/src/targets/gpu/fuse_ops.cpp
@@ -336,6 +336,7 @@ void move_standard_front(std::vector<instruction_ref>& args)

 auto gpu_name(const std::string& s) { return match::name("gpu::" + s); }

+namespace {
 struct find_layernorm
 {
    auto matcher() const { return match::layernorm(&gpu_name); }
@@ -836,15 +837,6 @@ inline auto precompile_name(std::string s) // NOLINT
    });
 }

-template <class... Ms>
-auto conv_bias_pointwise(Ms... ms)
-{
-    return precompile_name("pointwise")(
-        match::either_arg(0, 1)(bias_shape(match::used_once()).bind("bias"),
-                                fusable_conv(match::used_once()).bind("conv")),
-        ms...);
-}
-
 struct find_conv_bias
 {
    context* ctx = nullptr;
@@ -1013,6 +1005,7 @@ struct find_commutative_broadcast
        m.replace_instruction(ins, ins->get_operator(), args);
    }
 };
+} // namespace

 struct find_contiguous
 {

--- a/src/targets/gpu/include/migraphx/gpu/code_object_op.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/code_object_op.hpp
@@ -38,12 +38,13 @@ struct context;

 struct code_object_op
 {
-    value::binary code_object;
-    std::string symbol_name;
-    std::size_t global;
-    std::size_t local;
-    std::vector<shape> expected_inputs;
-    shape output;
+    value::binary code_object{};
+    std::string symbol_name = "";
+    std::size_t global      = 0;
+    std::size_t local       = 0;
+    std::vector<shape> expected_inputs{};
+    shape output{};
+    std::int64_t output_arg = -1;
    kernel k{};

    template <class Self, class F>
@@ -66,9 +67,13 @@ struct code_object_op
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
    void finalize(context&, const shape&, const std::vector<shape>&);
+    std::int64_t get_output_arg(std::size_t n) const // Resolves output_arg against a count n: negative values are offsets from n, non-negative values are returned as-is
+    {
+        return output_arg < 0 ? n + output_arg : output_arg; // with the default output_arg of -1 this yields n - 1
+    }
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
-        return shapes.size() - 1;
+        return get_output_arg(shapes.size()); // index is resolved from output_arg instead of always being the last shape
    }

    friend std::ostream& operator<<(std::ostream& os, const code_object_op& op)

--- a/src/targets/gpu/include/migraphx/gpu/deconvolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/deconvolution.hpp
@@ -39,20 +39,20 @@ struct miopen_deconvolution
    op::deconvolution op;
    shared<convolution_descriptor> cd;
    miopenConvFwdAlgorithm_t algo{};
-    miopenHandle_t handle = nullptr;
+    uint64_t solution_id = 0;

    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
-        // TODO: Add algo
-        return op::convolution::reflect(self.op, f);
+        return pack_join(op::deconvolution::reflect(self.op, f), // fields reflected by the wrapped deconvolution op
+                         pack(f(self.solution_id, "solution_id"))); // plus solution_id under the name "solution_id"
    }

    std::string name() const { return "gpu::deconv"; }
    shape compute_shape(const std::vector<shape>& inputs) const;
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    shape compile(context& ctx, const shape& output_shape, std::vector<shape> inputs);
+    shape find(context& ctx, const shape& output_shape, std::vector<shape> inputs);
    void finalize(context& ctx, const shape& output_shape, std::vector<shape> inputs);
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {