Merge branch 'develop' into fuse-horiz-contiguous

b8deb54c · Paul Fultz II · GitHub · fee84355 · ca8a54fe · b8deb54c
Unverified Commit b8deb54c authored Jul 03, 2022 by Paul Fultz II Committed by GitHub Jul 03, 2022
11 changed files
--- a/src/targets/gpu/kernels/include/migraphx/kernels/algorithm.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/algorithm.hpp
@@ -49,7 +49,7 @@ constexpr T accumulate(InputIt first, InputIt last, T init, BinaryOperation op)
 {
    for(; first != last; ++first)
    {
-        init = op(std::move(init), *first);
+        init = op(static_cast<T&&>(init), *first);
    }
    return init;
 }
@@ -64,6 +64,20 @@ constexpr OutputIt copy(InputIt first, InputIt last, OutputIt d_first)
    return d_first;
 }

+// Copies the elements of [first, last) accepted by pred to d_first, keeping their order.
+// Returns the iterator one past the last element written.
+template <class InputIt, class OutputIt, class UnaryPredicate>
+constexpr OutputIt copy_if(InputIt first, InputIt last, OutputIt d_first, UnaryPredicate pred)
+{
+    while(first != last)
+    {
+        if(pred(*first))
+            *d_first++ = *first;
+        ++first;
+    }
+    return d_first;
+}
+
 template <class Iterator, class Compare>
 constexpr Iterator is_sorted_until(Iterator first, Iterator last, Compare comp)
 {
@@ -115,6 +129,24 @@ constexpr Iterator find(Iterator first, Iterator last, const T& value)
    return find_if(first, last, [&](const auto& x) { return x == value; });
 }

+template <class InputIt, class UnaryPredicate>
+constexpr bool any_of(InputIt first, InputIt last, UnaryPredicate p)
+{
+    return last != find_if(first, last, p); // true when find_if stops before last
+}
+
+template <class InputIt, class UnaryPredicate>
+constexpr bool none_of(InputIt first, InputIt last, UnaryPredicate p)
+{
+    return last == find_if(first, last, p); // true when no element satisfies p
+}
+
+template <class InputIt, class UnaryPredicate>
+constexpr bool all_of(InputIt first, InputIt last, UnaryPredicate p)
+{
+    return last == find_if(first, last, [&](auto&& x) { return not p(x); }); // no counterexample
+}
+
 template <class Iterator1, class Iterator2>
 constexpr Iterator1 search(Iterator1 first, Iterator1 last, Iterator2 s_first, Iterator2 s_last)
 {

--- a/src/targets/gpu/kernels/include/migraphx/kernels/pointwise.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/pointwise.hpp
@@ -41,8 +41,15 @@ struct implicit_conversion_op
    template <index_int N, class U>
    constexpr operator vec<U, N>() const
    {
-        static_assert(vec_size<T>() == N, "Vector mismatch size");
-        return __builtin_convertvector(x, vec<U, N>);
+        if constexpr(vec_size<T>() == 0)
+        {
+            return x;
+        }
+        else
+        {
+            static_assert(vec_size<T>() == N, "Vector mismatch size");
+            return __builtin_convertvector(x, vec<U, N>);
+        }
    }

    template <class U>

--- a/src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/shape.hpp
@@ -44,7 +44,7 @@ struct shape

    constexpr auto element_space() const { return _c<Strides{}.dot(Lens{} - 1) + 1>; }

-    constexpr auto packed() const { return elements() == element_space(); }
+    constexpr auto packed() const { return not skips() and elements() == element_space(); } // also needs a stride equal to 1
    constexpr auto broadcasted() const { return _c<Strides{}.product() == 0>; }
    constexpr auto transposed() const
    {
@@ -53,16 +53,9 @@ struct shape
            if(shape{}.broadcasted())
            {
                index_array s{};
-                index_int j = 0;
-                for(index_int i = 0; i < s.size(); i++)
-                {
-                    if(lstrides[i] != 0)
-                    {
-                        s[j] = lstrides[i];
-                        j++;
-                    }
-                }
-                return not is_sorted(s.begin(), s.begin() + j, greater{});
+                auto out = copy_if(
+                    lstrides.begin(), lstrides.end(), s.begin(), [](auto x) { return x != 0; });
+                return not is_sorted(s.begin(), out, greater{});
            }
            else
            {
@@ -70,6 +63,13 @@ struct shape
            }
        });
    }
+    constexpr auto skips() const // true when no stride is equal to 1
+    {
+        return return_c([] {
+            auto s = Strides{};
+            return find_if(s.begin(), s.end(), [](auto v) { return v == 1; }) == s.end();
+        });
+    }

    constexpr auto standard() const { return packed() and not transposed(); }

@@ -86,26 +86,34 @@ struct shape
    constexpr index_int index(index_int i) const
    {
        if(this->standard())
+        {
+            MIGRAPHX_ASSERT(i == compute_index(i));
            return i;
+        }
        else
        {
-            const auto rank  = this->lens.size();
-            index_int s      = 1;
-            index_int result = 0;
-            for(index_int j = 0; j < rank; j++)
-            {
-                const index_int k      = rank - j - 1;
-                const index_int stride = this->strides[k];
-                const index_int len    = this->lens[k];
-                const index_int slen   = s * len;
-                const index_int idx    = (i % slen) / s;
-                result += stride * idx;
-                s = slen;
-            }
-            return result;
+            return compute_index(i);
        }
    }

+    // Computes the sum over all dimensions of stride * coordinate for the linear index i.
+    constexpr index_int compute_index(index_int i) const
+    {
+        // Visit the dimensions from the last one to the first. Before each step
+        // `inner` is the product of the lens already visited, so (i % outer) / inner
+        // is the coordinate of i along the current dimension.
+        index_int offset = 0;
+        index_int inner  = 1;
+        for(index_int d = this->lens.size(); d > 0; d--)
+        {
+            const index_int dim   = d - 1;
+            const index_int outer = inner * this->lens[dim];
+            offset += this->strides[dim] * ((i % outer) / inner);
+            inner = outer;
+        }
+        return offset;
+    }
+
    /// Convert single index into a multi-index
    constexpr index_array multi(index_int idx) const
    {

--- a/src/targets/gpu/mlir.cpp
+++ b/src/targets/gpu/mlir.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/gpu/mlir.hpp>
+
+#ifdef MIGRAPHX_MLIR
+#include <mlir-c/IR.h>
+#include <mlir-c/BuiltinAttributes.h>
+#include <mlir-c/BuiltinTypes.h>
+#include <mlir-c/Diagnostics.h>
+#include <mlir-c/Dialect/MIGraphX.h>
+#include <mlir-c/IntegerSet.h>
+#include <mlir-c/Pass.h>
+#include <mlir-c/Registration.h>
+#endif
+
+#include <migraphx/env.hpp>
+#include <migraphx/manage_ptr.hpp>
+#include <migraphx/module.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/config.hpp>
+#include <migraphx/ranges.hpp>
+#include <migraphx/gpu/code_object_op.hpp>
+#include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/device_name.hpp>
+#include <migraphx/iterator_for.hpp>
+#include <cstring>
+#include <deque>
+#include <variant>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_MLIR);
+
+#ifdef MIGRAPHX_MLIR
+/// Owning wrapper around an MLIR C API handle.
+///
+/// T is the handle type and f is the function that destroys it. Ownership is
+/// tracked by a std::unique_ptr whose `pointer` type is the nested `ptr`
+/// wrapper, so the handle is move-only and f is called only for non-null
+/// handles that are still owned when the wrapper is destroyed.
+template <class T, class F, F f> // NOLINT
+struct mlir_handle
+{
+    /// Nullable-pointer adaptor over T so it can serve as unique_ptr's pointer type.
+    struct ptr
+    {
+        ptr() = default;
+        ptr(std::nullptr_t) {}
+        ptr(T x) : obj(x) {}
+
+        /// Bit pattern of the handle as an integer; used for the comparisons below.
+        std::intptr_t get_value() const
+        {
+            static_assert(sizeof(T) == sizeof(std::intptr_t), "MLIR Handle different size");
+            // Copy the bytes instead of casting the reference: reading obj through
+            // an unrelated integer lvalue breaks the strict aliasing rule.
+            std::intptr_t result = 0;
+            std::memcpy(&result, &obj, sizeof(result));
+            return result;
+        }
+
+        T get() const { return obj; }
+
+        friend bool operator==(ptr x, ptr y) { return x.get_value() == y.get_value(); }
+
+        friend bool operator!=(ptr x, ptr y) { return !(x == y); }
+        T obj{};
+    };
+
+    /// Deleter that forwards non-null handles to f and ignores its result.
+    struct deleter
+    {
+        using pointer = ptr;
+
+        void operator()(pointer x) const
+        {
+            if(x != nullptr)
+            {
+                (void)f(x.obj);
+            }
+        }
+    };
+
+    /// Construct an empty (null) handle.
+    mlir_handle() : handle(nullptr) {}
+
+    /// Take ownership of p.
+    mlir_handle(T p) : handle(ptr{p}) {}
+
+    /// Access the raw handle without giving up ownership.
+    T get() const { return handle.get().get(); }
+
+    /// Give up ownership and return the raw handle; f will not be called for it.
+    T release() { return handle.release().get(); }
+
+    private:
+    std::unique_ptr<ptr, deleter> handle;
+};
+
+#define MIGRAPHX_MANAGE_MLIR_HANDLE(T, F) migraphx::gpu::mlir_handle<T, decltype(&F), &F> // NOLINT
+
+using mlir_context           = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirContext, mlirContextDestroy);
+using mlir_module            = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirModule, mlirModuleDestroy);
+using mlir_operation         = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirOperation, mlirOperationDestroy);
+using mlir_op_printing_flags = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirOpPrintingFlags,
+                                                           mlirOpPrintingFlagsDestroy);
+using mlir_region            = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirRegion, mlirRegionDestroy);
+using mlir_block             = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirBlock, mlirBlockDestroy);
+using mlir_pass_manager      = MIGRAPHX_MANAGE_MLIR_HANDLE(MlirPassManager, mlirPassManagerDestroy);
+
+/// View the characters referenced by an MlirStringRef as a std::string_view.
+std::string_view to_string_view(MlirStringRef s)
+{
+    return std::string_view(s.data, s.length);
+}
+
+/// Build an MlirStringRef from the data pointer and length of s.
+MlirStringRef make_mlir_string_ref(const std::string_view& s)
+{
+    const char* chars = s.data();
+    const auto count  = s.size();
+    return mlirStringRefCreate(chars, count);
+}
+
+/// Call the MLIR C printing function f on x, handing every chunk of text it
+/// emits to printer as a std::string_view.
+template <class F, class T, class Printer>
+void mlir_print(F f, T x, Printer printer)
+{
+    // The captureless lambda is converted to a plain function pointer for the C
+    // callback; the printer itself travels through the void* user data.
+    auto callback = +[](MlirStringRef chunk, void* user_data) {
+        auto& sink = *static_cast<Printer*>(user_data);
+        sink(to_string_view(chunk));
+    };
+    f(x, callback, &printer);
+}
+
+/// Call the MLIR C printing function f on x and stream the text into os.
+template <class F, class T>
+void mlir_print(F f, T x, std::ostream& os)
+{
+    auto write = [&os](auto text) { os << text; };
+    mlir_print(f, x, write);
+}
+
+/// Call the MLIR C printing function f on x and return the text as a string.
+template <class F, class T>
+std::string mlir_print(F f, T x)
+{
+    std::stringstream buffer;
+    mlir_print(f, x, [&buffer](auto text) { buffer << text; });
+    return buffer.str();
+}
+
+struct mlir_program
+{
+    /// Create an MLIR context with an empty module at an unknown location, then
+    /// register the MIGraphX dialect and all other dialects with that context.
+    mlir_program()
+        : ctx(mlirContextCreate()),
+          location(mlirLocationUnknownGet(ctx.get())),
+          mmodule(mlirModuleCreateEmpty(location))
+    {
+        mlirDialectHandleRegisterDialect(mlirGetDialectHandle__migraphx__(), ctx.get());
+        mlirRegisterAllDialects(ctx.get());
+        // Unregistered dialects are explicitly allowed in this context
+        mlirContextSetAllowUnregisteredDialects(ctx.get(), true /*allow*/);
+    }
+
+    /// Map a MIGraphX element type onto an MLIR scalar type.
+    /// Handles float, half, double and the integral types; throws otherwise.
+    MlirType make_type(shape::type_t t) const
+    {
+        MlirType result;
+        shape::visit(t, [&](auto as) {
+            if(as.type_enum() == shape::float_type)
+            {
+                result = mlirF32TypeGet(ctx.get());
+            }
+            else if(as.type_enum() == shape::half_type)
+            {
+                result = mlirF16TypeGet(ctx.get());
+            }
+            else if(as.type_enum() == shape::double_type)
+            {
+                result = mlirF64TypeGet(ctx.get());
+            }
+            else if(as.is_integral())
+            {
+                // The bit width is the element size in bytes times eight
+                result = as.is_signed() ? mlirIntegerTypeSignedGet(ctx.get(), as.size() * 8)
+                                        : mlirIntegerTypeGet(ctx.get(), as.size() * 8);
+            }
+            else
+            {
+                MIGRAPHX_THROW("Unsupported type: " + std::to_string(as.type_enum()));
+            }
+        });
+        return result;
+    }
+
+    /// Build a ranked tensor type from the lens and element type of s.
+    /// Asserts that s is a standard shape.
+    MlirType make_tensor(const shape& s) const
+    {
+        assert(s.standard());
+        const auto& dims = s.lens();
+        // The MLIR call takes the dimensions as an int64_t array
+        std::vector<int64_t> lens(dims.begin(), dims.end());
+        MlirType element_type = make_type(s.type());
+        return mlirRankedTensorTypeGet(
+            lens.size(), lens.data(), element_type, mlirAttributeGetNull());
+    }
+
+    /// Convert every shape in r to a tensor type with make_tensor, in order.
+    template <class Range>
+    std::vector<MlirType> make_tensors(const Range& r)
+    {
+        std::vector<MlirType> result;
+        for(auto it = r.begin(); it != r.end(); ++it)
+            result.push_back(make_tensor(*it));
+        return result;
+    }
+
+    /// Build the MLIR function type taking `inputs` and returning `outputs`,
+    /// with each shape converted to a tensor type.
+    MlirType make_function_type(const std::vector<shape>& inputs, const std::vector<shape>& outputs)
+    {
+        std::vector<MlirType> arg_types    = make_tensors(inputs);
+        std::vector<MlirType> result_types = make_tensors(outputs);
+        return mlirFunctionTypeGet(ctx.get(),
+                                   arg_types.size(),
+                                   arg_types.data(),
+                                   result_types.size(),
+                                   result_types.data());
+    }
+
+    /// Get the MLIR identifier for s in this program's context.
+    MlirIdentifier id(const std::string_view& s) const
+    {
+        MlirStringRef text = make_mlir_string_ref(s);
+        return mlirIdentifierGet(ctx.get(), text);
+    }
+
+    /// Signed integers become 64-bit integer attributes; negative values throw.
+    MlirAttribute attribute(std::int64_t i) const
+    {
+        const bool negative = i < 0;
+        if(negative)
+            MIGRAPHX_THROW("MLIR cant handle negative values since they are ambiguous");
+        MlirType i64 = mlirIntegerTypeGet(ctx.get(), 64);
+        return mlirIntegerAttrGet(i64, i);
+    }
+    /// Unsigned integers become 64-bit integer attributes; values above half of
+    /// the uint64 range throw.
+    MlirAttribute attribute(std::uint64_t i) const
+    {
+        const auto limit = std::numeric_limits<std::uint64_t>::max() / 2;
+        if(i > limit)
+            MIGRAPHX_THROW("MLIR cant handle large integer values since they are ambiguous");
+        MlirType i64 = mlirIntegerTypeGet(ctx.get(), 64);
+        return mlirIntegerAttrGet(i64, i);
+    }
+    /// Bytes are widened and handled by the unsigned integer overload.
+    MlirAttribute attribute(unsigned char i) const
+    {
+        return attribute(static_cast<std::uint64_t>(i));
+    }
+    /// Booleans become MLIR bool attributes.
+    MlirAttribute attribute(bool b) const
+    {
+        const int flag = b ? 1 : 0;
+        return mlirBoolAttrGet(ctx.get(), flag);
+    }
+    /// Doubles become f64 float attributes.
+    MlirAttribute attribute(double d) const
+    {
+        MlirType f64 = mlirF64TypeGet(ctx.get());
+        return mlirFloatAttrDoubleGet(ctx.get(), f64, d);
+    }
+    /// Strings become MLIR string attributes.
+    MlirAttribute attribute(const std::string& s) const
+    {
+        MlirStringRef text = make_mlir_string_ref(s);
+        return mlirStringAttrGet(ctx.get(), text);
+    }
+    /// A null value maps to a value-initialized attribute.
+    MlirAttribute attribute(std::nullptr_t) const { return MlirAttribute{}; }
+    /// A vector maps to an array attribute built from its converted elements.
+    template <class T>
+    MlirAttribute attribute(const std::vector<T>& v) const
+    {
+        std::vector<MlirAttribute> attributes;
+        attributes.reserve(v.size());
+        for(auto&& x : v)
+            attributes.push_back(attribute(x));
+        return mlirArrayAttrGet(ctx.get(), attributes.size(), attributes.data());
+    }
+    /// A migraphx value is converted according to whatever it currently holds.
+    MlirAttribute attribute(const value& v) const
+    {
+        MlirAttribute attr;
+        v.visit_value([&](auto&& x) { attr = attribute(x); });
+        return attr;
+    }
+    /// A list of values becomes a dictionary attribute when its first element
+    /// has a key, and an array attribute otherwise (including when it is empty).
+    MlirAttribute attribute(const std::vector<value>& v) const
+    {
+        if(v.empty())
+        {
+            return mlirArrayAttrGet(ctx.get(), 0, nullptr);
+        }
+        const bool keyed = not v.front().get_key().empty();
+        if(keyed)
+        {
+            std::vector<MlirNamedAttribute> attributes = name_attributes(v);
+            return mlirDictionaryAttrGet(ctx.get(), attributes.size(), attributes.data());
+        }
+        std::vector<MlirAttribute> attributes;
+        attributes.reserve(v.size());
+        for(auto&& x : v)
+            attributes.push_back(attribute(x));
+        return mlirArrayAttrGet(ctx.get(), attributes.size(), attributes.data());
+    }
+
+    /// An MLIR type is wrapped in a type attribute.
+    MlirAttribute attribute(MlirType t) const { return mlirTypeAttrGet(t); }
+
+    /// An existing attribute is passed through unchanged.
+    MlirAttribute attribute(MlirAttribute a) const { return a; }
+
+    /// Pair the identifier for key with the attribute converted from x.
+    template <class T>
+    MlirNamedAttribute name_attribute(const std::string_view& key, const T& x) const
+    {
+        MlirNamedAttribute named;
+        named.name      = id(key);
+        named.attribute = attribute(x);
+        return named;
+    }
+
+    using attribute_t       = std::variant<std::nullptr_t, // value kinds a named attribute may hold
+                                     std::uint64_t,
+                                     unsigned char,
+                                     bool,
+                                     double,
+                                     std::string,
+                                     value,
+                                     std::vector<value>,
+                                     MlirType>;
+    using named_attribute_t = std::pair<std::string_view, attribute_t>; // (name, value) pair
+
+    /// Convert a (name, variant) pair into an MLIR named attribute.
+    MlirNamedAttribute name_attribute(const named_attribute_t& na) const
+    {
+        // Dispatch on the alternative currently stored in the variant
+        MlirAttribute converted =
+            std::visit([&](const auto& x) { return attribute(x); }, na.second);
+        return name_attribute(na.first, converted);
+    }
+
+    /// Convert each (name, variant) pair into an MLIR named attribute, in order.
+    std::vector<MlirNamedAttribute>
+    name_attributes(const std::vector<named_attribute_t>& named_attrs) const
+    {
+        std::vector<MlirNamedAttribute> attributes;
+        attributes.reserve(named_attrs.size());
+        for(const named_attribute_t& entry : named_attrs)
+            attributes.push_back(name_attribute(entry));
+        return attributes;
+    }
+
+    /// Convert each element of v into a named attribute: the element's key is
+    /// the name and the element without its key is the value.
+    std::vector<MlirNamedAttribute> name_attributes(const value& v) const
+    {
+        std::vector<MlirNamedAttribute> attributes;
+        attributes.reserve(v.size());
+        for(auto it = v.begin(); it != v.end(); ++it)
+        {
+            const value& element = *it;
+            attributes.push_back(name_attribute(element.get_key(), element.without_key()));
+        }
+        return attributes;
+    }
+
+    /// Builder that gathers the attributes, regions, results and operands of an
+    /// MLIR operation and finally creates the operation from them.
+    struct mlir_operation_state
+    {
+        /// Start building an operation called `name` at the program's location.
+        mlir_operation_state(mlir_program& p, const std::string_view& name)
+            : prog(&p), op_state(mlirOperationStateGet(make_mlir_string_ref(name), p.location))
+        {
+        }
+
+        /// Add the given (name, value) pairs as attributes.
+        mlir_operation_state& add_attributes(const std::vector<named_attribute_t>& named_attrs)
+        {
+            std::vector<MlirNamedAttribute> named = prog->name_attributes(named_attrs);
+            mlirOperationStateAddAttributes(&op_state, named.size(), named.data());
+            return *this;
+        }
+
+        /// Add every keyed element of v as an attribute.
+        mlir_operation_state& add_attribute_value(const value& v)
+        {
+            std::vector<MlirNamedAttribute> named = prog->name_attributes(v);
+            mlirOperationStateAddAttributes(&op_state, named.size(), named.data());
+            return *this;
+        }
+
+        /// Replace the pending regions with rs.
+        mlir_operation_state& add_regions(std::vector<mlir_region> rs)
+        {
+            regions = std::move(rs);
+            return *this;
+        }
+
+        /// Append one region to the pending regions.
+        mlir_operation_state& add_region(mlir_region r)
+        {
+            regions.emplace_back(std::move(r));
+            return *this;
+        }
+
+        /// Declare one tensor result per output shape.
+        mlir_operation_state& add_results(const std::vector<shape>& outputs)
+        {
+            std::vector<MlirType> result_types = prog->make_tensors(outputs);
+            mlirOperationStateAddResults(&op_state, result_types.size(), result_types.data());
+            return *this;
+        }
+
+        /// Add the given values as operands.
+        mlir_operation_state& add_operands(const std::vector<MlirValue>& inputs)
+        {
+            mlirOperationStateAddOperands(&op_state, inputs.size(), inputs.data());
+            return *this;
+        }
+
+        /// Create the operation, handing the pending regions over to it.
+        mlir_operation create_operation()
+        {
+            std::vector<MlirRegion> raw_regions;
+            raw_regions.reserve(regions.size());
+            for(const auto& region : regions)
+                raw_regions.push_back(region.get());
+            mlirOperationStateAddOwnedRegions(&op_state, raw_regions.size(), raw_regions.data());
+            mlir_operation op(mlirOperationCreate(&op_state));
+            // Release memory since mlir_operation owns it
+            for(auto& region : regions)
+                region.release();
+            regions.clear();
+            return op;
+        }
+
+        mlir_program* prog;
+        MlirOperationState op_state;
+        std::vector<mlir_region> regions = {};
+    };
+
+    /// Start building an operation called `name` inside this program.
+    mlir_operation_state create_operation_state(const std::string_view& name)
+    {
+        return mlir_operation_state(*this, name);
+    }
+
+    /// Create the operation described by ops, append it to body and return the
+    /// values it produces.
+    std::vector<MlirValue> insert(MlirBlock body, mlir_operation_state ops)
+    {
+        mlir_operation owned = ops.create_operation();
+        // Keep the raw handle for the queries below; ownership goes to the block
+        auto appended = owned.get();
+        mlirBlockAppendOwnedOperation(body, owned.release());
+
+        const auto count = mlirOperationGetNumResults(appended);
+        std::vector<MlirValue> result;
+        result.reserve(count);
+        for(auto i : range(count))
+            result.push_back(mlirOperationGetResult(appended, i));
+        return result;
+    }
+
+    /// Append a `func.func` operation named "main" for module m to body.
+    ///
+    /// The function takes the parameters of m in sorted-name order. Each
+    /// parameter instruction is mapped in ins_map to the matching block
+    /// argument, and the function's entry block is returned.
+    MlirBlock
+    insert(MlirBlock body, const module& m, std::unordered_map<instruction_ref, MlirValue>& ins_map)
+    {
+        auto names = m.get_parameter_names();
+        std::sort(names.begin(), names.end());
+        std::vector<shape> inputs;
+        for(const std::string& name : names)
+            inputs.push_back(m.get_parameter_shape(name));
+        std::vector<shape> outputs = m.get_output_shapes();
+
+        // One block argument per parameter, all at the program's location
+        std::vector<MlirLocation> arg_locs(inputs.size(), location);
+        auto body_inputs   = make_tensors(inputs);
+        mlir_region region = mlirRegionCreate();
+        mlir_block fbody = mlirBlockCreate(body_inputs.size(), body_inputs.data(), arg_locs.data());
+        // Remember the raw block before the region takes ownership of it
+        MlirBlock entry = fbody.get();
+        mlirRegionAppendOwnedBlock(region.get(), fbody.release());
+
+        auto ops = create_operation_state("func.func");
+        ops.add_attributes({{"function_type", make_function_type(inputs, outputs)},
+                            {"sym_name", std::string("main")},
+                            {"kernel", std::string("mixr")}});
+        ops.add_region(std::move(region));
+        insert(body, std::move(ops));
+
+        for(auto i : range(names.size()))
+        {
+            auto param     = m.get_parameter(names[i]);
+            ins_map[param] = mlirBlockGetArgument(entry, i);
+        }
+        return entry;
+    }
+
+    /// MLIR operation name for ins: "@return" maps to "func.return", anything
+    /// else gets the "migraphx." prefix.
+    static std::string get_name(instruction_ref ins)
+    {
+        if(ins->name() != "@return")
+            return "migraphx." + ins->name();
+        return "func.return";
+    }
+
+    /// Serialize op to a value. For a convolution whose padding has as many
+    /// entries as its stride, the padding entries are appended to themselves.
+    static value get_operator_value(const operation& op)
+    {
+        auto v = op.to_value();
+        // Adjust symmetrical padding
+        if(op.name() == "convolution" and v.at("padding").size() == v.at("stride").size())
+        {
+            // Copy first so the source is not the container being appended to
+            auto original = v.at("padding");
+            std::copy(original.begin(), original.end(), std::back_inserter(v.at("padding")));
+        }
+        return v;
+    }
+
+    /// Shape of ins; for "@return" this is the shape of its single input.
+    static shape get_shape(instruction_ref ins)
+    {
+        if(ins->name() != "@return")
+            return ins->get_shape();
+        assert(ins->inputs().size() == 1);
+        return ins->inputs().front()->get_shape();
+    }
+
+    /// Translate module m into MLIR operations inside this program's module.
+    ///
+    /// Parameters become the arguments of the generated function; every other
+    /// instruction becomes one operation appended to the function body.
+    void parse(const module& m)
+    {
+        std::unordered_map<instruction_ref, MlirValue> ins_map;
+        auto fbody = insert(mlirModuleGetBody(mmodule.get()), m, ins_map);
+        for(auto ins : iterator_for(m))
+        {
+            // Parameters were already mapped to block arguments
+            if(ins->name() == "@param")
+                continue;
+            // The name stays a named local for the lifetime of the operation state
+            auto name = get_name(ins);
+            auto ops  = create_operation_state(name);
+            ops.add_attribute_value(get_operator_value(ins->get_operator()));
+            if(ins->name() != "@return")
+                ops.add_results({get_shape(ins)});
+
+            std::vector<MlirValue> operands;
+            for(auto input : ins->inputs())
+                operands.push_back(ins_map.at(input));
+            ops.add_operands(operands);
+
+            auto outputs = insert(fbody, std::move(ops));
+            if(ins->name() == "@return")
+                continue;
+            assert(outputs.size() == 1);
+            ins_map[ins] = outputs.front();
+        }
+    }
+
+    /// Run the MIGraphX high-level and backend pipelines over the module and
+    /// package the resulting binary with its launch parameters.
+    ///
+    /// The kernel symbol is always "main". Throws if the pass manager reports a
+    /// failure or if the binary cannot be retrieved afterwards.
+    code_object_op compile() MIGRAPHX_TIDY_CONST
+    {
+        mlir_pass_manager pm{mlirPassManagerCreate(ctx.get())};
+        // 1st pipeline to call
+        mlirMIGraphXAddHighLevelPipeline(pm.get());
+        // 2nd pipeline to call
+        std::string tname = get_device_name();
+        // HACK: Since MLIR can't handle the full target name
+        auto hacked_tname = tname.substr(0, tname.find(':'));
+        if(tname.size() != hacked_tname.size())
+            std::cout
+                << "*************** WARNING: MLIR may not compile the correct target features for: "
+                << tname << std::endl;
+        mlirMIGraphXAddBackendPipeline(pm.get(), hacked_tname.c_str(), "amdgcn-amd-amdhsa", "");
+        // Report a failed pipeline here rather than querying the module afterwards
+        if(mlirLogicalResultIsFailure(mlirPassManagerRun(pm.get(), mmodule.get())))
+            MIGRAPHX_THROW("MLIR pass pipeline failed for target: " + tname);
+
+        code_object_op op{};
+        op.symbol_name                = "main";
+        op.code_object                = get_binary();
+        std::tie(op.global, op.local) = get_launch_params();
+        return op;
+    }
+
+    /// Query the kernel attributes of the module and return {global, local},
+    /// where local is the first attribute and global is local times the second.
+    std::pair<std::size_t, std::size_t> get_launch_params() const
+    {
+        // returns block and grid sizes
+        uint32_t attrs[2];
+        mlirGetKernelAttrs(mmodule.get(), attrs);
+        const std::size_t block_size = attrs[0];
+        const std::size_t grid_size  = attrs[1];
+        return {block_size * grid_size, block_size};
+    }
+
+    /// Fetch the compiled binary of the module; throws if it cannot be retrieved.
+    value::binary get_binary() const
+    {
+        // First call with a null buffer only reports the size
+        int size = 0;
+        mlirGetBinary(mmodule.get(), &size, nullptr);
+        value::binary result(size);
+        // Second call fills the buffer that was sized above
+        if(not mlirGetBinary(mmodule.get(), &size, reinterpret_cast<char*>(result.data())))
+            MIGRAPHX_THROW("Failed to compile mlir program");
+        return result;
+    }
+
+    mlir_context ctx;      // declared first: the constructor builds location from it
+    MlirLocation location; // unknown location from ctx; mmodule is created from it
+    mlir_module mmodule;   // module whose body parse() appends to
+    std::deque<std::string> strings{}; // NOTE(review): not referenced by the code visible here
+};
+
+/// Translate m to MLIR and return the printed text of the resulting module.
+std::string dump_mlir(const module& m)
+{
+    mlir_program program;
+    program.parse(m);
+    return mlir_print(&mlirOperationPrint, mlirModuleGetOperation(program.mmodule.get()));
+}
+
+/// Compile m through MLIR into a code object whose output shape is the first
+/// output shape of m. With MIGRAPHX_TRACE_MLIR enabled, the module and the
+/// generated MLIR are printed to standard output first.
+code_object_op compile_mlir(const context&, const module& m)
+{
+    const bool trace = enabled(MIGRAPHX_TRACE_MLIR{});
+    if(trace)
+    {
+        std::cout << m << std::endl;
+    }
+    mlir_program program;
+    program.parse(m);
+    auto mod_op = mlirModuleGetOperation(program.mmodule.get());
+    if(trace)
+    {
+        std::cout << mlir_print(&mlirOperationPrint, mod_op) << std::endl;
+    }
+    auto result   = program.compile();
+    result.output = m.get_output_shapes().front();
+    return result;
+}
+
+/// Insert the code object co into m at ins through module::insert_instruction.
+///
+/// For every input the argument list receives the input twice, a zero offset
+/// literal, one literal per dimension length and one literal per stride.
+/// co.output_arg is set to the position in that list where the entries of the
+/// last input begin.
+instruction_ref insert_mlir(module& m,
+                            instruction_ref ins,
+                            code_object_op co,
+                            const std::vector<instruction_ref>& inputs)
+{
+    std::vector<instruction_ref> refs;
+    refs.reserve(inputs.size() * 15);
+
+    // Cache so that each distinct integer is added to the module only once
+    std::unordered_map<uint64_t, instruction_ref> literal_map{};
+    auto get_literal = [&](uint64_t value) {
+        auto found = literal_map.find(value);
+        if(found == literal_map.end())
+            found = literal_map.emplace(value, m.add_literal(value)).first;
+        return found->second;
+    };
+
+    std::size_t last = 0;
+    for(auto input : inputs)
+    {
+        const size_t offset = 0;
+        auto s              = input->get_shape();
+        last                = refs.size();
+        refs.push_back(input);
+        refs.push_back(input);
+        refs.push_back(get_literal(offset)); // offset
+
+        // dim sizes
+        for(const auto& len : s.lens())
+            refs.push_back(get_literal(len));
+        // refs.push_back(get_literal(1)); // G
+
+        // dim strides
+        for(const auto& stride : s.strides())
+            refs.push_back(get_literal(stride));
+        // refs.push_back(get_literal(1)); // G
+    }
+    co.expected_inputs = to_shapes(refs);
+    co.output_arg      = last;
+    return m.insert_instruction(ins, co, refs);
+}
+
+#else
+
+/// Variant compiled when MIGRAPHX_MLIR is not defined: returns an empty string.
+std::string dump_mlir(const module&)
+{
+    return std::string{};
+}
+
+/// Variant compiled when MIGRAPHX_MLIR is not defined: returns a default code object.
+code_object_op compile_mlir(const context&, const module&)
+{
+    return code_object_op{};
+}
+
+/// Does nothing with its argument; presumably exists to mark a value as used.
+template <class T>
+void use(T&) {}
+
+instruction_ref
+// cppcheck-suppress funcArgNamesDifferent
+insert_mlir(module& m, instruction_ref, code_object_op co, const std::vector<instruction_ref>&)
+{
+    use(co); // co is otherwise unused in this variant
+    return m.end(); // variant compiled when MIGRAPHX_MLIR is not defined: nothing is inserted
+}
+
+#endif
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/mlir_conv.cpp
+++ b/src/targets/gpu/mlir_conv.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/gpu/mlir_conv.hpp>
-#include <migraphx/manage_ptr.hpp>
-#include <migraphx/instruction.hpp>
-#include <migraphx/make_op.hpp>
-
-#include <migraphx/op/convolution.hpp>
-
-#include <migraphx/gpu/context.hpp>
-#include <migraphx/gpu/convolution.hpp>
-#include <migraphx/iterator_for.hpp>
-#include <migraphx/program.hpp>
-
-#include <migraphx/ranges.hpp>
-#include <migraphx/make_op.hpp>
-#include <migraphx/generate.hpp>
-#include <migraphx/program.hpp>
-#include <migraphx/gpu/kernel.hpp>
-#include <migraphx/gpu/target.hpp>
-#include <migraphx/gpu/hip.hpp>
-#include <migraphx/gpu/compile_hip.hpp>
-
-#include <utility>
-#include <functional>
-#include <algorithm>
-
-#ifdef MIGRAPHX_MLIR_MIOPEN_SUPPORT
-#include <Miir.h>
-#endif // MIGRAPHX_MLIR_MIOPEN_SUPPORT
-
-#include <cstdio>
-
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-
-struct mlir_apply
-{
-    module* mod           = nullptr;
-    const mlir_conv* pass = nullptr;
-
-    const char* mlir_kernel_name = "migraphx_conv2d";
-
-    std::unordered_map<uint64_t, instruction_ref> literal_map{};
-
-    struct execution_spec
-    {
-        migraphx::value::binary binary;
-        size_t global_size;
-        size_t local_size;
-        execution_spec(migraphx::value::binary&& binary_m, size_t global_s, size_t local_s)
-            : binary(std::move(binary_m)), global_size(global_s), local_size(local_s)
-        {
-        }
-    };
-
-    std::unordered_map<std::string, std::shared_ptr<execution_spec>> binary_map{};
-
-    context& get_context() const
-    {
-        assert(pass != nullptr);
-        assert(pass->ctx != nullptr);
-        return *pass->ctx;
-    }
-
-    void init() const
-    {
-        assert(mod != nullptr);
-        assert(pass != nullptr);
-    }
-
-    std::shared_ptr<execution_spec> make_mlir_binary(instruction_ref op_r)
-    {
-        std::shared_ptr<execution_spec> result;
-
-#ifdef MIGRAPHX_MLIR_MIOPEN_SUPPORT
-        auto conv  = any_cast<op::convolution>(op_r->get_operator());
-        auto inp_t = op_r->inputs().at(0)->get_shape();
-        auto flt_t = op_r->inputs().at(1)->get_shape();
-        auto out_t = op_r->get_shape();
-
-        auto get_type_str = [](const shape& s) -> const char* {
-            switch(s.type())
-            {
-            case shape::float_type: return "f32";
-            case shape::half_type: return "f16";
-            case shape::bool_type:
-            case shape::double_type:
-            case shape::uint8_type:
-            case shape::int8_type:
-            case shape::uint16_type:
-            case shape::int16_type:
-            case shape::int32_type:
-            case shape::int64_type:
-            case shape::uint32_type:
-            case shape::uint64_type:
-            case shape::tuple_type: break;
-            }
-            return nullptr;
-        };
-
-        const auto* inp_t_s = get_type_str(inp_t);
-        const auto* flt_t_s = get_type_str(flt_t);
-        const auto* out_t_s = get_type_str(out_t);
-
-        if(out_t_s == nullptr || inp_t_s == nullptr || flt_t_s == nullptr)
-            return result;
-
-        std::string mlir_options = "--kernel_name " + std::string(mlir_kernel_name);
-
-        // platform spec
-        auto& device = get_context().get_current_device();
-        char dev_name[64];
-        sprintf(dev_name, "gfx%lu%02lu", device.get_device_major(), device.get_device_minor());
-        mlir_options += " --arch " + std::string(dev_name) + " --num_cu " +
-                        std::to_string(device.get_cu_count()); // ???
-
-        // Conv spec
-        mlir_options +=
-            " --operation "
-            "conv2d"
-            " --batchsize " +
-            std::to_string(conv.group) + " --groupsize " + std::to_string(1) + " --padding_h " +
-            std::to_string(conv.padding[0]) + " --padding_w " + std::to_string(conv.padding[1]) +
-            " --conv_stride_h " + std::to_string(conv.stride[0]) + " --conv_stride_w " +
-            std::to_string(conv.stride[1]) + " --dilation_h " + std::to_string(conv.dilation[0]) +
-            " --dilation_w " + std::to_string(conv.dilation[1]);
-
-        // Input spec
-        mlir_options += " --in_layout "
-                        "NCHWG"
-                        " --in_type " +
-                        std::string(inp_t_s) + " --in_channels " + std::to_string(inp_t.lens()[1]) +
-                        " --in_h " + std::to_string(inp_t.lens()[2]) + " --in_w " +
-                        std::to_string(inp_t.lens()[3]);
-
-        // Filter spec
-        mlir_options += " --fil_layout "
-                        "NCHWG"
-                        " --fil_type " +
-                        std::string(flt_t_s) + " --fil_h " + std::to_string(flt_t.lens()[2]) +
-                        " --fil_w " + std::to_string(flt_t.lens()[3]);
-
-        // Output spec
-        mlir_options += " --out_layout "
-                        "NCHWG"
-                        " --out_type " +
-                        std::string(out_t_s) + " --out_channels " +
-                        std::to_string(out_t.lens()[1]) + " --out_h " +
-                        std::to_string(out_t.lens()[2]) + " --out_w " +
-                        std::to_string(out_t.lens()[3]);
-
-        auto bin_i = binary_map.find(mlir_options);
-        if(bin_i == binary_map.end())
-        {
-            size_t bin_size = 0;
-
-            using mlir_handle = MIGRAPHX_MANAGE_PTR(MiirHandle, miirDestroyHandle);
-            auto handle       = mlir_handle(miirCreateHandle(mlir_options.c_str()));
-
-            if(miirLowerBin(handle.get()) == MIIR_SUCCESS &&
-               miirBufferGet(handle.get(), nullptr, &bin_size) == MIIR_SUCCESS)
-            {
-                migraphx::value::binary bin(bin_size);
-                if(miirBufferGet(handle.get(), reinterpret_cast<char*>(bin.data()), &bin_size) ==
-                   MIIR_SUCCESS)
-                {
-                    size_t global_size;
-                    size_t block_size;
-                    if(miirGetExecutionDims(handle.get(), &global_size, &block_size) ==
-                       MIIR_SUCCESS)
-                    {
-                        result = std::make_shared<execution_spec>(
-                            std::move(bin), global_size, block_size);
-                    }
-                }
-            }
-
-            binary_map[mlir_options] = result;
-        }
-        else
-        {
-            result = bin_i->second;
-        }
-#else  // MIGRAPHX_MLIR_MIOPEN_SUPPORT
-        (void)op_r;
-#endif // MIGRAPHX_MLIR_MIOPEN_SUPPORT
-        return result;
-    }
-
-    instruction_ref get_literal(uint64_t value)
-    {
-        auto fi = literal_map.find(value);
-        if(fi != literal_map.end())
-            return fi->second;
-        auto lit = mod->add_literal(value);
-        literal_map.emplace(value, lit);
-        return lit;
-    }
-
-    operation make_code_object_op(instruction_ref op_r, const std::shared_ptr<execution_spec>& spec)
-    {
-        // each pointer is expanded out to a MemRefDescriptor
-        auto inp_t = op_r->inputs().at(0)->get_shape();
-        auto flt_t = op_r->inputs().at(1)->get_shape();
-        auto out_t = op_r->get_shape();
-
-        auto i64 = shape(shape::uint64_type);
-
-        std::vector<shape> expected_inputs = {
-            flt_t, flt_t, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64,  i64,   inp_t,
-            inp_t, i64,   i64, i64, i64, i64, i64, i64, i64, i64, i64, i64,  out_t, out_t,
-            i64,   i64,   i64, i64, i64, i64, i64, i64, i64, i64, i64, out_t};
-
-        return migraphx::make_op("gpu::code_object",
-                                 {
-                                     {"code_object", spec->binary},
-                                     {"symbol_name", mlir_kernel_name},
-                                     {"global", spec->global_size},
-                                     {"local", spec->local_size},
-                                     {"expected_inputs", migraphx::to_value(expected_inputs)},
-                                     {"output", migraphx::to_value(out_t)},
-                                 });
-    }
-
-    void add_memref_descriptor(std::vector<instruction_ref>& refs, instruction_ref inst)
-    {
-        const size_t offset = 0;
-        auto inst_t         = inst->get_shape();
-        refs.push_back(inst);
-        refs.push_back(inst);
-        refs.push_back(get_literal(offset)); // offset
-
-        // dim sizes
-        std::transform(inst_t.lens().begin(),
-                       inst_t.lens().end(),
-                       std::back_inserter(refs),
-                       [&](const auto& lval) { return get_literal(lval); });
-        refs.push_back(get_literal(1)); // G
-
-        // dim strides
-        std::transform(inst_t.strides().begin(),
-                       inst_t.strides().end(),
-                       std::back_inserter(refs),
-                       [&](const auto& lval) { return get_literal(lval); });
-        refs.push_back(get_literal(1)); // G
-    }
-
-    instruction_ref insert_allocation(instruction_ref ins, const shape& s) const
-    {
-        return mod->insert_instruction(ins, hip_allocate{s});
-    }
-
-    void replace_conv_op(instruction_ref ins)
-    {
-        auto conv_bin = make_mlir_binary(ins);
-        if(conv_bin)
-        {
-            auto conv = make_code_object_op(ins, conv_bin);
-
-            auto inp = ins->inputs().at(0);
-            auto flt = ins->inputs().at(1);
-            auto out = insert_allocation(ins, ins->get_shape());
-
-            std::vector<instruction_ref> refs;
-            refs.reserve(3 * 13 + 1);
-            add_memref_descriptor(refs, flt);
-            add_memref_descriptor(refs, inp);
-            add_memref_descriptor(refs, out);
-            refs.push_back(out);
-
-            mod->replace_instruction(ins, conv, refs);
-        }
-    }
-
-    void apply()
-    {
-        init();
-        for(auto it : iterator_for(*mod))
-        {
-            if(it->name() == "convolution")
-            {
-                replace_conv_op(it);
-            }
-        }
-    }
-};
-
-void mlir_conv::apply(module& m) const { mlir_apply{&m, this}.apply(); }
-
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/targets/gpu/target.cpp
+++ b/src/targets/gpu/target.cpp
@@ -53,10 +53,10 @@
 #include <migraphx/gpu/compile_ops.hpp>
 #include <migraphx/gpu/concat_gpu_opt.hpp>
 #include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/fuse_mlir.hpp>
 #include <migraphx/gpu/fuse_ops.hpp>
 #include <migraphx/gpu/prefuse_ops.hpp>
 #include <migraphx/gpu/lowering.hpp>
-#include <migraphx/gpu/mlir_conv.hpp>
 #include <migraphx/gpu/pack_int8_args.hpp>
 #include <migraphx/gpu/schedule_model.hpp>
 #include <migraphx/gpu/sync_device.hpp>
@@ -128,7 +128,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
        dead_code_elimination{},
        enable_pass(not enabled(MIGRAPHX_DISABLE_POINTWISE_FUSION{}), fuse_pointwise{}),
        dead_code_elimination{},
-        mlir_conv{&ctx},
+        fuse_mlir{&ctx},
+        dead_code_elimination{},
        lowering{&ctx, options.offload_copy},
        eliminate_contiguous{"gpu::contiguous"},
        dead_code_elimination{},

--- a/test/gpu/mlir.cpp
+++ b/test/gpu/mlir.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/gpu/mlir.hpp>
+#include <migraphx/gpu/target.hpp>
+#include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/write_literals.hpp>
+#include <migraphx/ref/target.hpp>
+#include <migraphx/module.hpp>
+#include <migraphx/program.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/ranges.hpp>
+#include <migraphx/stringutils.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/verify_args.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/functional.hpp>
+#include <test.hpp>
+
+using migraphx::trim;
+
+// m test_gpu_mlir && ./bin/test_gpu_mlir
+
+// Test-only target derived from the GPU target: it reports the name "mlir" and its
+// get_passes() returns a pass list holding nothing but write_literals.
+struct mlir_gpu_target : migraphx::gpu::target
+{
+    std::string name() const { return "mlir"; }
+
+    std::vector<migraphx::pass> get_passes(migraphx::context& gctx,
+                                           const migraphx::compile_options&) const
+    {
+        // The generic context handed in is expected to wrap a GPU context.
+        auto& gpu_ctx = migraphx::any_cast<migraphx::gpu::context>(gctx);
+        std::vector<migraphx::pass> passes;
+        passes.push_back(migraphx::gpu::write_literals{&gpu_ctx});
+        return passes;
+    }
+};
+
+// Normalizes text so two MLIR dumps can be compared without caring about layout:
+// every run of whitespace becomes exactly two spaces, characters that are neither
+// whitespace nor printable are dropped, and the result is passed through migraphx::trim.
+std::string encode(const std::string& s)
+{
+    std::string out;
+    bool prespace = false;
+    for(auto c : s)
+    {
+        // The <cctype> classifiers have undefined behaviour for negative arguments other
+        // than EOF, so go through unsigned char before classifying the byte.
+        const auto uc = static_cast<unsigned char>(c);
+        if(std::isspace(uc) != 0)
+        {
+            // Only the first whitespace character of a run emits the separator.
+            if(not prespace)
+                out += "  ";
+            prespace = true;
+        }
+        else if(std::isprint(uc) != 0)
+        {
+            out += c;
+            prespace = false;
+        }
+    }
+    return migraphx::trim(out);
+}
+
+// Builds a program around a module compiled through MLIR. The main module receives one
+// parameter per parameter of `mmlir`; those instructions are then ordered by the string
+// form of their operator, an extra "output" parameter shaped like the module's first
+// output is appended, and the result of compile_mlir() is inserted at the end of the main
+// module with that argument list.
+migraphx::program create_program_from_mlir(const migraphx::module& mmlir)
+{
+    migraphx::program prog;
+    auto* main_mod         = prog.get_main_module();
+    const auto param_names = mmlir.get_parameter_names();
+
+    std::vector<migraphx::instruction_ref> args;
+    for(const auto& pname : param_names)
+        args.push_back(main_mod->add_parameter(pname, mmlir.get_parameter_shape(pname)));
+
+    const auto operator_text = [](auto ins) { return to_string(ins->get_operator()); };
+    std::sort(args.begin(), args.end(), migraphx::by(std::less<>{}, operator_text));
+
+    args.push_back(main_mod->add_parameter("output", mmlir.get_output_shapes().front()));
+
+    migraphx::gpu::context ctx;
+    migraphx::gpu::insert_mlir(*main_mod, main_mod->end(), compile_mlir(ctx, mmlir), args);
+    return prog;
+}
+
+// Produces one generated argument for every parameter shape of `p`. The second argument
+// given to generate_argument() is a counter that starts at 0 and grows by one per
+// parameter, in the iteration order of get_parameter_shapes().
+migraphx::parameter_map generate_params(const migraphx::program& p)
+{
+    migraphx::parameter_map params;
+    std::size_t counter = 0;
+    for(auto&& entry : p.get_parameter_shapes())
+    {
+        params[entry.first] = migraphx::generate_argument(entry.second, counter);
+        ++counter;
+    }
+    return params;
+}
+
+// Compiles `p` with mlir_gpu_target and evaluates it. Each entry of `inputs` goes through
+// the target's copy_to(); any parameter of the compiled program that `inputs` does not
+// name is created with the target's allocate(). The first evaluation result is returned
+// after copy_from().
+migraphx::argument run_gpu(migraphx::program p, const migraphx::parameter_map& inputs)
+{
+    mlir_gpu_target gpu_target;
+    p.compile(gpu_target);
+
+    migraphx::parameter_map device_args;
+    for(auto&& host_arg : inputs)
+        device_args[host_arg.first] = gpu_target.copy_to(host_arg.second);
+
+    for(auto&& param : p.get_parameter_shapes())
+    {
+        if(device_args.find(param.first) == device_args.end())
+            device_args[param.first] = gpu_target.allocate(param.second);
+    }
+
+    auto results = p.eval(device_args);
+    return gpu_target.copy_from(results.front());
+}
+
+// Compiles `p` for the reference target and returns the first result of evaluating it
+// with `inputs`.
+migraphx::argument run_ref(migraphx::program p, const migraphx::parameter_map& inputs)
+{
+    p.compile(migraphx::ref::target{});
+    auto results = p.eval(inputs);
+    return results.front();
+}
+
+// Compares the MLIR path with the reference target for one module: the module's
+// instructions are inserted into a fresh program that run_ref() evaluates, the same
+// generated inputs are fed to the program from create_program_from_mlir() via run_gpu(),
+// and verify_args() decides whether the two results agree.
+bool verify_mlir(const migraphx::module& mmlir)
+{
+    migraphx::program ref_prog;
+    auto* ref_main = ref_prog.get_main_module();
+    ref_main->insert_instructions(ref_main->end(), &mmlir);
+
+    const auto inputs = generate_params(ref_prog);
+    auto mlir_prog    = create_program_from_mlir(mmlir);
+
+    const auto expected = run_ref(ref_prog, inputs);
+    const auto actual   = run_gpu(mlir_prog, inputs);
+    return migraphx::verify_args("mlir", expected, actual);
+}
+
+// A module holding a single convolution: its MLIR dump must match the expected text once
+// both are normalized by encode(), and verify_mlir() must accept the module.
+TEST_CASE(conv)
+{
+    const std::string mlir_output = R"__migraphx__(
+module {
+  func @main(%arg0: tensor<2x8x3x3xf32>, %arg1: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {kernel = "mixr"} {
+    %0 = migraphx.convolution(%arg1, %arg0) {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1]} : (tensor<1x8x4x4xf32>, tensor<2x8x3x3xf32>) -> tensor<1x2x2x2xf32>
+    return %0 : tensor<1x2x2x2xf32>
+  }
+}
+)__migraphx__";
+    const migraphx::shape input_shape{migraphx::shape::float_type, {1, 8, 4, 4}};
+    const migraphx::shape weights_shape{migraphx::shape::float_type, {2, 8, 3, 3}};
+
+    migraphx::module m;
+    auto input       = m.add_parameter("x", input_shape);
+    auto weights     = m.add_parameter("w", weights_shape);
+    auto convolution = m.add_instruction(migraphx::make_op("convolution"), input, weights);
+    m.add_return({convolution});
+
+    auto s = migraphx::gpu::dump_mlir(m);
+    // An empty dump means MLIR is not enabled, so there is nothing to check.
+    if(s.empty())
+        return;
+    CHECK(encode(s) == encode(mlir_output));
+    EXPECT(verify_mlir(m));
+}
+
+// A module chaining convolution -> add -> relu: its MLIR dump must match the expected
+// text once both are normalized by encode(), and verify_mlir() must accept the module.
+TEST_CASE(conv_add_relu)
+{
+    const std::string mlir_output = R"__migraphx__(
+module {
+  func @main(%arg0: tensor<1x2x2x2xf32>, %arg1: tensor<2x8x3x3xf32>, %arg2: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {kernel = "mixr"} {
+    %0 = migraphx.convolution(%arg2, %arg1) {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1]} : (tensor<1x8x4x4xf32>, tensor<2x8x3x3xf32>) -> tensor<1x2x2x2xf32>
+    %1 = migraphx.add(%0, %arg0) : (tensor<1x2x2x2xf32>, tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32>
+    %2 = migraphx.relu(%1) : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32>
+    return %2 : tensor<1x2x2x2xf32>
+  }
+}
+)__migraphx__";
+    const migraphx::shape input_shape{migraphx::shape::float_type, {1, 8, 4, 4}};
+    const migraphx::shape weights_shape{migraphx::shape::float_type, {2, 8, 3, 3}};
+    const migraphx::shape bias_shape{migraphx::shape::float_type, {1, 2, 2, 2}};
+
+    migraphx::module m;
+    auto input       = m.add_parameter("x", input_shape);
+    auto weights     = m.add_parameter("w", weights_shape);
+    auto bias        = m.add_parameter("b", bias_shape);
+    auto convolution = m.add_instruction(migraphx::make_op("convolution"), input, weights);
+    auto sum         = m.add_instruction(migraphx::make_op("add"), convolution, bias);
+    auto activated   = m.add_instruction(migraphx::make_op("relu"), sum);
+    m.add_return({activated});
+
+    auto s = migraphx::gpu::dump_mlir(m);
+    // An empty dump means MLIR is not enabled, so there is nothing to check.
+    if(s.empty())
+        return;
+    CHECK(encode(s) == encode(mlir_output));
+    EXPECT(verify_mlir(m));
+}
+
+// Hands the command-line arguments to the test framework's runner.
+int main(int argc, const char* argv[])
+{
+    test::run(argc, argv);
+}
--- a/test/module_test.cpp
+++ b/test/module_test.cpp
@@ -300,6 +300,96 @@ TEST_CASE(parameter_name_order)
    EXPECT(param_names == names1);
 }

+TEST_CASE(insert_instructions_module) // insert_instructions(position, module, map) on m1
+{
+    migraphx::shape s{migraphx::shape::int32_type, {1}};
+    migraphx::module m1("m1");
+    auto x1   = m1.add_parameter("x1", s);
+    auto sqrt = m1.add_instruction(migraphx::make_op("sqrt"), {x1});
+    m1.add_instruction(migraphx::make_op("add"), {sqrt, x1});
+
+    migraphx::module m2("m2");
+    auto x2 = m2.add_parameter("x2", s);
+    m2.add_instruction(migraphx::make_op("sqrt"), {x2});
+
+    m1.insert_instructions(sqrt, &m2, {{x2, x1}}); // m2's contents go in at m1's sqrt; map pairs x2 with x1
+
+    EXPECT(std::prev(sqrt)->name() == "sqrt"); // the copied sqrt is expected directly before the original
+    EXPECT(std::count_if(m1.begin(), m1.end(), [](auto&& ins) { return ins.name() == "sqrt"; }) ==
+           2);
+    EXPECT(std::count_if(m1.begin(), m1.end(), [](auto&& ins) { return ins.name() == "@param"; }) ==
+           1); // no second parameter instruction may appear in m1
+    EXPECT(contains(m1.get_parameter_shapes(), "x1"));
+    EXPECT(not contains(m1.get_parameter_shapes(), "x2"));
+}
+
+TEST_CASE(add_instructions_module) // add_instructions(module, map) on m1
+{
+    migraphx::shape s{migraphx::shape::int32_type, {1}};
+    migraphx::module m1("m1");
+    auto x1 = m1.add_parameter("x1", s);
+    m1.add_instruction(migraphx::make_op("sqrt"), {x1});
+
+    migraphx::module m2("m2");
+    auto x2 = m2.add_parameter("x2", s);
+    m2.add_instruction(migraphx::make_op("sqrt"), {x2});
+
+    m1.add_instructions(&m2, {{x2, x1}}); // map pairs m2's parameter x2 with m1's x1
+
+    EXPECT(std::count_if(m1.begin(), m1.end(), [](auto&& ins) { return ins.name() == "sqrt"; }) ==
+           2); // m1's own sqrt plus the one taken from m2
+    EXPECT(std::count_if(m1.begin(), m1.end(), [](auto&& ins) { return ins.name() == "@param"; }) ==
+           1); // no second parameter instruction may appear in m1
+    EXPECT(contains(m1.get_parameter_shapes(), "x1"));
+    EXPECT(not contains(m1.get_parameter_shapes(), "x2"));
+}
+
+TEST_CASE(add_instructions_range) // add_instructions(first, last, map) on m1
+{
+    migraphx::shape s{migraphx::shape::int32_type, {1}};
+    migraphx::module m1("m1");
+    auto x1 = m1.add_parameter("x1", s);
+    m1.add_instruction(migraphx::make_op("sqrt"), {x1});
+
+    migraphx::module m2("m2");
+    auto x2    = m2.add_parameter("x2", s);
+    auto sqrt2 = m2.add_instruction(migraphx::make_op("sqrt"), {x2});
+
+    m1.add_instructions(sqrt2, m2.end(), {{x2, x1}}); // range starts at sqrt2, so x2 itself is outside it
+    EXPECT(std::any_of(
+        m1.begin(), m1.end(), [&](auto&& ins) { return migraphx::contains(ins.inputs(), x1); }));
+
+    EXPECT(std::count_if(m1.begin(), m1.end(), [](auto&& ins) { return ins.name() == "sqrt"; }) ==
+           2); // m1's own sqrt plus the one taken from m2
+    EXPECT(std::count_if(m1.begin(), m1.end(), [](auto&& ins) { return ins.name() == "@param"; }) ==
+           1); // no second parameter instruction may appear in m1
+    EXPECT(contains(m1.get_parameter_shapes(), "x1"));
+    EXPECT(not contains(m1.get_parameter_shapes(), "x2"));
+}
+
+TEST_CASE(add_instructions_vector) // add_instructions(list of instructions, map) on m1
+{
+    migraphx::shape s{migraphx::shape::int32_type, {1}};
+    migraphx::module m1("m1");
+    auto x1 = m1.add_parameter("x1", s);
+    m1.add_instruction(migraphx::make_op("sqrt"), {x1});
+
+    migraphx::module m2("m2");
+    auto x2    = m2.add_parameter("x2", s);
+    auto sqrt2 = m2.add_instruction(migraphx::make_op("sqrt"), {x2});
+
+    m1.add_instructions({sqrt2}, {{x2, x1}}); // only sqrt2 is listed; map pairs x2 with x1
+    EXPECT(std::any_of(
+        m1.begin(), m1.end(), [&](auto&& ins) { return migraphx::contains(ins.inputs(), x1); }));
+
+    EXPECT(std::count_if(m1.begin(), m1.end(), [](auto&& ins) { return ins.name() == "sqrt"; }) ==
+           2); // m1's own sqrt plus the one taken from m2
+    EXPECT(std::count_if(m1.begin(), m1.end(), [](auto&& ins) { return ins.name() == "@param"; }) ==
+           1); // no second parameter instruction may appear in m1
+    EXPECT(contains(m1.get_parameter_shapes(), "x1"));
+    EXPECT(not contains(m1.get_parameter_shapes(), "x2"));
+}
+
 struct check_for_pass_op
 {
    bool* found = nullptr;

--- a/test/reduce_dims.cpp
+++ b/test/reduce_dims.cpp
@@ -23,6 +23,7 @@
 */
 #include <migraphx/reduce_dims.hpp>
 #include <migraphx/permutation.hpp>
+#include <migraphx/ranges.hpp>
 #include "test.hpp"

 migraphx::shape make_shape(std::vector<std::size_t> lens)
@@ -35,6 +36,21 @@ migraphx::shape make_shape(std::vector<std::size_t> lens, std::vector<std::size_
    return {migraphx::shape::float_type, std::move(lens), std::move(strides)};
 }

+// True when both shapes hold the same number of elements and shape::index() gives the
+// same result for every linear element position in the two shapes.
+bool verify_shape(const migraphx::shape& s1, const migraphx::shape& s2)
+{
+    const auto count = s1.elements();
+    if(count != s2.elements())
+        return false;
+    for(std::size_t i = 0; i < count; ++i)
+    {
+        if(not(s1.index(i) == s2.index(i)))
+            return false;
+    }
+    return true;
+}
+
+// Range version of verify_shape(): migraphx::equal() compares the two ranges using
+// verify_shape() as the element predicate.
+template <class Range1, class Range2>
+bool verify_shapes(const Range1& r1, const Range2& r2)
+{
+    const auto same_indexing = [](const auto& lhs, const auto& rhs) {
+        return verify_shape(lhs, rhs);
+    };
+    return migraphx::equal(r1, r2, same_indexing);
+}
+
 TEST_CASE(same_standard)
 {
    auto is                              = make_shape({64, 3, 7, 7});
@@ -42,7 +58,7 @@ TEST_CASE(same_standard)
    std::vector<migraphx::shape> ishapes = {is, is, is};
    std::vector<migraphx::shape> eshapes = {os, os, os};
    auto rshapes                         = migraphx::reduce_dims(ishapes);
-
+    EXPECT(verify_shapes(ishapes, rshapes));
    EXPECT(eshapes == rshapes);
 }

@@ -53,7 +69,7 @@ TEST_CASE(same_broadcast1)
    std::vector<migraphx::shape> ishapes = {is, make_shape({64, 3, 7, 7}, {0, 1, 0, 0}), is};
    std::vector<migraphx::shape> eshapes = {os, make_shape({64, 3, 7 * 7}, {0, 1, 0}), os};
    auto rshapes                         = migraphx::reduce_dims(ishapes);
-
+    EXPECT(verify_shapes(ishapes, rshapes));
    EXPECT(eshapes == rshapes);
 }

@@ -64,7 +80,7 @@ TEST_CASE(same_broadcast2)
    std::vector<migraphx::shape> ishapes = {is, make_shape({64, 3, 8, 7, 7}, {0, 8, 1, 0, 0}), is};
    std::vector<migraphx::shape> eshapes = {os, make_shape({64, 8 * 3, 7 * 7}, {0, 1, 0}), os};
    auto rshapes                         = migraphx::reduce_dims(ishapes);
-
+    EXPECT(verify_shapes(ishapes, rshapes));
    EXPECT(eshapes == rshapes);
 }

@@ -75,7 +91,7 @@ TEST_CASE(same_transposed)
    std::vector<migraphx::shape> ishapes = {is, migraphx::reorder_shape(is, {0, 1, 3, 2}), is};
    std::vector<migraphx::shape> eshapes = {os, migraphx::reorder_shape(os, {0, 2, 1}), os};
    auto rshapes                         = migraphx::reduce_dims(ishapes);
-
+    EXPECT(verify_shapes(ishapes, rshapes));
    EXPECT(eshapes == rshapes);
 }

@@ -86,7 +102,7 @@ TEST_CASE(different_masked1)
    std::vector<migraphx::shape> ishapes = {is, make_shape({1, 3, 1, 1}), is};
    std::vector<migraphx::shape> eshapes = {os, make_shape({1, 3, 1}), os};
    auto rshapes                         = migraphx::reduce_dims(ishapes);
-
+    EXPECT(verify_shapes(ishapes, rshapes));
    EXPECT(eshapes == rshapes);
 }

@@ -98,7 +114,7 @@ TEST_CASE(different_masked2)
        is, make_shape({1, 3, 1, 1}), make_shape({64, 1, 7, 7})};
    std::vector<migraphx::shape> eshapes = {os, make_shape({1, 3, 1}), make_shape({64, 1, 7 * 7})};
    auto rshapes                         = migraphx::reduce_dims(ishapes);
-
+    EXPECT(verify_shapes(ishapes, rshapes));
    EXPECT(eshapes == rshapes);
 }

@@ -128,7 +144,7 @@ TEST_CASE(transposed1)
    std::vector<migraphx::shape> eshapes = {
        make_shape({8, 28, 4, 56 * 56}), make_shape({8, 28, 4, 56 * 56}, {351232, 3136, 87808, 1})};
    auto rshapes = migraphx::reduce_dims(ishapes);
-
+    EXPECT(verify_shapes(ishapes, rshapes));
    EXPECT(eshapes == rshapes);
 }

@@ -137,6 +153,7 @@ TEST_CASE(non_packed_empty1)
    std::vector<migraphx::shape> ishapes = {make_shape({1, 12}, {589824, 64})};
    std::vector<migraphx::shape> eshapes = {make_shape({12}, {64})};
    auto rshapes                         = migraphx::reduce_dims(ishapes);
+    EXPECT(verify_shapes(ishapes, rshapes));
    EXPECT(eshapes == rshapes);
 }

@@ -145,6 +162,7 @@ TEST_CASE(non_packed_empty2)
    std::vector<migraphx::shape> ishapes = {make_shape({12, 1}, {64, 589824})};
    std::vector<migraphx::shape> eshapes = {make_shape({12}, {64})};
    auto rshapes                         = migraphx::reduce_dims(ishapes);
+    EXPECT(verify_shapes(ishapes, rshapes));
    EXPECT(eshapes == rshapes);
 }

@@ -155,6 +173,16 @@ TEST_CASE(single_dim)
    EXPECT(ishapes == rshapes);
 }

+TEST_CASE(step_broadcast_transpose) // two {1, 2, 2, 1} inputs; the first has zero strides
+{
+    std::vector<migraphx::shape> ishapes = {make_shape({1, 2, 2, 1}, {0, 0, 3, 6}),
+                                            make_shape({1, 2, 2, 1}, {4, 2, 1, 1})};
+    std::vector<migraphx::shape> eshapes = {make_shape({2, 2}, {0, 3}), make_shape({2, 2}, {2, 1})};
+    auto rshapes                         = migraphx::reduce_dims(ishapes);
+    EXPECT(verify_shapes(ishapes, rshapes)); // reduced shapes must index elements like the inputs
+    EXPECT(eshapes == rshapes); // expected result has lens {2, 2} for both shapes
+}
+
 TEST_CASE(empty)
 {
    auto rshapes = migraphx::reduce_dims({});

--- a/test/ref_ops_test.cpp
+++ b/test/ref_ops_test.cpp
@@ -3187,6 +3187,80 @@ TEST_CASE(nms_test)
    EXPECT(migraphx::verify_range(result, gold));
 }

+// nonmaxsuppression (center_point_box = 1) on the ref target, with the boxes supplied as
+// a {1, 4, 6} literal that is transposed with permutation {0, 2, 1} before use. The int64
+// output is compared with the expected values in `gold`.
+TEST_CASE(nms_transpose1_test)
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+
+    const migraphx::shape boxes_shape{migraphx::shape::float_type, {1, 4, 6}};
+    const std::vector<float> boxes_data = {
+        0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.6, 0.4, 10.5, 10.6, 100.5,
+        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,  1.0,  1.0,
+    };
+    const migraphx::shape scores_shape{migraphx::shape::float_type, {1, 1, 6}};
+    const std::vector<float> scores_data = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3};
+
+    auto boxes        = mm->add_literal(migraphx::literal(boxes_shape, boxes_data));
+    auto scores       = mm->add_literal(migraphx::literal(scores_shape, scores_data));
+    auto max_out      = mm->add_literal(int64_t{4});
+    auto iou_thresh   = mm->add_literal(0.5f);
+    auto score_thresh = mm->add_literal(0.0f);
+
+    const auto transpose_op = migraphx::make_op("transpose", {{"permutation", {0, 2, 1}}});
+    const auto nms_op       = migraphx::make_op("nonmaxsuppression", {{"center_point_box", 1}});
+    auto boxes_t            = mm->add_instruction(transpose_op, boxes);
+    auto r = mm->add_instruction(nms_op, boxes_t, scores, max_out, iou_thresh, score_thresh);
+    mm->add_return({r});
+
+    p.compile(migraphx::ref::target{});
+    auto output = p.eval({}).back();
+    std::vector<int64_t> result;
+    output.visit([&](auto out) { result.assign(out.begin(), out.end()); });
+    std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    EXPECT(migraphx::verify_range(result, gold));
+}
+
+// nonmaxsuppression (center_point_box = 1) on the ref target, with the boxes supplied as
+// a {4, 1, 6} literal that is transposed with permutation {1, 2, 0} before use. The int64
+// output is compared with the expected values in `gold`.
+TEST_CASE(nms_transpose2_test)
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+
+    const migraphx::shape boxes_shape{migraphx::shape::float_type, {4, 1, 6}};
+    const std::vector<float> boxes_data = {
+        0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.6, 0.4, 10.5, 10.6, 100.5,
+        1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,  1.0,  1.0,
+    };
+    const migraphx::shape scores_shape{migraphx::shape::float_type, {1, 1, 6}};
+    const std::vector<float> scores_data = {0.9, 0.75, 0.6, 0.95, 0.5, 0.3};
+
+    auto boxes        = mm->add_literal(migraphx::literal(boxes_shape, boxes_data));
+    auto scores       = mm->add_literal(migraphx::literal(scores_shape, scores_data));
+    auto max_out      = mm->add_literal(int64_t{4});
+    auto iou_thresh   = mm->add_literal(0.5f);
+    auto score_thresh = mm->add_literal(0.0f);
+
+    const auto transpose_op = migraphx::make_op("transpose", {{"permutation", {1, 2, 0}}});
+    const auto nms_op       = migraphx::make_op("nonmaxsuppression", {{"center_point_box", 1}});
+    auto boxes_t            = mm->add_instruction(transpose_op, boxes);
+    auto r = mm->add_instruction(nms_op, boxes_t, scores, max_out, iou_thresh, score_thresh);
+    mm->add_return({r});
+
+    p.compile(migraphx::ref::target{});
+    auto output = p.eval({}).back();
+    std::vector<int64_t> result;
+    output.visit([&](auto out) { result.assign(out.begin(), out.end()); });
+    std::vector<int64_t> gold = {0, 0, 3, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+    EXPECT(migraphx::verify_range(result, gold));
+}
+
 TEST_CASE(nonzero_test)
 {
    migraphx::program p;

--- a/test/shape_test.cpp
+++ b/test/shape_test.cpp
@@ -200,6 +200,15 @@ TEST_CASE(test_shape_broadcasted5)
    EXPECT(s.broadcasted());
 }

+TEST_CASE(test_shape_step_broadcasted) // float shape with lens {2, 2} and strides {0, 3}
+{
+    migraphx::shape s{migraphx::shape::float_type, {2, 2}, {0, 3}};
+    EXPECT(not s.standard());
+    EXPECT(not s.packed());
+    EXPECT(not s.transposed());
+    EXPECT(s.broadcasted()); // of the four predicates checked, only this one is expected to hold
+}
+
 TEST_CASE(test_shape_default_copy)
 {
    migraphx::shape s1{};